linux-bcache.vger.kernel.org archive mirror
* [PATCH] bcache: allow allocator to invalidate bucket in gc
@ 2020-09-10 11:21 Dongsheng Yang
  2020-09-10 11:28 ` [PATCH v2] " Dongsheng Yang
  0 siblings, 1 reply; 32+ messages in thread
From: Dongsheng Yang @ 2020-09-10 11:21 UTC (permalink / raw)
  To: colyli; +Cc: linux-bcache, Dongsheng Yang

Currently, if gc is running when the allocator finds free_inc
empty, the allocator has to wait for gc to finish. Until then,
IO is blocked.

But in fact, some buckets are already reclaimable before gc starts,
and gc will never mark this kind of bucket as unreclaimable.

So we can put these buckets into free_inc while gc is running to
avoid blocking IO.

Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 drivers/md/bcache/alloc.c  | 10 ++++------
 drivers/md/bcache/bcache.h |  1 +
 drivers/md/bcache/btree.c  | 10 +++++++++-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 52035a7..265fa05 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -130,12 +130,11 @@ static inline bool can_inc_bucket_gen(struct bucket *b)
 
 bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(!ca->set->gc_mark_valid);
-
-	return (!GC_MARK(b) ||
+	return ((b->reclaimable_in_gc || ca->set->gc_mark_valid) &&
+		((!GC_MARK(b) ||
 		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
-		can_inc_bucket_gen(b);
+		can_inc_bucket_gen(b)));
 }
 
 void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -353,8 +352,7 @@ static int bch_allocator_thread(void *arg)
 		 */
 
 retry_invalidate:
-		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       !ca->invalidate_needs_gc);
+		allocator_wait(ca, !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4fd03d2..870f146 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -200,6 +200,7 @@ struct bucket {
 	uint8_t		gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
+	uint16_t	reclaimable_in_gc:1;
 };
 
 /*
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3d8bd06..d45a1dd 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1702,18 +1702,21 @@ static void btree_gc_start(struct cache_set *c)
 
 	mutex_lock(&c->bucket_lock);
 
-	c->gc_mark_valid = 0;
 	c->gc_done = ZERO_KEY;
 
 	for_each_cache(ca, c, i)
 		for_each_bucket(b, ca) {
 			b->last_gc = b->gen;
+			if (bch_can_invalidate_bucket(ca, b))
+				b->reclaimable_in_gc = 1;
+
 			if (!atomic_read(&b->pin)) {
 				SET_GC_MARK(b, 0);
 				SET_GC_SECTORS_USED(b, 0);
 			}
 		}
 
+	c->gc_mark_valid = 0;
 	mutex_unlock(&c->bucket_lock);
 }
 
@@ -1729,6 +1732,11 @@ static void bch_btree_gc_finish(struct cache_set *c)
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
+	for_each_cache(ca, c, i)
+		for_each_bucket(b, ca)
+			if (b->reclaimable_in_gc)
+				b->reclaimable_in_gc = 0;
+
 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
 			    GC_MARK_METADATA);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2020-09-10 11:21 [PATCH] bcache: allow allocator to invalidate bucket in gc Dongsheng Yang
@ 2020-09-10 11:28 ` Dongsheng Yang
  2020-09-18  9:53   ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Dongsheng Yang @ 2020-09-10 11:28 UTC (permalink / raw)
  To: colyli; +Cc: linux-bcache, Dongsheng Yang

Currently, if gc is running when the allocator finds free_inc
empty, the allocator has to wait for gc to finish. Until then,
IO is blocked.

But in fact, some buckets are already reclaimable before gc starts,
and gc will never mark this kind of bucket as unreclaimable.

So we can put these buckets into free_inc while gc is running to
avoid blocking IO.

Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 drivers/md/bcache/alloc.c  | 11 +++++------
 drivers/md/bcache/bcache.h |  1 +
 drivers/md/bcache/btree.c  | 10 +++++++++-
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 52035a7..faa5a5d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -130,12 +130,11 @@ static inline bool can_inc_bucket_gen(struct bucket *b)
 
 bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(!ca->set->gc_mark_valid);
-
-	return (!GC_MARK(b) ||
+	return ((b->reclaimable_in_gc || ca->set->gc_mark_valid) &&
+		((!GC_MARK(b) ||
 		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
-		can_inc_bucket_gen(b);
+		can_inc_bucket_gen(b)));
 }
 
 void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -149,6 +148,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 	bch_inc_gen(ca, b);
 	b->prio = INITIAL_PRIO;
 	atomic_inc(&b->pin);
+	b->reclaimable_in_gc = 0;
 }
 
 static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -353,8 +353,7 @@ static int bch_allocator_thread(void *arg)
 		 */
 
 retry_invalidate:
-		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       !ca->invalidate_needs_gc);
+		allocator_wait(ca, !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4fd03d2..870f146 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -200,6 +200,7 @@ struct bucket {
 	uint8_t		gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
+	uint16_t	reclaimable_in_gc:1;
 };
 
 /*
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3d8bd06..d45a1dd 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1702,18 +1702,21 @@ static void btree_gc_start(struct cache_set *c)
 
 	mutex_lock(&c->bucket_lock);
 
-	c->gc_mark_valid = 0;
 	c->gc_done = ZERO_KEY;
 
 	for_each_cache(ca, c, i)
 		for_each_bucket(b, ca) {
 			b->last_gc = b->gen;
+			if (bch_can_invalidate_bucket(ca, b))
+				b->reclaimable_in_gc = 1;
+
 			if (!atomic_read(&b->pin)) {
 				SET_GC_MARK(b, 0);
 				SET_GC_SECTORS_USED(b, 0);
 			}
 		}
 
+	c->gc_mark_valid = 0;
 	mutex_unlock(&c->bucket_lock);
 }
 
@@ -1729,6 +1732,11 @@ static void bch_btree_gc_finish(struct cache_set *c)
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
+	for_each_cache(ca, c, i)
+		for_each_bucket(b, ca)
+			if (b->reclaimable_in_gc)
+				b->reclaimable_in_gc = 0;
+
 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
 			    GC_MARK_METADATA);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2020-09-10 11:28 ` [PATCH v2] " Dongsheng Yang
@ 2020-09-18  9:53   ` Coly Li
  2024-03-15 22:45     ` Robert Pang
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2020-09-18  9:53 UTC (permalink / raw)
  To: Dongsheng Yang; +Cc: linux-bcache

On 2020/9/10 19:28, Dongsheng Yang wrote:
> Currently, if the gc is running, when the allocator found free_inc
> is empty, allocator has to wait the gc finish. Before that, the
> IO is blocked.
> 
> But actually, there would be some buckets is reclaimable before gc,
> and gc will never mark this kind of bucket to be  unreclaimable.
> 
> So we can put these buckets into free_inc in gc running to avoid
> IO being blocked.
> 
> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>

Hi Dongsheng,

This is not a simple change :-)

Let's do more testing for this patch, and give me more time to
understand the new code path.

Thanks for the idea.

Coly Li


> ---
>  drivers/md/bcache/alloc.c  | 11 +++++------
>  drivers/md/bcache/bcache.h |  1 +
>  drivers/md/bcache/btree.c  | 10 +++++++++-
>  3 files changed, 15 insertions(+), 7 deletions(-)
> 
[snipped]


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2020-09-18  9:53   ` Coly Li
@ 2024-03-15 22:45     ` Robert Pang
  2024-03-16  2:48       ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Robert Pang @ 2024-03-15 22:45 UTC (permalink / raw)
  To: colyli; +Cc: dongsheng.yang, linux-bcache

Hi all

We found this patch via google.

We have a setup that uses bcache to cache network-attached storage on a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency continuously with the "fio" utility, we can see the max IO latency shoot up when a stall happens:

latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
  read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
    slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
    clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
     lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
    clat percentiles (nsec):
     |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
     | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
     | 99.99th=[11328]
   bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
   iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
...
 
<IO stall>

latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
  read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
    slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
    clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
     lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
    clat percentiles (nsec):
     |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
     | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
     | 99.99th=[10816]
   bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
   iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765

When we track per-second max latency in fio, we see something like this:

<time-ms>,<max-latency-ns>,,,
...
777000, 5155548, 0, 0, 0
778000, 105551, 1, 0, 0
802615, 24276019570, 0, 0, 0
802615, 82134, 1, 0, 0
804000, 9944554, 0, 0, 0
805000, 7424638, 1, 0, 0

fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
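
For reference, a tiny standalone helper like the one below (hypothetical; it
is not part of fio or of our tooling, and the 1-second threshold is just an
arbitrary cutoff) can pull the stall entries out of that per-second
max-latency log:

  /* Scan a fio latency log (format: <time-ms>,<max-latency-ns>,...) and
   * print every sample whose max latency exceeds one second.
   * Build: cc -O2 -o lat_stalls lat_stalls.c
   * Usage: ./lat_stalls lat_lat.1.log
   */
  #include <stdio.h>

  int main(int argc, char **argv)
  {
          const long long threshold_ns = 1000000000LL;    /* 1 second */
          long long t_ms, lat_ns;
          char line[256];
          FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;

          if (!f) {
                  fprintf(stderr, "usage: %s <fio-lat-log>\n", argv[0]);
                  return 1;
          }
          while (fgets(line, sizeof(line), f)) {
                  if (sscanf(line, "%lld, %lld", &t_ms, &lat_ns) == 2 &&
                      lat_ns >= threshold_ns)
                          printf("stall around t=%llds: max latency %.1fs\n",
                                 t_ms / 1000, (double)lat_ns / 1e9);
          }
          fclose(f);
          return 0;
  }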
 
We saw a similar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html), which we tested, and the stall no longer happens.

AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
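
For reference, the condition the patch relaxes is the allocator wait shown
below; these lines are copied from the diffs earlier in this thread with
comments added, not a standalone program:

  /*
   * Before the patch: gc_mark_valid is 0 for the whole GC pass, so the
   * allocator cannot invalidate any bucket until GC completes.
   */
  allocator_wait(ca, ca->set->gc_mark_valid &&
                 !ca->invalidate_needs_gc);

  /*
   * After the patch: buckets flagged reclaimable_in_gc at the start of GC
   * may still be invalidated (see bch_can_invalidate_bucket() in the
   * diff), so the allocator only waits when a new GC pass is needed.
   */
  allocator_wait(ca, !ca->invalidate_needs_gc);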

We are running Linux kernel version 5.10 and 6.1.

Thank you.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-15 22:45     ` Robert Pang
@ 2024-03-16  2:48       ` Coly Li
  2024-03-17  5:41         ` Robert Pang
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-03-16  2:48 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, linux-bcache

Hi Robert,

Thanks for your email.

> 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
> 
> Hi all
> 
> We found this patch via google.
> 
> We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,  
> 
> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
>  read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
>    slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
>    clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
>     lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
>    clat percentiles (nsec):
>     |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
>     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
>     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
>     | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
>     | 99.99th=[11328]
>   bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
>   iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
> ...
> 
> <IO stall>
> 
> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
>  read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
>    slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
>    clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
>     lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
>    clat percentiles (nsec):
>     |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
>     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
>     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
>     | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
>     | 99.99th=[10816]
>   bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
>   iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
> 
> When we track per-second max latency in fio, we see something like this:
> 
> <time-ms>,<max-latency-ns>,,,
> ...
> 777000, 5155548, 0, 0, 0
> 778000, 105551, 1, 0, 0
> 802615, 24276019570, 0, 0, 0
> 802615, 82134, 1, 0, 0
> 804000, 9944554, 0, 0, 0
> 805000, 7424638, 1, 0, 0
> 
> fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> 
> We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
> 
> AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
> 

Could you please share more performance information about this patch? And how many nodes / how long a period does the testing cover so far?

Last time I tested the patch, it looked fine. But I was not confident about how large a scale and for how long this patch had been tested. If you can provide more testing information, it will be helpful.
 

Coly Li

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-16  2:48       ` Coly Li
@ 2024-03-17  5:41         ` Robert Pang
  2024-03-17 13:59           ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Robert Pang @ 2024-03-17  5:41 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, linux-bcache

Hi Coly

Thank you for looking into this issue.

We tested this patch on 5 machines with local SSD sizes ranging from
375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
stalls or other issues. Performance was comparable before and after
the patch. Hope this info will be helpful.

Yours
Robert


On Fri, Mar 15, 2024 at 7:49 PM Coly Li <colyli@suse.de> wrote:
>
> Hi Robert,
>
> Thanks for your email.
>
> > 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
> >
> > Hi all
> >
> > We found this patch via google.
> >
> > We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,
> >
> > latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
> >  read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
> >    slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
> >    clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
> >     lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
> >    clat percentiles (nsec):
> >     |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
> >     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
> >     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
> >     | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
> >     | 99.99th=[11328]
> >   bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
> >   iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
> > ...
> >
> > <IO stall>
> >
> > latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
> >  read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
> >    slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
> >    clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
> >     lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
> >    clat percentiles (nsec):
> >     |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
> >     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
> >     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
> >     | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
> >     | 99.99th=[10816]
> >   bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
> >   iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
> >
> > When we track per-second max latency in fio, we see something like this:
> >
> > <time-ms>,<max-latency-ns>,,,
> > ...
> > 777000, 5155548, 0, 0, 0
> > 778000, 105551, 1, 0, 0
> > 802615, 24276019570, 0, 0, 0
> > 802615, 82134, 1, 0, 0
> > 804000, 9944554, 0, 0, 0
> > 805000, 7424638, 1, 0, 0
> >
> > fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> >
> > We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
> >
> > AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
> >
>
> Could you please share more performance information of this patch? And how many nodes/how long time does the test cover so far?
>
> Last time I test the patch, it looked fine. But I was not confident how large scale and how long time this patch was tested. If you may provide more testing information, it will be helpful.
>
>
> Coly Li

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-17  5:41         ` Robert Pang
@ 2024-03-17 13:59           ` Coly Li
  2024-03-18  6:16             ` Robert Pang
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-03-17 13:59 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, Bcache Linux



> 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
> 
> Hi Coly
> 

Hi Robert,

> Thank you for looking into this issue.
> 
> We tested this patch in 5 machines with local SSD size ranging from
> 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
> stall nor other issues. Performance was comparable before and after
> the patch. Hope this info will be helpful.

Thanks for the information.

Also, I was told this patch has been deployed and shipped for 1+ year in easystack products and works well.

The above information makes me feel confident about this patch. I will submit it in the next merge window if my extended testing loops pass.

Coly Li


> 
> 
> On Fri, Mar 15, 2024 at 7:49 PM Coly Li <colyli@suse.de> wrote:
>> 
>> Hi Robert,
>> 
>> Thanks for your email.
>> 
>>> 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
>>> 
>>> Hi all
>>> 
>>> We found this patch via google.
>>> 
>>> We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,
>>> 
>>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
>>> read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
>>>   slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
>>>   clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
>>>    lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
>>>   clat percentiles (nsec):
>>>    |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
>>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
>>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
>>>    | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
>>>    | 99.99th=[11328]
>>>  bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
>>>  iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
>>> ...
>>> 
>>> <IO stall>
>>> 
>>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
>>> read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
>>>   slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
>>>   clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
>>>    lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
>>>   clat percentiles (nsec):
>>>    |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
>>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
>>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
>>>    | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
>>>    | 99.99th=[10816]
>>>  bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
>>>  iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
>>> 
>>> When we track per-second max latency in fio, we see something like this:
>>> 
>>> <time-ms>,<max-latency-ns>,,,
>>> ...
>>> 777000, 5155548, 0, 0, 0
>>> 778000, 105551, 1, 0, 0
>>> 802615, 24276019570, 0, 0, 0
>>> 802615, 82134, 1, 0, 0
>>> 804000, 9944554, 0, 0, 0
>>> 805000, 7424638, 1, 0, 0
>>> 
>>> fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
>>> 
>>> We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
>>> 
>>> AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
>>> 
>> 
>> Could you please share more performance information of this patch? And how many nodes/how long time does the test cover so far?
>> 
>> Last time I test the patch, it looked fine. But I was not confident how large scale and how long time this patch was tested. If you may provide more testing information, it will be helpful.
>> 
>> 
>> Coly Li
> 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-17 13:59           ` Coly Li
@ 2024-03-18  6:16             ` Robert Pang
  2024-03-28 18:05               ` Robert Pang
  0 siblings, 1 reply; 32+ messages in thread
From: Robert Pang @ 2024-03-18  6:16 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, Bcache Linux

Hi Coly

Thank you for confirming. It looks like the 6.9 merge window just
opened last week, so we hope this patch can catch it. Please update
this thread when it gets submitted.

https://lore.kernel.org/lkml/CAHk-=wiehc0DfPtL6fC2=bFuyzkTnuiuYSQrr6JTQxQao6pq1Q@mail.gmail.com/T/

BTW, speaking of testing, would you mind pointing us to the bcache test
suite? We would like to have a look and maybe give it a try as well.

Thanks
Robert

On Sun, Mar 17, 2024 at 7:00 AM Coly Li <colyli@suse.de> wrote:
>
>
>
> > 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
> >
> > Hi Coly
> >
>
> Hi Robert,
>
> > Thank you for looking into this issue.
> >
> > We tested this patch in 5 machines with local SSD size ranging from
> > 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
> > stall nor other issues. Performance was comparable before and after
> > the patch. Hope this info will be helpful.
>
> Thanks for the information.
>
> Also I was told this patch has been deployed and shipped for 1+ year in easystack products, works well.
>
> The above information makes me feel confident for this patch. I will submit it in next merge window if some ultra testing loop passes.
>
> Coly Li
>
>
> >
> >
> > On Fri, Mar 15, 2024 at 7:49 PM Coly Li <colyli@suse.de> wrote:
> >>
> >> Hi Robert,
> >>
> >> Thanks for your email.
> >>
> >>> 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
> >>>
> >>> Hi all
> >>>
> >>> We found this patch via google.
> >>>
> >>> We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,
> >>>
> >>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
> >>> read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
> >>>   slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
> >>>   clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
> >>>    lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
> >>>   clat percentiles (nsec):
> >>>    |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
> >>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
> >>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
> >>>    | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
> >>>    | 99.99th=[11328]
> >>>  bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
> >>>  iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
> >>> ...
> >>>
> >>> <IO stall>
> >>>
> >>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
> >>> read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
> >>>   slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
> >>>   clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
> >>>    lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
> >>>   clat percentiles (nsec):
> >>>    |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
> >>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
> >>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
> >>>    | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
> >>>    | 99.99th=[10816]
> >>>  bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
> >>>  iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
> >>>
> >>> When we track per-second max latency in fio, we see something like this:
> >>>
> >>> <time-ms>,<max-latency-ns>,,,
> >>> ...
> >>> 777000, 5155548, 0, 0, 0
> >>> 778000, 105551, 1, 0, 0
> >>> 802615, 24276019570, 0, 0, 0
> >>> 802615, 82134, 1, 0, 0
> >>> 804000, 9944554, 0, 0, 0
> >>> 805000, 7424638, 1, 0, 0
> >>>
> >>> fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> >>>
> >>> We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
> >>>
> >>> AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
> >>>
> >>
> >> Could you please share more performance information of this patch? And how many nodes/how long time does the test cover so far?
> >>
> >> Last time I test the patch, it looked fine. But I was not confident how large scale and how long time this patch was tested. If you may provide more testing information, it will be helpful.
> >>
> >>
> >> Coly Li
> >
>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-18  6:16             ` Robert Pang
@ 2024-03-28 18:05               ` Robert Pang
  2024-03-29 13:00                 ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Robert Pang @ 2024-03-28 18:05 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, Bcache Linux

Hi bcache developers

Greetings. Any update on this patch? How are things going with the
testing and submission upstream?

Thanks
Robert


On Sun, Mar 17, 2024 at 11:16 PM Robert Pang <robertpang@google.com> wrote:
>
> Hi Coly
>
> Thank you for confirming. It looks like the 6.9 merge window just
> opened last week so we hope it can catch it. Please update in this
> thread when it gets submitted.
>
> https://lore.kernel.org/lkml/CAHk-=wiehc0DfPtL6fC2=bFuyzkTnuiuYSQrr6JTQxQao6pq1Q@mail.gmail.com/T/
>
> BTW, speaking of testing, mind if you point us to the bcache test
> suite? We would like to have a look and maybe give it a try also.
>
> Thanks
> Robert
>
> On Sun, Mar 17, 2024 at 7:00 AM Coly Li <colyli@suse.de> wrote:
> >
> >
> >
> > > 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
> > >
> > > Hi Coly
> > >
> >
> > Hi Robert,
> >
> > > Thank you for looking into this issue.
> > >
> > > We tested this patch in 5 machines with local SSD size ranging from
> > > 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
> > > stall nor other issues. Performance was comparable before and after
> > > the patch. Hope this info will be helpful.
> >
> > Thanks for the information.
> >
> > Also I was told this patch has been deployed and shipped for 1+ year in easystack products, works well.
> >
> > The above information makes me feel confident for this patch. I will submit it in next merge window if some ultra testing loop passes.
> >
> > Coly Li
> >
> >
> > >
> > >
> > > On Fri, Mar 15, 2024 at 7:49 PM Coly Li <colyli@suse.de> wrote:
> > >>
> > >> Hi Robert,
> > >>
> > >> Thanks for your email.
> > >>
> > >>> 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
> > >>>
> > >>> Hi all
> > >>>
> > >>> We found this patch via google.
> > >>>
> > >>> We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,
> > >>>
> > >>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
> > >>> read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
> > >>>   slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
> > >>>   clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
> > >>>    lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
> > >>>   clat percentiles (nsec):
> > >>>    |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
> > >>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
> > >>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
> > >>>    | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
> > >>>    | 99.99th=[11328]
> > >>>  bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
> > >>>  iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
> > >>> ...
> > >>>
> > >>> <IO stall>
> > >>>
> > >>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
> > >>> read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
> > >>>   slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
> > >>>   clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
> > >>>    lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
> > >>>   clat percentiles (nsec):
> > >>>    |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
> > >>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
> > >>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
> > >>>    | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
> > >>>    | 99.99th=[10816]
> > >>>  bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
> > >>>  iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
> > >>>
> > >>> When we track per-second max latency in fio, we see something like this:
> > >>>
> > >>> <time-ms>,<max-latency-ns>,,,
> > >>> ...
> > >>> 777000, 5155548, 0, 0, 0
> > >>> 778000, 105551, 1, 0, 0
> > >>> 802615, 24276019570, 0, 0, 0
> > >>> 802615, 82134, 1, 0, 0
> > >>> 804000, 9944554, 0, 0, 0
> > >>> 805000, 7424638, 1, 0, 0
> > >>>
> > >>> fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> > >>>
> > >>> We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
> > >>>
> > >>> AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
> > >>>
> > >>
> > >> Could you please share more performance information of this patch? And how many nodes/how long time does the test cover so far?
> > >>
> > >> Last time I test the patch, it looked fine. But I was not confident how large scale and how long time this patch was tested. If you may provide more testing information, it will be helpful.
> > >>
> > >>
> > >> Coly Li
> > >
> >

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-28 18:05               ` Robert Pang
@ 2024-03-29 13:00                 ` Coly Li
  2024-04-11  6:44                   ` Robert Pang
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-03-29 13:00 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, Bcache Linux



> 2024年3月29日 02:05,Robert Pang <robertpang@google.com> 写道:
> 
> Hi bcache developers
> 
> Greetings. Any update on this patch? How are things going with the
> testing and submission upstream?

Hi Robert,

As I said, it will be in the next merge window, not this one. If any help is necessary, I will ask :-)

Thanks.

Coly Li


> 
> 
> On Sun, Mar 17, 2024 at 11:16 PM Robert Pang <robertpang@google.com> wrote:
>> 
>> Hi Coly
>> 
>> Thank you for confirming. It looks like the 6.9 merge window just
>> opened last week so we hope it can catch it. Please update in this
>> thread when it gets submitted.
>> 
>> https://lore.kernel.org/lkml/CAHk-=wiehc0DfPtL6fC2=bFuyzkTnuiuYSQrr6JTQxQao6pq1Q@mail.gmail.com/T/
>> 
>> BTW, speaking of testing, mind if you point us to the bcache test
>> suite? We would like to have a look and maybe give it a try also.
>> 
>> Thanks
>> Robert
>> 
>> On Sun, Mar 17, 2024 at 7:00 AM Coly Li <colyli@suse.de> wrote:
>>> 
>>> 
>>> 
>>>> 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
>>>> 
>>>> Hi Coly
>>>> 
>>> 
>>> Hi Robert,
>>> 
>>>> Thank you for looking into this issue.
>>>> 
>>>> We tested this patch in 5 machines with local SSD size ranging from
>>>> 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
>>>> stall nor other issues. Performance was comparable before and after
>>>> the patch. Hope this info will be helpful.
>>> 
>>> Thanks for the information.
>>> 
>>> Also I was told this patch has been deployed and shipped for 1+ year in easystack products, works well.
>>> 
>>> The above information makes me feel confident for this patch. I will submit it in next merge window if some ultra testing loop passes.
>>> 
>>> Coly Li
>>> 
> 

[snipped]


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-29 13:00                 ` Coly Li
@ 2024-04-11  6:44                   ` Robert Pang
  2024-05-03 18:23                     ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Robert Pang @ 2024-04-11  6:44 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, Bcache Linux

HI Coly

Thank you for planning to submit it in the next merge window. This
patch is critical because the long IO stalls, measured in tens of
seconds every hour, are a serious issue that makes bcache unusable
when they happen. So we look forward to this patch.

Speaking of this GC issue, we gathered the bcache btree GC stats after
our fio benchmark on a 375GB SSD cache device with 256kB bucket size:

$ grep . /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_*
/sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_duration_ms:45293
/sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_frequency_sec:286
/sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_last_sec:212
/sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_max_duration_ms:61986
$ more /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_nodes
5876

However, fio directly on the SSD device itself shows pretty good performance:

Read IOPS 14,100 (110MiB/s)
Write IOPS 42,200 (330MiB/s)
Latency: 106.64 microseconds

Can you shed some light on why GC takes so long (avg 45 seconds) given
the SSD speed? And is there any way or setting to reduce the GC time
or lower the GC frequency?

One interesting thing we observed is that when the SSD is encrypted via
dm-crypt, the GC time is shortened by ~80% to under 10 seconds. Is it
possible that GC writes the blocks one by one synchronously, and
dm-crypt's internal queuing and buffering mitigates the GC IO latency?

Thanks
Robert


On Fri, Mar 29, 2024 at 6:00 AM Coly Li <colyli@suse.de> wrote:
>
>
>
> > 2024年3月29日 02:05,Robert Pang <robertpang@google.com> 写道:
> >
> > Hi bcache developers
> >
> > Greetings. Any update on this patch? How are things going with the
> > testing and submission upstream?
>
> Hi Peng,
>
> As I said, it will be in next merge window, not this one. If there is help necessary, I will ask :-)
>
> Thanks.
>
> Coly Li
>
>
> >
> >
> > On Sun, Mar 17, 2024 at 11:16 PM Robert Pang <robertpang@google.com> wrote:
> >>
> >> Hi Coly
> >>
> >> Thank you for confirming. It looks like the 6.9 merge window just
> >> opened last week so we hope it can catch it. Please update in this
> >> thread when it gets submitted.
> >>
> >> https://lore.kernel.org/lkml/CAHk-=wiehc0DfPtL6fC2=bFuyzkTnuiuYSQrr6JTQxQao6pq1Q@mail.gmail.com/T/
> >>
> >> BTW, speaking of testing, mind if you point us to the bcache test
> >> suite? We would like to have a look and maybe give it a try also.
> >>
> >> Thanks
> >> Robert
> >>
> >> On Sun, Mar 17, 2024 at 7:00 AM Coly Li <colyli@suse.de> wrote:
> >>>
> >>>
> >>>
> >>>> 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
> >>>>
> >>>> Hi Coly
> >>>>
> >>>
> >>> Hi Robert,
> >>>
> >>>> Thank you for looking into this issue.
> >>>>
> >>>> We tested this patch in 5 machines with local SSD size ranging from
> >>>> 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
> >>>> stall nor other issues. Performance was comparable before and after
> >>>> the patch. Hope this info will be helpful.
> >>>
> >>> Thanks for the information.
> >>>
> >>> Also I was told this patch has been deployed and shipped for 1+ year in easystack products, works well.
> >>>
> >>> The above information makes me feel confident for this patch. I will submit it in next merge window if some ultra testing loop passes.
> >>>
> >>> Coly Li
> >>>
> >
>
> [snipped]
>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-04-11  6:44                   ` Robert Pang
@ 2024-05-03 18:23                     ` Coly Li
  2024-05-03 18:28                       ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-05-03 18:23 UTC (permalink / raw)
  To: Robert Pang, Dongsheng Yang; +Cc: Bcache Linux



> 2024年4月11日 14:44,Robert Pang <robertpang@google.com> 写道:
> 
> HI Coly
> 
> Thank you for submitting it in the next merge window. This patch is
> very critical because the long IO stall measured in tens of seconds
> every hour is a serious issue making bcache unusable when it happens.
> So we look forward to this patch.
> 
> Speaking of this GC issue, we gathered the bcache btree GC stats after
> our fio benchmark on a 375GB SSD cache device with 256kB bucket size:
> 
> $ grep . /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_*
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_duration_ms:45293
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_frequency_sec:286
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_last_sec:212
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_max_duration_ms:61986
> $ more /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_nodes
> 5876
> 
> However, fio directly on the SSD device itself shows pretty good performance:
> 
> Read IOPS 14,100 (110MiB/s)
> Write IOPS 42,200 (330MiB/s)
> Latency: 106.64 microseconds
> 
> Can you shed some light on why CG takes so long (avg 45 seconds) given
> the SSD speed? And is there any way or setting to reduce the CG time
> or lower the GC frequency?
> 
> One interesting thing we observed is when the SSD is encrypted via
> dm-crypt, the GC time is shortened ~80% to be under 10 seconds. Is it
> possible that GC writes the blocks one-by-one synchronously, and
> dm-crypt's internal queuing and buffering mitigates the GC IO latency?

Hi Robert,

Can I know in which kernel version you tested the patch?

I did a patch rebase and applied it on Linux v6.9. With a 4TB SSD as the cache device, I didn't observe an obvious performance advantage from this patch.
And occasionally I saw a bit more GC time. It might be from my rebase modification in bch_btree_gc_finish():
@@ -1769,6 +1771,11 @@ static void bch_btree_gc_finish(struct cache_set *c)
        c->gc_mark_valid = 1;
        c->need_gc      = 0;

+       ca = c->cache;
+       for_each_bucket(b, ca)
+               if (b->reclaimable_in_gc)
+                       b->reclaimable_in_gc = 0;
+
        for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
                SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
                            GC_MARK_METADATA);

for_each_bucket() runs twice in bch_btree_gc_finish(). I guess it may not be exactly what causes the GC time fluctuation, but iterating over all buckets twice in this patch looks a bit uncomfortable to me.
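
Just to illustrate (an untested sketch; the placeholder comment stands for
whatever that existing loop already does), the reset could probably be folded
into the bucket loop bch_btree_gc_finish() already has, instead of adding a
second pass:

        for_each_bucket(b, ca) {
                b->reclaimable_in_gc = 0;  /* clear the gc-time flag here */
                /* ... the existing per-bucket work stays unchanged ... */
        }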


Hi Dongsheng,

Maybe my rebase is incorrect. Could you please post a new version which applies to the latest upstream bcache code?

Thanks in advance.


Coly Li


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-03 18:23                     ` Coly Li
@ 2024-05-03 18:28                       ` Coly Li
  2024-05-04  2:04                         ` Robert Pang
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-05-03 18:28 UTC (permalink / raw)
  To: Robert Pang, Dongsheng Yang; +Cc: Bcache Linux



> 2024年5月4日 02:23,Coly Li <colyli@suse.de> 写道:
> 
> 
> 
>> 2024年4月11日 14:44,Robert Pang <robertpang@google.com> 写道:
>> 
>> HI Coly
>> 
>> Thank you for submitting it in the next merge window. This patch is
>> very critical because the long IO stall measured in tens of seconds
>> every hour is a serious issue making bcache unusable when it happens.
>> So we look forward to this patch.
>> 
>> Speaking of this GC issue, we gathered the bcache btree GC stats after
>> our fio benchmark on a 375GB SSD cache device with 256kB bucket size:
>> 
>> $ grep . /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_*
>> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_duration_ms:45293
>> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_frequency_sec:286
>> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_last_sec:212
>> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_max_duration_ms:61986
>> $ more /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_nodes
>> 5876
>> 
>> However, fio directly on the SSD device itself shows pretty good performance:
>> 
>> Read IOPS 14,100 (110MiB/s)
>> Write IOPS 42,200 (330MiB/s)
>> Latency: 106.64 microseconds
>> 
>> Can you shed some light on why CG takes so long (avg 45 seconds) given
>> the SSD speed? And is there any way or setting to reduce the CG time
>> or lower the GC frequency?
>> 
>> One interesting thing we observed is when the SSD is encrypted via
>> dm-crypt, the GC time is shortened ~80% to be under 10 seconds. Is it
>> possible that GC writes the blocks one-by-one synchronously, and
>> dm-crypt's internal queuing and buffering mitigates the GC IO latency?
> 
> Hi Robert,
> 
> Can I know In which kernel version did you test the patch?
> 

Sorry, I missed providing a bit more information here.

> I do a patch rebase and apply it on Linux v6.9. With a 4TB SSD as cache device, I didn’t observe obvious performance advantage of this patch.

When I didn't see an obvious performance advantage, the testing was on a 512G Intel Optane memory device (with the pmem driver) as the cache device.


> And occasionally I a bit more GC time. It might be from my rebase modification in bch_btree_gc_finish(),

And for the above situation, it was on a 4TB NVMe SSD.


I guess it may have been from my improper patch rebase. Once Dongsheng posts a new version for the latest upstream kernel bcache code, I will test the patch again.


Thanks.

Coly Li

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-03 18:28                       ` Coly Li
@ 2024-05-04  2:04                         ` Robert Pang
  2024-05-04  3:08                           ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Robert Pang @ 2024-05-04  2:04 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, Bcache Linux


[-- Attachment #1.1: Type: text/plain, Size: 5690 bytes --]

Hi Coly,

> Can I know In which kernel version did you test the patch?

I tested in both Linux kernels 5.10 and 6.1.

> I didn’t observe obvious performance advantage of this patch.

This patch doesn't improve bcache performance. Instead, it eliminates the
IO stall in bcache that happens when GC runs, due to bch_allocator_thread()
getting blocked waiting for GC to finish.

        /*
         * We've run out of free buckets, we need to find some buckets
         * we can invalidate. First, invalidate them in memory and add
         * them to the free_inc list:
         */
retry_invalidate:
        allocator_wait(ca, ca->set->gc_mark_valid &&   <--------
                       !ca->invalidate_needs_gc);
        invalidate_buckets(ca);

From what you showed, it looks like your rebase is good. As you
already noticed, the original patch was based on a 4.x kernel, so the
bucket traversal in btree.c needs to be adapted for the 5.x and 6.x
kernels. I attached the patch rebased to 6.9 HEAD for your reference.

But to observe the IO stall before the patch, please test with a read-write
workload so GC will happen periodically enough (a read-only or read-mostly
workload doesn't show the problem). For me, I used the "fio" utility to
generate a random read-write workload as follows.

# Pre-generate a 900GB test file
$ truncate -s 900G test

# Run random read-write workload for 1 hour
$ fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio
--name=latency_test --filename=test --bs=8k --iodepth=1 --size=900G
 --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat
--log_avg_msec=1000 --log_max_value=1

We include the flags "--write_lat_log=lat --log_avg_msec=1000
--log_max_value=1" so fio will dump the second-by-second max latency into a
log file at the end of the test, so we can see when a stall happens and for how long:

E.g.

$ more lat_lat.1.log
(format: <time-ms>,<max-latency-ns>,,,)
...
777000, 5155548, 0, 0, 0
778000, 105551, 1, 0, 0
802615, 24276019570, 0, 0, 0 <---- stalls for 24s with no IO possible
802615, 82134, 1, 0, 0
804000, 9944554, 0, 0, 0
805000, 7424638, 1, 0, 0

I used a 375 GB local SSD (cache device) and a 1 TB network-attached
storage (backing device). In the 1-hr run, GC starts happening about 10
minutes into the run and then happens at ~ 5 minute intervals. The stall
duration ranges from a few seconds at the beginning to close to 40 seconds
towards the end. Only about 1/2 to 2/3 of the cache is used by the end.

Note that this patch doesn't shorten the GC either. Instead, it just prevents
GC from blocking the allocator thread by first sweeping the buckets and
marking reclaimable ones quickly at the beginning of GC, so the allocator
can proceed while GC continues its actual job.
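
To illustrate the idea outside the kernel, below is a toy userspace model
(this is not bcache code; every name and number in it is made up) of "mark
the already-reclaimable buckets quickly, then let the slow phase run": the
allocator thread keeps consuming pre-marked buckets while the gc thread is
still busy, and only sleeps once none are left. It builds with
"cc -O2 -pthread gc_toy.c".

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <unistd.h>

  #define NBUCKETS 8

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
  static bool reclaimable_in_gc[NBUCKETS];
  static bool gc_running;

  static void *gc_thread(void *arg)
  {
          (void)arg;
          pthread_mutex_lock(&lock);
          gc_running = true;
          /* Phase 1: quick sweep, flag buckets that were already
           * reclaimable (pretend the even-numbered ones were). */
          for (int i = 0; i < NBUCKETS; i++)
                  reclaimable_in_gc[i] = (i % 2 == 0);
          pthread_cond_broadcast(&cond);
          pthread_mutex_unlock(&lock);

          sleep(2);               /* Phase 2: the slow btree walk. */

          pthread_mutex_lock(&lock);
          for (int i = 0; i < NBUCKETS; i++)
                  reclaimable_in_gc[i] = false;
          gc_running = false;
          pthread_cond_broadcast(&cond);
          pthread_mutex_unlock(&lock);
          return NULL;
  }

  static void *allocator_thread(void *arg)
  {
          (void)arg;
          pthread_mutex_lock(&lock);
          while (gc_running) {
                  int victim = -1;

                  for (int i = 0; i < NBUCKETS; i++)
                          if (reclaimable_in_gc[i]) {
                                  victim = i;
                                  break;
                          }
                  if (victim < 0) {
                          /* Nothing pre-marked left: only now wait for gc. */
                          pthread_cond_wait(&cond, &lock);
                          continue;
                  }
                  reclaimable_in_gc[victim] = false;
                  printf("invalidated bucket %d while gc still runs\n", victim);
          }
          pthread_mutex_unlock(&lock);
          return NULL;
  }

  int main(void)
  {
          pthread_t gc, alloc;

          pthread_create(&gc, NULL, gc_thread, NULL);
          usleep(100 * 1000);     /* let gc do its phase-1 sweep first */
          pthread_create(&alloc, NULL, allocator_thread, NULL);
          pthread_join(alloc, NULL);
          pthread_join(gc, NULL);
          return 0;
  }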

We are eagerly looking forward to this patch being merged in the coming
merge window, which is expected to open in a week or two.

Thanks
Robert


On Fri, May 3, 2024 at 11:28 AM Coly Li <colyli@suse.de> wrote:

>
>
> > 2024年5月4日 02:23,Coly Li <colyli@suse.de> 写道:
> >
> >
> >
> >> 2024年4月11日 14:44,Robert Pang <robertpang@google.com> 写道:
> >>
> >> HI Coly
> >>
> >> Thank you for submitting it in the next merge window. This patch is
> >> very critical because the long IO stall measured in tens of seconds
> >> every hour is a serious issue making bcache unusable when it happens.
> >> So we look forward to this patch.
> >>
> >> Speaking of this GC issue, we gathered the bcache btree GC stats after
> >> our fio benchmark on a 375GB SSD cache device with 256kB bucket size:
> >>
> >> $ grep .
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_*
> >>
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_duration_ms:45293
> >>
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_frequency_sec:286
> >>
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_last_sec:212
> >>
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_max_duration_ms:61986
> >> $ more
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_nodes
> >> 5876
> >>
> >> However, fio directly on the SSD device itself shows pretty good
> performance:
> >>
> >> Read IOPS 14,100 (110MiB/s)
> >> Write IOPS 42,200 (330MiB/s)
> >> Latency: 106.64 microseconds
> >>
> >> Can you shed some light on why CG takes so long (avg 45 seconds) given
> >> the SSD speed? And is there any way or setting to reduce the CG time
> >> or lower the GC frequency?
> >>
> >> One interesting thing we observed is when the SSD is encrypted via
> >> dm-crypt, the GC time is shortened ~80% to be under 10 seconds. Is it
> >> possible that GC writes the blocks one-by-one synchronously, and
> >> dm-crypt's internal queuing and buffering mitigates the GC IO latency?
> >
> > Hi Robert,
> >
> > Can I know In which kernel version did you test the patch?
> >
>
> Sorry I missed a bit more information here.
>
> > I do a patch rebase and apply it on Linux v6.9. With a 4TB SSD as cache
> device, I didn’t observe obvious performance advantage of this patch.
>
> When I didn’t see obvious performance advantage, the testing was on a 512G
> Intel Optane memory (with pmem driver) as cache device.
>
>
> > And occasionally I a bit more GC time. It might be from my rebase
> modification in bch_btree_gc_finish(),
>
> And for the above situation, it was on a 4TB NVMe SSD.
>
>
> I guess maybe it was from my improper patch rebase. Once Dongsheng posts a
> new version for the latest upstream kernel bcache code, I will test the
> patch again.
>
>
> Thanks.
>
> Coly Li

[-- Attachment #1.2: Type: text/html, Size: 6648 bytes --]

[-- Attachment #2: 0001-bcache-allow-allocator-to-invalidate-bucket-in-gc.patch --]
[-- Type: text/x-patch, Size: 2963 bytes --]

---
 drivers/md/bcache/alloc.c  | 11 +++++------
 drivers/md/bcache/bcache.h |  1 +
 drivers/md/bcache/btree.c  | 11 +++++++++--
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ce13c272c387..982b36d12907 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -129,12 +129,11 @@ static inline bool can_inc_bucket_gen(struct bucket *b)
 
 bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(!ca->set->gc_mark_valid);
-
-	return (!GC_MARK(b) ||
+	return ((b->reclaimable_in_gc || ca->set->gc_mark_valid) &&
+		((!GC_MARK(b) ||
 		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
-		can_inc_bucket_gen(b);
+		can_inc_bucket_gen(b)));
 }
 
 void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -148,6 +147,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 	bch_inc_gen(ca, b);
 	b->prio = INITIAL_PRIO;
 	atomic_inc(&b->pin);
+	b->reclaimable_in_gc = 0;
 }
 
 static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -352,8 +352,7 @@ static int bch_allocator_thread(void *arg)
 		 */
 
 retry_invalidate:
-		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       !ca->invalidate_needs_gc);
+		allocator_wait(ca, !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4e6afa89921f..1d33e40d26ea 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -200,6 +200,7 @@ struct bucket {
 	uint8_t		gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
+	uint16_t	reclaimable_in_gc:1;
 };
 
 /*
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 196cdacce38f..ded55958782d 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1740,18 +1740,21 @@ static void btree_gc_start(struct cache_set *c)
 
 	mutex_lock(&c->bucket_lock);
 
-	c->gc_mark_valid = 0;
 	c->gc_done = ZERO_KEY;
 
 	ca = c->cache;
 	for_each_bucket(b, ca) {
 		b->last_gc = b->gen;
+		if (bch_can_invalidate_bucket(ca, b))
+			b->reclaimable_in_gc = 1;
+
 		if (!atomic_read(&b->pin)) {
 			SET_GC_MARK(b, 0);
 			SET_GC_SECTORS_USED(b, 0);
 		}
 	}
 
+	c->gc_mark_valid = 0;
 	mutex_unlock(&c->bucket_lock);
 }
 
@@ -1768,6 +1771,11 @@ static void bch_btree_gc_finish(struct cache_set *c)
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
+	ca = c->cache;
+	for_each_bucket(b, ca)
+	    if (b->reclaimable_in_gc)
+		b->reclaimable_in_gc = 0;
+
 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
 			    GC_MARK_METADATA);
@@ -1795,7 +1803,6 @@ static void bch_btree_gc_finish(struct cache_set *c)
 
 	c->avail_nbuckets = 0;
 
-	ca = c->cache;
 	ca->invalidate_needs_gc = 0;
 
 	for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
-- 

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-04  2:04                         ` Robert Pang
@ 2024-05-04  3:08                           ` Coly Li
  2024-05-08  2:34                             ` Dongsheng Yang
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-05-04  3:08 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, Bcache Linux



> On 2024-05-04 10:04, Robert Pang <robertpang@google.com> wrote:
> 
> Hi Coly,
> 
> > Can I know In which kernel version did you test the patch?
> 
> I tested in both Linux kernels 5.10 and 6.1.
> 
> > I didn’t observe obvious performance advantage of this patch.
> 
> This patch doesn't improve bcache performance. Instead, it eliminates the IO stall in bcache that happens due to bch_allocator_thread() getting blocked and waiting on GC to finish when GC happens.
> 
> /*
> * We've run out of free buckets, we need to find some buckets
> * we can invalidate. First, invalidate them in memory and add
> * them to the free_inc list:
> */
> retry_invalidate:
> allocator_wait(ca, ca->set->gc_mark_valid &&  <--------
>        !ca->invalidate_needs_gc);
> invalidate_buckets(ca);
> 
> From what you showed, it looks like your rebase is good. As you already noticed, the original patch was based on 4.x kernel so the bucket traversal in btree.c needs to be adapted for 5.x and 6.x kernels. I attached the patch rebased to 6.9 HEAD for your reference.
> 
> But to observe the IO stall before the patch, please test with a read-write workload so GC will happen periodically enough (read-only or read-mostly workload doesn't show the problem). For me, I used the "fio" utility to generate a random read-write workload as follows.
> 
> # Pre-generate a 900GB test file
> $ truncate -s 900G test
> 
> # Run random read-write workload for 1 hour
> $ fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=test --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1 
> 
> We include the flags "--write_lat_log=lat --log_avg_msec=1000 --log_max_value=1" so fio will dump the second-by-second max latency into a log file at the end of test so we can when stall happens and for how long:
> 

Copied. Thanks for the information. Let me try the above command lines on my local machine for a longer time.



> E.g.
> 
> $ more lat_lat.1.log
> (format: <time-ms>,<max-latency-ns>,,,)
> ...
> 777000, 5155548, 0, 0, 0
> 778000, 105551, 1, 0, 0
> 802615, 24276019570, 0, 0, 0 <---- stalls for 24s with no IO possible
> 802615, 82134, 1, 0, 0
> 804000, 9944554, 0, 0, 0
> 805000, 7424638, 1, 0, 0
> 
> I used a 375 GB local SSD (cache device) and a 1 TB network-attached storage (backing device). In the 1-hr run, GC starts happening about 10 minutes into the run and then happens at ~ 5 minute intervals. The stall duration ranges from a few seconds at the beginning to close to 40 seconds towards the end. Only about 1/2 to 2/3 of the cache is used by the end.
> 
> Note that this patch doesn't shorten the GC either. Instead, it just avoids GC from blocking the allocator thread by first sweeping the buckets and marking reclaimable ones quickly at the beginning of GC so the allocator can proceed while GC continues its actual job.
> 
> We are eagerly looking forward to this patch to be merged in this coming merge window that is expected to open in a week to two.

In order to avoid a no-space deadlock, normally around 10% of the space will not be allocated out. I need to look more closely into this patch.


Dongsheng Yang,

Could you please post a new version based on the current mainline kernel code?

Thanks.

Coly Li



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-04  3:08                           ` Coly Li
@ 2024-05-08  2:34                             ` Dongsheng Yang
  2024-05-12  5:43                               ` Robert Pang
  0 siblings, 1 reply; 32+ messages in thread
From: Dongsheng Yang @ 2024-05-08  2:34 UTC (permalink / raw)
  To: Coly Li, Robert Pang, mingzhe.zou; +Cc: Bcache Linux



On Saturday 2024/5/4 at 11:08 AM, Coly Li wrote:
> 
> 
>> On 2024-05-04 10:04, Robert Pang <robertpang@google.com> wrote:
>>
>> Hi Coly,
>>
>>> Can I know In which kernel version did you test the patch?
>>
>> I tested in both Linux kernels 5.10 and 6.1.
>>
>>> I didn’t observe obvious performance advantage of this patch.
>>
>> This patch doesn't improve bcache performance. Instead, it eliminates the IO stall in bcache that happens due to bch_allocator_thread() getting blocked and waiting on GC to finish when GC happens.
>>
>> /*
>> * We've run out of free buckets, we need to find some buckets
>> * we can invalidate. First, invalidate them in memory and add
>> * them to the free_inc list:
>> */
>> retry_invalidate:
>> allocator_wait(ca, ca->set->gc_mark_valid &&  <--------
>>         !ca->invalidate_needs_gc);
>> invalidate_buckets(ca);
>>
>>  From what you showed, it looks like your rebase is good. As you already noticed, the original patch was based on 4.x kernel so the bucket traversal in btree.c needs to be adapted for 5.x and 6.x kernels. I attached the patch rebased to 6.9 HEAD for your reference.
>>
>> But to observe the IO stall before the patch, please test with a read-write workload so GC will happen periodically enough (read-only or read-mostly workload doesn't show the problem). For me, I used the "fio" utility to generate a random read-write workload as follows.
>>
>> # Pre-generate a 900GB test file
>> $ truncate -s 900G test
>>
>> # Run random read-write workload for 1 hour
>> $ fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=test --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
>>
>> We include the flags "--write_lat_log=lat --log_avg_msec=1000 --log_max_value=1" so fio will dump the second-by-second max latency into a log file at the end of test so we can when stall happens and for how long:
>>
> 
> Copied. Thanks for the information. Let me try the above command lines on my local machine with longer time.
> 
> 
> 
>> E.g.
>>
>> $ more lat_lat.1.log
>> (format: <time-ms>,<max-latency-ns>,,,)
>> ...
>> 777000, 5155548, 0, 0, 0
>> 778000, 105551, 1, 0, 0
>> 802615, 24276019570, 0, 0, 0 <---- stalls for 24s with no IO possible
>> 802615, 82134, 1, 0, 0
>> 804000, 9944554, 0, 0, 0
>> 805000, 7424638, 1, 0, 0
>>
>> I used a 375 GB local SSD (cache device) and a 1 TB network-attached storage (backing device). In the 1-hr run, GC starts happening about 10 minutes into the run and then happens at ~ 5 minute intervals. The stall duration ranges from a few seconds at the beginning to close to 40 seconds towards the end. Only about 1/2 to 2/3 of the cache is used by the end.
>>
>> Note that this patch doesn't shorten the GC either. Instead, it just avoids GC from blocking the allocator thread by first sweeping the buckets and marking reclaimable ones quickly at the beginning of GC so the allocator can proceed while GC continues its actual job.
>>
>> We are eagerly looking forward to this patch to be merged in this coming merge window that is expected to open in a week to two.
> 
> In order to avoid the no-space deadlock, normally there are around 10% space will not be allocated out. I need to look more close onto this patch.
> 
> 
> Dongsheng Yang,
> 
> Could you please post a new version based on current mainline kernel code ?

Hi Coly,
	Mingzhe will send a new version based on mainline.

Thanx
> 
> Thanks.
> 
> Coly Li
> 
> 
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-08  2:34                             ` Dongsheng Yang
@ 2024-05-12  5:43                               ` Robert Pang
  2024-05-12  9:41                                 ` Kernel error with 6.8.9 Pierre Juhen (IMAP)
  2024-05-13  7:43                                 ` [PATCH v2] bcache: allow allocator to invalidate bucket in gc Coly Li
  0 siblings, 2 replies; 32+ messages in thread
From: Robert Pang @ 2024-05-12  5:43 UTC (permalink / raw)
  To: Dongsheng Yang; +Cc: Coly Li, mingzhe.zou, Bcache Linux

Hi Coly

I see that Mingzhe has submitted the rebased patch [1]. Have you had a
chance to reproduce the stall and test the patch? Are we on track to
submit this patch upstream in the coming 6.10 merge window? Do you
need any help or more info?

Thanks
Robert


[1] https://lore.kernel.org/linux-bcache/1596418224.689.1715223543586.JavaMail.hmail@wm-bj-12-entmail-virt53.gy.ntes/T/#u


On Tue, May 7, 2024 at 7:34 PM Dongsheng Yang
<dongsheng.yang@easystack.cn> wrote:
>
>
>
> On Saturday 2024/5/4 at 11:08 AM, Coly Li wrote:
> >
> >
> >> On 2024-05-04 10:04, Robert Pang <robertpang@google.com> wrote:
> >>
> >> Hi Coly,
> >>
> >>> Can I know In which kernel version did you test the patch?
> >>
> >> I tested in both Linux kernels 5.10 and 6.1.
> >>
> >>> I didn’t observe obvious performance advantage of this patch.
> >>
> >> This patch doesn't improve bcache performance. Instead, it eliminates the IO stall in bcache that happens due to bch_allocator_thread() getting blocked and waiting on GC to finish when GC happens.
> >>
> >> /*
> >> * We've run out of free buckets, we need to find some buckets
> >> * we can invalidate. First, invalidate them in memory and add
> >> * them to the free_inc list:
> >> */
> >> retry_invalidate:
> >> allocator_wait(ca, ca->set->gc_mark_valid &&  <--------
> >>         !ca->invalidate_needs_gc);
> >> invalidate_buckets(ca);
> >>
> >>  From what you showed, it looks like your rebase is good. As you already noticed, the original patch was based on 4.x kernel so the bucket traversal in btree.c needs to be adapted for 5.x and 6.x kernels. I attached the patch rebased to 6.9 HEAD for your reference.
> >>
> >> But to observe the IO stall before the patch, please test with a read-write workload so GC will happen periodically enough (read-only or read-mostly workload doesn't show the problem). For me, I used the "fio" utility to generate a random read-write workload as follows.
> >>
> >> # Pre-generate a 900GB test file
> >> $ truncate -s 900G test
> >>
> >> # Run random read-write workload for 1 hour
> >> $ fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=test --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> >>
> >> We include the flags "--write_lat_log=lat --log_avg_msec=1000 --log_max_value=1" so fio will dump the second-by-second max latency into a log file at the end of test so we can when stall happens and for how long:
> >>
> >
> > Copied. Thanks for the information. Let me try the above command lines on my local machine with longer time.
> >
> >
> >
> >> E.g.
> >>
> >> $ more lat_lat.1.log
> >> (format: <time-ms>,<max-latency-ns>,,,)
> >> ...
> >> 777000, 5155548, 0, 0, 0
> >> 778000, 105551, 1, 0, 0
> >> 802615, 24276019570, 0, 0, 0 <---- stalls for 24s with no IO possible
> >> 802615, 82134, 1, 0, 0
> >> 804000, 9944554, 0, 0, 0
> >> 805000, 7424638, 1, 0, 0
> >>
> >> I used a 375 GB local SSD (cache device) and a 1 TB network-attached storage (backing device). In the 1-hr run, GC starts happening about 10 minutes into the run and then happens at ~ 5 minute intervals. The stall duration ranges from a few seconds at the beginning to close to 40 seconds towards the end. Only about 1/2 to 2/3 of the cache is used by the end.
> >>
> >> Note that this patch doesn't shorten the GC either. Instead, it just avoids GC from blocking the allocator thread by first sweeping the buckets and marking reclaimable ones quickly at the beginning of GC so the allocator can proceed while GC continues its actual job.
> >>
> >> We are eagerly looking forward to this patch to be merged in this coming merge window that is expected to open in a week to two.
> >
> > In order to avoid the no-space deadlock, normally there are around 10% space will not be allocated out. I need to look more close onto this patch.
> >
> >
> > Dongsheng Yang,
> >
> > Could you please post a new version based on current mainline kernel code ?
>
> Hi Coly,
>         Mingzhe will send a new version based on mainline.
>
> Thanx
> >
> > Thanks.
> >
> > Coly Li
> >
> >
> >

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Kernel error with 6.8.9
  2024-05-12  5:43                               ` Robert Pang
@ 2024-05-12  9:41                                 ` Pierre Juhen (IMAP)
  2024-05-13  7:57                                   ` Coly Li
  2024-05-13  7:43                                 ` [PATCH v2] bcache: allow allocator to invalidate bucket in gc Coly Li
  1 sibling, 1 reply; 32+ messages in thread
From: Pierre Juhen (IMAP) @ 2024-05-12  9:41 UTC (permalink / raw)
  To: Bcache Linux

Hi,

I use bcache with an NVMe partition as the frontend and an md array as the backend.

I have been seeing the following error since I updated to kernel 6.8.9.

  UBSAN: array-index-out-of-bounds in drivers/md/bcache/bset.c:1098:3
[    7.138127] index 4 is out of range for type 'btree_iter_set [4]'
[    7.138129] CPU: 9 PID: 645 Comm: bcache-register Not tainted 6.8.9-200.fc39.x86_64 #1
[    7.138131] Hardware name: Gigabyte Technology Co., Ltd. B550M DS3H/B550M DS3H, BIOS F1 12/07/2022
[    7.138133] Call Trace:
[    7.138135]  <TASK>
[    7.138137]  dump_stack_lvl+0x64/0x80
[    7.138143]  __ubsan_handle_out_of_bounds+0x95/0xd0
[    7.138148]  bch_btree_iter_push+0x4ca/0x4e0 [bcache]
[    7.138160]  bch_btree_node_read_done+0xca/0x3f0 [bcache]
[    7.138171]  bch_btree_node_read+0xe4/0x1d0 [bcache]
[    7.138180]  ? __pfx_closure_sync_fn+0x10/0x10
[    7.138183]  bch_btree_node_get.part.0+0x156/0x320 [bcache]
[    7.138192]  ? __pfx_up_write+0x10/0x10
[    7.138197]  register_bcache+0x1f31/0x2230 [bcache]
[    7.138212]  kernfs_fop_write_iter+0x136/0x1d0
[    7.138217]  vfs_write+0x29e/0x470
[    7.138222]  ksys_write+0x6f/0xf0
[    7.138224]  do_syscall_64+0x83/0x170
[    7.138229]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138232]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138234]  ? xas_find+0x75/0x1d0
[    7.138237]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138239]  ? next_uptodate_folio+0xa5/0x2e0
[    7.138243]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138245]  ? filemap_map_pages+0x474/0x550
[    7.138248]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138251]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138253]  ? do_fault+0x246/0x490
[    7.138256]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138258]  ? __handle_mm_fault+0x827/0xe40
[    7.138262]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138264]  ? __count_memcg_events+0x69/0x100
[    7.138267]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138269]  ? count_memcg_events.constprop.0+0x1a/0x30
[    7.138271]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138273]  ? handle_mm_fault+0xa2/0x360
[    7.138275]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138277]  ? do_user_addr_fault+0x304/0x690
[    7.138281]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138282]  ? srso_alias_return_thunk+0x5/0xfbef5
[    7.138285]  entry_SYSCALL_64_after_hwframe+0x78/0x80
[    7.138287] RIP: 0033:0x7f2dba570ee4
[    7.138292] Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d 85 74 0d 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89
[    7.138293] RSP: 002b:00007ffe3e2f1df8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001
[    7.138295] RAX: ffffffffffffffda RBX: 00007ffe3e2f1e6c RCX: 00007f2dba570ee4
[    7.138297] RDX: 000000000000000f RSI: 00007ffe3e2f1e6c RDI: 0000000000000003
[    7.138298] RBP: 00007ffe3e2f1e30 R08: 0000000000000073 R09: 0000000000000001
[    7.138299] R10: 0000000000000000 R11: 0000000000000202 R12: 000000000000000f
[    7.138300] R13: 00007ffe3e2f1e7b R14: 00007ffe3e2f1e6c R15: 00007ffe3e2f1e40
[    7.138303]  </TASK>

The error is repeated 15 times during reboot

(I have a 12-thread processor).

Pierre


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-12  5:43                               ` Robert Pang
  2024-05-12  9:41                                 ` Kernel error with 6.8.9 Pierre Juhen (IMAP)
@ 2024-05-13  7:43                                 ` Coly Li
  2024-05-14  5:15                                   ` Robert Pang
  1 sibling, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-05-13  7:43 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, 邹明哲, Bcache Linux



> On 2024-05-12 13:43, Robert Pang <robertpang@google.com> wrote:
> 
> Hi Coly
> 
> I see that Mingzhe has submitted the rebased patch [1]. Do you have a
> chance to reproduce the stall and test the patch? Are we on track to
> submit this patch upstream in the coming 6.10 merge window? Do you
> need any help or more info?
> 

Hi Robert,

Please don’t push me. The first wave of bcache-6.10 is in linux-next now. For this patch, I need to do more pressure testing to make me comfortable that a no-space deadlock won’t be triggered.

The testing is simple: use small I/O sizes (512 bytes to 4KB) to do random writes on a writeback mode cache for a long time (24-48 hours), and see whether any warning or deadlock happens.

For me, my tests cover cache sizes of 256G/512G/1T/4T with 20-24 CPU cores. If you can help to test on more machines and configurations, that will be helpful.

I trust you and Zheming on the allocation latency measurement; now I need to confirm that offering allocation more priority than GC won’t trigger a potential no-space deadlock in practice.
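
As a rough illustration of the kind of reserve that guards against such a deadlock, here is a minimal sketch with hypothetical names and a made-up threshold; it is not bcache's actual allocator code:

/* Illustrative sketch only: not bcache's allocator code. */
#include <stdbool.h>

struct toy_cache {
	unsigned long nbuckets;     /* total buckets on the cache device */
	unsigned long free_buckets; /* buckets currently free            */
};

/* Keep roughly 10% of the buckets in reserve (made-up threshold). */
#define TOY_RESERVE_PERCENT 10

/*
 * Normal allocations must leave the reserve untouched; internal work
 * (GC, btree nodes) may dip into it, so GC can always make progress
 * even when user I/O has consumed everything else.
 */
static bool toy_can_allocate(const struct toy_cache *c, bool internal)
{
	unsigned long reserve = c->nbuckets * TOY_RESERVE_PERCENT / 100;

	if (internal)
		return c->free_buckets > 0;

	return c->free_buckets > reserve;
}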

Thanks.

Coly Li


> 
> 
> [1] https://lore.kernel.org/linux-bcache/1596418224.689.1715223543586.JavaMail.hmail@wm-bj-12-entmail-virt53.gy.ntes/T/#u
> 
> 
> On Tue, May 7, 2024 at 7:34 PM Dongsheng Yang
> <dongsheng.yang@easystack.cn> wrote:
>> 
>> 
>> 
>> On Saturday 2024/5/4 at 11:08 AM, Coly Li wrote:
>>> 
>>> 
>>>> On 2024-05-04 10:04, Robert Pang <robertpang@google.com> wrote:
>>>> 
>>>> Hi Coly,
>>>> 
>>>>> Can I know In which kernel version did you test the patch?
>>>> 
>>>> I tested in both Linux kernels 5.10 and 6.1.
>>>> 
>>>>> I didn’t observe obvious performance advantage of this patch.
>>>> 
>>>> This patch doesn't improve bcache performance. Instead, it eliminates the IO stall in bcache that happens due to bch_allocator_thread() getting blocked and waiting on GC to finish when GC happens.
>>>> 
>>>> /*
>>>> * We've run out of free buckets, we need to find some buckets
>>>> * we can invalidate. First, invalidate them in memory and add
>>>> * them to the free_inc list:
>>>> */
>>>> retry_invalidate:
>>>> allocator_wait(ca, ca->set->gc_mark_valid &&  <--------
>>>>        !ca->invalidate_needs_gc);
>>>> invalidate_buckets(ca);
>>>> 
>>>> From what you showed, it looks like your rebase is good. As you already noticed, the original patch was based on 4.x kernel so the bucket traversal in btree.c needs to be adapted for 5.x and 6.x kernels. I attached the patch rebased to 6.9 HEAD for your reference.
>>>> 
>>>> But to observe the IO stall before the patch, please test with a read-write workload so GC will happen periodically enough (read-only or read-mostly workload doesn't show the problem). For me, I used the "fio" utility to generate a random read-write workload as follows.
>>>> 
>>>> # Pre-generate a 900GB test file
>>>> $ truncate -s 900G test
>>>> 
>>>> # Run random read-write workload for 1 hour
>>>> $ fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=test --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
>>>> 
>>>> We include the flags "--write_lat_log=lat --log_avg_msec=1000 --log_max_value=1" so fio will dump the second-by-second max latency into a log file at the end of test so we can when stall happens and for how long:
>>>> 
>>> 
>>> Copied. Thanks for the information. Let me try the above command lines on my local machine with longer time.
>>> 
>>> 
>>> 
>>>> E.g.
>>>> 
>>>> $ more lat_lat.1.log
>>>> (format: <time-ms>,<max-latency-ns>,,,)
>>>> ...
>>>> 777000, 5155548, 0, 0, 0
>>>> 778000, 105551, 1, 0, 0
>>>> 802615, 24276019570, 0, 0, 0 <---- stalls for 24s with no IO possible
>>>> 802615, 82134, 1, 0, 0
>>>> 804000, 9944554, 0, 0, 0
>>>> 805000, 7424638, 1, 0, 0
>>>> 
>>>> I used a 375 GB local SSD (cache device) and a 1 TB network-attached storage (backing device). In the 1-hr run, GC starts happening about 10 minutes into the run and then happens at ~ 5 minute intervals. The stall duration ranges from a few seconds at the beginning to close to 40 seconds towards the end. Only about 1/2 to 2/3 of the cache is used by the end.
>>>> 
>>>> Note that this patch doesn't shorten the GC either. Instead, it just avoids GC from blocking the allocator thread by first sweeping the buckets and marking reclaimable ones quickly at the beginning of GC so the allocator can proceed while GC continues its actual job.
>>>> 
>>>> We are eagerly looking forward to this patch to be merged in this coming merge window that is expected to open in a week to two.
>>> 
>>> In order to avoid the no-space deadlock, normally there are around 10% space will not be allocated out. I need to look more close onto this patch.
>>> 
>>> 
>>> Dongsheng Yang,
>>> 
>>> Could you please post a new version based on current mainline kernel code ?
>> 
>> Hi Coly,
>>        Mingzhe will send a new version based on mainline.
>> 
>> Thanx
>>> 
>>> Thanks.
>>> 
>>> Coly Li
>>> 
>>> 
>>> 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: Kernel error with 6.8.9
  2024-05-12  9:41                                 ` Kernel error with 6.8.9 Pierre Juhen (IMAP)
@ 2024-05-13  7:57                                   ` Coly Li
  2024-05-17  0:34                                     ` Eric Wheeler
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-05-13  7:57 UTC (permalink / raw)
  To: Pierre Juhen (IMAP); +Cc: Bcache Linux



> On 2024-05-12 17:41, Pierre Juhen (IMAP) <pierre.juhen@orange.fr> wrote:
> 
> Hi,
> 
> I use bcache on an nvme partition as frontend and md array ass backend.
> 
> I have the following error since I updated to kernel 6.8.9.
> 
>  UBSAN: array-index-out-of-bounds in drivers/md/bcache/bset.c:1098:3
> [    7.138127] index 4 is out of range for type 'btree_iter_set [4]'
> [    7.138129] CPU: 9 PID: 645 Comm: bcache-register Not tainted 6.8.9-200.fc39.x86_64 #1
> [    7.138131] Hardware name: Gigabyte Technology Co., Ltd. B550M DS3H/B550M DS3H, BIOS F1 12/07/2022
> [    7.138133] Call Trace:
> [    7.138135]  <TASK>
> [    7.138137]  dump_stack_lvl+0x64/0x80
> [    7.138143]  __ubsan_handle_out_of_bounds+0x95/0xd0
> [    7.138148]  bch_btree_iter_push+0x4ca/0x4e0 [bcache]
> [    7.138160]  bch_btree_node_read_done+0xca/0x3f0 [bcache]
> [    7.138171]  bch_btree_node_read+0xe4/0x1d0 [bcache]
> [    7.138180]  ? __pfx_closure_sync_fn+0x10/0x10
> [    7.138183]  bch_btree_node_get.part.0+0x156/0x320 [bcache]
> [    7.138192]  ? __pfx_up_write+0x10/0x10
> [    7.138197]  register_bcache+0x1f31/0x2230 [bcache]
> [    7.138212]  kernfs_fop_write_iter+0x136/0x1d0
> [    7.138217]  vfs_write+0x29e/0x470
> [    7.138222]  ksys_write+0x6f/0xf0
> [    7.138224]  do_syscall_64+0x83/0x170
> [    7.138229]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138232]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138234]  ? xas_find+0x75/0x1d0
> [    7.138237]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138239]  ? next_uptodate_folio+0xa5/0x2e0
> [    7.138243]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138245]  ? filemap_map_pages+0x474/0x550
> [    7.138248]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138251]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138253]  ? do_fault+0x246/0x490
> [    7.138256]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138258]  ? __handle_mm_fault+0x827/0xe40
> [    7.138262]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138264]  ? __count_memcg_events+0x69/0x100
> [    7.138267]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138269]  ? count_memcg_events.constprop.0+0x1a/0x30
> [    7.138271]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138273]  ? handle_mm_fault+0xa2/0x360
> [    7.138275]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138277]  ? do_user_addr_fault+0x304/0x690
> [    7.138281]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138282]  ? srso_alias_return_thunk+0x5/0xfbef5
> [    7.138285]  entry_SYSCALL_64_after_hwframe+0x78/0x80
> [    7.138287] RIP: 0033:0x7f2dba570ee4
> [    7.138292] Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d 85 74 0d 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 4
> 8 89
> [    7.138293] RSP: 002b:00007ffe3e2f1df8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001
> [    7.138295] RAX: ffffffffffffffda RBX: 00007ffe3e2f1e6c RCX: 00007f2dba570ee4
> [    7.138297] RDX: 000000000000000f RSI: 00007ffe3e2f1e6c RDI: 0000000000000003
> [    7.138298] RBP: 00007ffe3e2f1e30 R08: 0000000000000073 R09: 0000000000000001
> [    7.138299] R10: 0000000000000000 R11: 0000000000000202 R12: 000000000000000f
> [    7.138300] R13: 00007ffe3e2f1e7b R14: 00007ffe3e2f1e6c R15: 00007ffe3e2f1e40
> [    7.138303]  </TASK>
> 
> The error is repeated  15 times while  reboot
> 
>  (I have a 12 threads processors).

The fix is in linux-next and will be in 6.10 as expected.

Thanks.

Coly Li


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-13  7:43                                 ` [PATCH v2] bcache: allow allocator to invalidate bucket in gc Coly Li
@ 2024-05-14  5:15                                   ` Robert Pang
  2024-05-14 23:39                                     ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Robert Pang @ 2024-05-14  5:15 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, 邹明哲, Bcache Linux

Dear Coly,

Thank you for your dedication in reviewing this patch. I understand my
previous message may have come across as urgent, but I want to
emphasize the significance of this bcache operational issue as it has
been reported by multiple users.

We understand the importance of thoroughness. To that end, we have
conducted extensive, repeated testing on this patch across a range of
cache sizes (375G/750G/1.5T/3T/6T/9TB) and CPU cores
(2/4/8/16/32/48/64/80/96/128) for an hour-long run. We tested various
workloads (read-only, read-write, and write-only) with 8kB I/O size.
In addition, we did a series of 16-hour runs with 750GB cache and 16
CPU cores. Our tests, primarily in writethrough mode, haven't revealed
any issues or deadlocks.

We hope this additional testing data proves helpful. Please let us
know if there are any other specific tests or configurations you would
like us to consider.

Thank you,
Robert


On Mon, May 13, 2024 at 12:43 AM Coly Li <colyli@suse.de> wrote:
>
>
>
> > On 2024-05-12 13:43, Robert Pang <robertpang@google.com> wrote:
> >
> > Hi Coly
> >
> > I see that Mingzhe has submitted the rebased patch [1]. Do you have a
> > chance to reproduce the stall and test the patch? Are we on track to
> > submit this patch upstream in the coming 6.10 merge window? Do you
> > need any help or more info?
> >
>
> Hi Robert,
>
> Please don’t push me. The first wave of bcache-6.10 is in linux-next now. For this patch, I need to do more pressure testing, to make me comfortable that no-space deadlock won’t be triggered.
>
> The testing is simple, using small I/O size (512Bytes to 4KB) to do random write on writeback mode cache for long time (24-48 hours), see whether there is any warning or deadlock happens.
>
> For me, my tests covers cache size from 256G/512G/1T/4T cache size with 20-24 CPU cores. If you may help to test on more machine and configuration, that will be helpful.
>
> I trust you and Zheming for the allocation latency measurement, now I need to confirm that offering allocation more priority than GC won’t trigger potential no-space deadlock in practice.
>
> Thanks.
>
> Coly Li
>
>
> >
> >
> > [1] https://lore.kernel.org/linux-bcache/1596418224.689.1715223543586.JavaMail.hmail@wm-bj-12-entmail-virt53.gy.ntes/T/#u
> >
> >
> > On Tue, May 7, 2024 at 7:34 PM Dongsheng Yang
> > <dongsheng.yang@easystack.cn> wrote:
> >>
> >>
> >>
> >> On Saturday 2024/5/4 at 11:08 AM, Coly Li wrote:
> >>>
> >>>
> >>>> On 2024-05-04 10:04, Robert Pang <robertpang@google.com> wrote:
> >>>>
> >>>> Hi Coly,
> >>>>
> >>>>> Can I know In which kernel version did you test the patch?
> >>>>
> >>>> I tested in both Linux kernels 5.10 and 6.1.
> >>>>
> >>>>> I didn’t observe obvious performance advantage of this patch.
> >>>>
> >>>> This patch doesn't improve bcache performance. Instead, it eliminates the IO stall in bcache that happens due to bch_allocator_thread() getting blocked and waiting on GC to finish when GC happens.
> >>>>
> >>>> /*
> >>>> * We've run out of free buckets, we need to find some buckets
> >>>> * we can invalidate. First, invalidate them in memory and add
> >>>> * them to the free_inc list:
> >>>> */
> >>>> retry_invalidate:
> >>>> allocator_wait(ca, ca->set->gc_mark_valid &&  <--------
> >>>>        !ca->invalidate_needs_gc);
> >>>> invalidate_buckets(ca);
> >>>>
> >>>> From what you showed, it looks like your rebase is good. As you already noticed, the original patch was based on 4.x kernel so the bucket traversal in btree.c needs to be adapted for 5.x and 6.x kernels. I attached the patch rebased to 6.9 HEAD for your reference.
> >>>>
> >>>> But to observe the IO stall before the patch, please test with a read-write workload so GC will happen periodically enough (read-only or read-mostly workload doesn't show the problem). For me, I used the "fio" utility to generate a random read-write workload as follows.
> >>>>
> >>>> # Pre-generate a 900GB test file
> >>>> $ truncate -s 900G test
> >>>>
> >>>> # Run random read-write workload for 1 hour
> >>>> $ fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=test --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> >>>>
> >>>> We include the flags "--write_lat_log=lat --log_avg_msec=1000 --log_max_value=1" so fio will dump the second-by-second max latency into a log file at the end of test so we can when stall happens and for how long:
> >>>>
> >>>
> >>> Copied. Thanks for the information. Let me try the above command lines on my local machine with longer time.
> >>>
> >>>
> >>>
> >>>> E.g.
> >>>>
> >>>> $ more lat_lat.1.log
> >>>> (format: <time-ms>,<max-latency-ns>,,,)
> >>>> ...
> >>>> 777000, 5155548, 0, 0, 0
> >>>> 778000, 105551, 1, 0, 0
> >>>> 802615, 24276019570, 0, 0, 0 <---- stalls for 24s with no IO possible
> >>>> 802615, 82134, 1, 0, 0
> >>>> 804000, 9944554, 0, 0, 0
> >>>> 805000, 7424638, 1, 0, 0
> >>>>
> >>>> I used a 375 GB local SSD (cache device) and a 1 TB network-attached storage (backing device). In the 1-hr run, GC starts happening about 10 minutes into the run and then happens at ~ 5 minute intervals. The stall duration ranges from a few seconds at the beginning to close to 40 seconds towards the end. Only about 1/2 to 2/3 of the cache is used by the end.
> >>>>
> >>>> Note that this patch doesn't shorten the GC either. Instead, it just avoids GC from blocking the allocator thread by first sweeping the buckets and marking reclaimable ones quickly at the beginning of GC so the allocator can proceed while GC continues its actual job.
> >>>>
> >>>> We are eagerly looking forward to this patch to be merged in this coming merge window that is expected to open in a week to two.
> >>>
> >>> In order to avoid the no-space deadlock, normally there are around 10% space will not be allocated out. I need to look more close onto this patch.
> >>>
> >>>
> >>> Dongsheng Yang,
> >>>
> >>> Could you please post a new version based on current mainline kernel code ?
> >>
> >> Hi Coly,
> >>        Mingzhe will send a new version based on mainline.
> >>
> >> Thanx
> >>>
> >>> Thanks.
> >>>
> >>> Coly Li
> >>>
> >>>
> >>>
>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-14  5:15                                   ` Robert Pang
@ 2024-05-14 23:39                                     ` Coly Li
  2024-05-17  0:30                                       ` Eric Wheeler
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-05-14 23:39 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, 邹明哲, Bcache Linux

On Mon, May 13, 2024 at 10:15:00PM -0700, Robert Pang wrote:
> Dear Coly,
>

Hi Robert,

Thanks for the email. Let me explain inline.
 
> Thank you for your dedication in reviewing this patch. I understand my
> previous message may have come across as urgent, but I want to
> emphasize the significance of this bcache operational issue as it has
> been reported by multiple users.
> 

What concerned me was still the testing itself. First of all, from the
following information, I see quite a lot of testing has been done. I do
appreciate the effort, which makes me confident in the quality of
this patch.

> We understand the importance of thoroughness, To that end, we have
> conducted extensive, repeated testing on this patch across a range of
> cache sizes (375G/750G/1.5T/3T/6T/9TB) and CPU cores
> (2/4/8/16/32/48/64/80/96/128) for an hour-long run. We tested various
> workloads (read-only, read-write, and write-only) with 8kB I/O size.
> In addition, we did a series of 16-hour runs with 750GB cache and 16
> CPU cores. Our tests, primarily in writethrough mode, haven't revealed
> any issues or deadlocks.
>

An hour-long run is not enough for bcache. Normally, for stability purposes,
at least 12-36 hours of continuous I/O pressure is necessary. Before Linux
v5.3, bcache would run into out-of-memory after 10-12 hours of heavy random
write workload on the server hardware Lenovo sponsored me.

This patch tends to give the allocator higher priority than the gc thread, so
I'd like to see what will happen when most of the cache space is allocated.

My testing is still on the Lenovo SR650. The cache device is 512G of Intel
Optane memory via the pmem driver, the backing device is a 4TB NVMe SSD,
and there are 2-way Intel Xeon processors with 48 cores and 160G DRAM on the
system. An XFS with default configuration is created on the writeback mode
bcache device, and the following fio job file is used:
[global]
direct=1
thread=1
lockmem=1
ioengine=libaio
random_generator=tausworthe64
group_reporting=1

[job0]
directory=/mnt/xfs/
readwrite=randwrite
numjobs=20
blocksize=4K/50:8K/30:16K/10:32K/10
iodepth=128
nrfiles=50
size=80G
time_based=1
runtime=36h

After around 10-12 hours, the cache space is almost exhausted, and all
I/Os bypass the cache and go directly to the backing device. At this
moment, the cache in use is around 96% (85% is dirty data; the rest might be
journal and btree nodes). This is as expected.

Then stop the fio task and wait for the writeback thread to flush all dirty
data into the backing device. Now the cache space is occupied by clean data
and btree nodes. Restart the fio writing task, and an unexpected
behavior can be observed: all I/Os still bypass the cache device and go
into the backing device directly, even though the cache only contains clean
data.

The above behavior turns out to be a bug in the existing bcache code. When
more than 95% of the cache space is used, all write I/Os bypass the
cache, so there is no chance to decrease the sectors counter to a
negative value and trigger garbage collection. The result is that clean data
occupies all the cache space but cannot be collected and re-allocated.
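
To make the failure mode easier to follow, here is a minimal model of the interaction described above, with hypothetical names and a made-up cutoff; it is not the actual bcache request-path code:

/* Illustrative model only, not the actual bcache request path. */
#include <stdbool.h>

/* Bypass writes above this in-use percentage (made-up name and value). */
#define TOY_CUTOFF_CACHE_ADD 95

struct toy_cache_set {
	int in_use_percent;  /* share of buckets holding data or metadata */
	long sectors_to_gc;  /* GC is woken once this drops below zero    */
	bool gc_requested;
};

static void toy_wake_gc(struct toy_cache_set *c)
{
	c->gc_requested = true;
}

static void toy_write(struct toy_cache_set *c, long sectors)
{
	/* Nearly full: the write bypasses the cache entirely. */
	if (c->in_use_percent > TOY_CUTOFF_CACHE_ADD)
		return;

	/* Only cached writes pull the counter down and eventually wake GC. */
	c->sectors_to_gc -= sectors;
	if (c->sectors_to_gc < 0)
		toy_wake_gc(c);
}

With every write taking the bypass branch, the counter never goes negative, so GC is never woken and the clean-but-occupied buckets are never invalidated, which matches the behavior described above.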

Before this patch, the above issue was a bit harder to reproduce. Since
this patch tends to give the allocator threads more priority than the gc
threads, with a very heavy write workload running for quite a long time it
is easier to observe the above no-space issue.

Now I have fixed it and the first 8-hour run looks fine. I am continuing with
another 12-hour run on the same hardware configuration at this moment.
 
> We hope this additional testing data proves helpful. Please let us
> know if there are any other specific tests or configurations you would
> like us to consider.

The above testing information is very helpful. Bcache is now widely
deployed on business-critical workloads, so long-running I/O pressure
testing is necessary; otherwise such a regression will escape our eyes.

Thanks. 

[snipped]

-- 
Coly Li

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-14 23:39                                     ` Coly Li
@ 2024-05-17  0:30                                       ` Eric Wheeler
  2024-05-17 16:06                                         ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Eric Wheeler @ 2024-05-17  0:30 UTC (permalink / raw)
  To: Coly Li; +Cc: Robert Pang, Dongsheng Yang, 邹明哲,
	Bcache Linux

On Wed, 15 May 2024, Coly Li wrote:
> On Mon, May 13, 2024 at 10:15:00PM -0700, Robert Pang wrote:
> > Dear Coly,
> >
> 
> Hi Robert,
> 
> Thanks for the email. Let me explain inline.
>  
> > Thank you for your dedication in reviewing this patch. I understand my
> > previous message may have come across as urgent, but I want to
> > emphasize the significance of this bcache operational issue as it has
> > been reported by multiple users.
> > 
> 
> What I concerned was still the testing itself. First of all, from the
> following information, I see quite a lot of testings are done. I do
> appreciate for the effort, which makes me confident for the quality of
> this patch.
> 
> > We understand the importance of thoroughness, To that end, we have
> > conducted extensive, repeated testing on this patch across a range of
> > cache sizes (375G/750G/1.5T/3T/6T/9TB) and CPU cores
> > (2/4/8/16/32/48/64/80/96/128) for an hour-long run. We tested various
> > workloads (read-only, read-write, and write-only) with 8kB I/O size.
> > In addition, we did a series of 16-hour runs with 750GB cache and 16
> > CPU cores. Our tests, primarily in writethrough mode, haven't revealed
> > any issues or deadlocks.
> >
> 
> An hour-long run is not enough for bcache. Normally for stability prupose
> at least 12-36 hours continue I/O pressure is necessary. Before Linux
> v5.3 bcache will run into out-of-memory after 10 ~ 12 hours heavy randome
> write workload on the server hardware Lenovo sponsored me.

FYI:

We have been running the v2 patch in production on 5 different servers 
containing a total of 8 bcache volumes since April 7th this year, applied 
to 6.6.25 and later kernels. Some servers run 4k sector sizes, and others 
run 512-byte sectors for the data volume. For the cache volumes, all
cache devices use 512-byte sectors.

The backing storage on these servers ranges from 40-350 terabytes, and the
cache sizes are in the 1-2 TB range.  We log kernel messages with 
netconsole into a centralized log server and have not had any bcache 
issues.


--
Eric Wheeler


> 
> This patch tends to offer high priority to allocator than gc thread, I'd
> like to see what will happen if most of the cache space are allocated.
> 
> In my testing, still on the Lenovo SR650. The cache device is 512G Intel
> optane memory by pmem driver, the backing device is a 4TB nvme SSD,
> there are 2-way Intel Xeon processors with 48 cores and 160G DRAM on the
> system. An XFS with default configuration created on the writeback mode
> bcache device, and following fio job file is used,
> [global]
> direct=1
> thread=1
> lockmem=1
> ioengine=libaio
> random_generator=tausworthe64
> group_reporting=1
> 
> [job0]
> directory=/mnt/xfs/
> readwrite=randwrite
> numjobs=20
> blocksize=4K/50:8K/30:16K/10:32K/10
> iodepth=128
> nrfiles=50
> size=80G
> time_based=1
> runtime=36h
> 
> After around 10~12 hours, the cache space is almost exhuasted, and all
> I/Os go bypass the cache and directly into the backing device. On this
> moment, cache in used is around 96% (85% is dirty data, rested might be
> journal and btree nodes). This is as expected.
> 
> Then stop the fio task, wait for writeback thread flush all dirty data
> into the backing device. Now the cache space is occupied by clean data
> and betree nodes. Now restart the fio writing task, an unexpected
> behavior can be observed: all I/Os still go bypass the cache device and
> into the backing device directly, even the cache only contains clean
> data.
> 
> The above behavior turns out to be a bug from existed bcache code. When
> cache space is used more than 95%, all write I/Os will go bypass the
> cache. So there won't be chance to decrease the sectors counter to be
> negative value to trigger garbage collection. The result is clean data
> occupies all cache space but cannot be collected and re-allocate again.
> 
> Before this patch, the above issue was a bit harder to produce. Since
> this patch trends to offer more priority to allocator threads than gc
> threads, with very high write workload for quite long time, it is more
> easier to observe the above no-space issue.
> 
> Now I fixed it and the first 8 hours run looks fine. I just continue
> another 12 hours run on the same hardware configuration at this moment.
>  
> > We hope this additional testing data proves helpful. Please let us
> > know if there are any other specific tests or configurations you would
> > like us to consider.
> 
> The above testing information is very helpful. And bcache now is widely
> deployed on business critical workload, I/O pressure testing with long
> time is necessary, otherwise such regression will escape from our eyes.
> 
> Thanks. 
> 
> [snipped]
> 
> -- 
> Coly Li
> 
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: Kernel error with 6.8.9
  2024-05-13  7:57                                   ` Coly Li
@ 2024-05-17  0:34                                     ` Eric Wheeler
  2024-05-17 15:57                                       ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Eric Wheeler @ 2024-05-17  0:34 UTC (permalink / raw)
  To: Coly Li; +Cc: Pierre Juhen (IMAP), Bcache Linux

[-- Attachment #1: Type: text/plain, Size: 691 bytes --]

On Mon, 13 May 2024, Coly Li wrote:

> 
> 
> > On 2024-05-12 17:41, Pierre Juhen (IMAP) <pierre.juhen@orange.fr> wrote:
> > 
> > Hi,
> > 
> > I use bcache on an nvme partition as frontend and md array ass backend.
> > 
> > I have the following error since I updated to kernel 6.8.9.
> > 
> >  UBSAN: array-index-out-of-bounds in drivers/md/bcache/bset.c:1098:3
> > [    7.138127] index 4 is out of range for type 'btree_iter_set [4]'
...
> 
> The fix is in linux-next and will be in 6.10 as expecting.

Thank you Coly!

Two questions:

	- What is the commit hash for this fix? 

	- Does it need to be backported to older kernels?


--
Eric Wheeler


> 
> Thanks.
> 
> Coly Li
> 
> 
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: Kernel error with 6.8.9
  2024-05-17  0:34                                     ` Eric Wheeler
@ 2024-05-17 15:57                                       ` Coly Li
  0 siblings, 0 replies; 32+ messages in thread
From: Coly Li @ 2024-05-17 15:57 UTC (permalink / raw)
  To: Eric Wheeler; +Cc: Pierre Juhen (IMAP), Bcache Linux



> On 2024-05-17 08:34, Eric Wheeler <bcache@lists.ewheeler.net> wrote:
> 
> On Mon, 13 May 2024, Coly Li wrote:
> 
>> 
>> 
>>> On 2024-05-12 17:41, Pierre Juhen (IMAP) <pierre.juhen@orange.fr> wrote:
>>> 
>>> Hi,
>>> 
>>> I use bcache on an nvme partition as frontend and md array ass backend.
>>> 
>>> I have the following error since I updated to kernel 6.8.9.
>>> 
>>> UBSAN: array-index-out-of-bounds in drivers/md/bcache/bset.c:1098:3
>>> [    7.138127] index 4 is out of range for type 'btree_iter_set [4]'
> ...
>> 
>> The fix is in linux-next and will be in 6.10 as expecting.
> 
> Thank you Coly!
> 
> Two questions:
> 
> - What is the commit hash for this fix? 

It is commit 3a861560ccb3 ("bcache: fix variable length array abuse in btree_iter") from Linus' tree.

> 
> - Does it need to be backported to older kernels?
> 

This is a patch to address the warning; the original code works fine. IMHO it is not mandatory to backport it to older kernels, but it is good to have if UBSAN also complains in that kernel version.
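
For context, here is a minimal sketch of the class of warning involved, using hypothetical struct names rather than the actual bset.c code. UBSAN flags any index beyond the declared bound of a fixed-size array member, even when the allocation behind it made room for more elements; a flexible array member is the usual way to express that intent without tripping the check:

/* Illustrative only: the pattern UBSAN flags, and the usual fix. */
#include <stdlib.h>

struct iter_fixed {
	unsigned int used;
	int data[4];   /* indexing data[4] or beyond trips UBSAN */
};

struct iter_flex {
	unsigned int used;
	int data[];    /* flexible array member: room comes from the allocation */
};

static struct iter_flex *iter_alloc(unsigned int n)
{
	struct iter_flex *it = malloc(sizeof(*it) + n * sizeof(int));

	if (it)
		it->used = 0;
	return it;
}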

Thanks.

Coly Li



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-17  0:30                                       ` Eric Wheeler
@ 2024-05-17 16:06                                         ` Coly Li
  2024-05-17 21:47                                           ` Eric Wheeler
  2024-05-24  7:14                                           ` Robert Pang
  0 siblings, 2 replies; 32+ messages in thread
From: Coly Li @ 2024-05-17 16:06 UTC (permalink / raw)
  To: Eric Wheeler
  Cc: Robert Pang, Dongsheng Yang, 邹明哲,
	Bcache Linux



> On 2024-05-17 08:30, Eric Wheeler <bcache@lists.ewheeler.net> wrote:
> 
> On Wed, 15 May 2024, Coly Li wrote:
>> On Mon, May 13, 2024 at 10:15:00PM -0700, Robert Pang wrote:
>>> Dear Coly,
>>> 
>> 
>> Hi Robert,
>> 
>> Thanks for the email. Let me explain inline.
>> 
>>> Thank you for your dedication in reviewing this patch. I understand my
>>> previous message may have come across as urgent, but I want to
>>> emphasize the significance of this bcache operational issue as it has
>>> been reported by multiple users.
>>> 
>> 
>> What I concerned was still the testing itself. First of all, from the
>> following information, I see quite a lot of testings are done. I do
>> appreciate for the effort, which makes me confident for the quality of
>> this patch.
>> 
>>> We understand the importance of thoroughness, To that end, we have
>>> conducted extensive, repeated testing on this patch across a range of
>>> cache sizes (375G/750G/1.5T/3T/6T/9TB) and CPU cores
>>> (2/4/8/16/32/48/64/80/96/128) for an hour-long run. We tested various
>>> workloads (read-only, read-write, and write-only) with 8kB I/O size.
>>> In addition, we did a series of 16-hour runs with 750GB cache and 16
>>> CPU cores. Our tests, primarily in writethrough mode, haven't revealed
>>> any issues or deadlocks.
>>> 
>> 
>> An hour-long run is not enough for bcache. Normally for stability prupose
>> at least 12-36 hours continue I/O pressure is necessary. Before Linux
>> v5.3 bcache will run into out-of-memory after 10 ~ 12 hours heavy randome
>> write workload on the server hardware Lenovo sponsored me.
> 
> FYI:
> 
> We have been running the v2 patch in production on 5 different servers 
> containing a total of 8 bcache volumes since April 7th this year, applied 
> to 6.6.25 and later kernels. Some servers run 4k sector sizes, and others 
> run 512-byte sectors for the data volume. For the cache volumes, their all 
> cache devices use 512 byte sectors.
> 
> The backing storage on these servers range from 40-350 terabytes, and the 
> cache sizes are in the 1-2 TB range.  We log kernel messages with 
> netconsole into a centralized log server and have not had any bcache 
> issues.


Thanks for the information.
The issue I stated didn’t generate a kernel message. It just causes all I/Os to bypass the almost fully occupied cache even though it holds only clean data.
Anyway, this is not directly caused by this patch; this patch just made it easier to reach such a situation before I found and fixed it.


And to all contributors (including Dongsheng, Mingzhe, Robert, Eric and others),

At this moment I see it works fine on my server. I am about to submit it to Jens next week, if no other issue pops up.

Thanks.

Coly Li

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-17 16:06                                         ` Coly Li
@ 2024-05-17 21:47                                           ` Eric Wheeler
  2024-05-24  7:14                                           ` Robert Pang
  1 sibling, 0 replies; 32+ messages in thread
From: Eric Wheeler @ 2024-05-17 21:47 UTC (permalink / raw)
  To: Coly Li; +Cc: Robert Pang, Dongsheng Yang, 邹明哲,
	Bcache Linux

[-- Attachment #1: Type: text/plain, Size: 3075 bytes --]

On Sat, 18 May 2024, Coly Li wrote:

> 
> 
> > On 2024-05-17 08:30, Eric Wheeler <bcache@lists.ewheeler.net> wrote:
> > 
> > On Wed, 15 May 2024, Coly Li wrote:
> >> On Mon, May 13, 2024 at 10:15:00PM -0700, Robert Pang wrote:
> >>> Dear Coly,
> >>> 
> >> 
> >> Hi Robert,
> >> 
> >> Thanks for the email. Let me explain inline.
> >> 
> >>> Thank you for your dedication in reviewing this patch. I understand my
> >>> previous message may have come across as urgent, but I want to
> >>> emphasize the significance of this bcache operational issue as it has
> >>> been reported by multiple users.
> >>> 
> >> 
> >> What I concerned was still the testing itself. First of all, from the
> >> following information, I see quite a lot of testings are done. I do
> >> appreciate for the effort, which makes me confident for the quality of
> >> this patch.
> >> 
> >>> We understand the importance of thoroughness, To that end, we have
> >>> conducted extensive, repeated testing on this patch across a range of
> >>> cache sizes (375G/750G/1.5T/3T/6T/9TB) and CPU cores
> >>> (2/4/8/16/32/48/64/80/96/128) for an hour-long run. We tested various
> >>> workloads (read-only, read-write, and write-only) with 8kB I/O size.
> >>> In addition, we did a series of 16-hour runs with 750GB cache and 16
> >>> CPU cores. Our tests, primarily in writethrough mode, haven't revealed
> >>> any issues or deadlocks.
> >>> 
> >> 
> >> An hour-long run is not enough for bcache. Normally for stability prupose
> >> at least 12-36 hours continue I/O pressure is necessary. Before Linux
> >> v5.3 bcache will run into out-of-memory after 10 ~ 12 hours heavy randome
> >> write workload on the server hardware Lenovo sponsored me.
> > 
> > FYI:
> > 
> > We have been running the v2 patch in production on 5 different servers 
> > containing a total of 8 bcache volumes since April 7th this year, applied 
> > to 6.6.25 and later kernels. Some servers run 4k sector sizes, and others 
> > run 512-byte sectors for the data volume. For the cache volumes, their all 
> > cache devices use 512 byte sectors.
> > 
> > The backing storage on these servers range from 40-350 terabytes, and the 
> > cache sizes are in the 1-2 TB range.  We log kernel messages with 
> > netconsole into a centralized log server and have not had any bcache 
> > issues.
> 
> 
> Thanks for the information. The issue I stated didn’t generate kernel 
> message. It just causes all I/Os bypass the almost fully occupied cache 
> even it is all clean data. Anyway this is not directly caused by this 
> patch, this patch just makes it more easier to arrive such situation 
> before I found and fixed it.

I am glad that you were able to fix it. Did you already post the patch
with that fix, or can you point me at a commit hash? I am eager to try
your fix.

--
Eric Wheeler


> 
> 
> And to all contributors (including Dongsheng, Mingzhe, Robert, Eric and others),
> 
> At this moment I see it works fine on my server. I am about to submit it to Jens next week, if no other issue pops up.
> 
> Thanks.
> 
> Coly Li

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-17 16:06                                         ` Coly Li
  2024-05-17 21:47                                           ` Eric Wheeler
@ 2024-05-24  7:14                                           ` Robert Pang
  2024-05-27 18:14                                             ` Coly Li
  1 sibling, 1 reply; 32+ messages in thread
From: Robert Pang @ 2024-05-24  7:14 UTC (permalink / raw)
  To: Coly Li
  Cc: Eric Wheeler, Dongsheng Yang, 邹明哲,
	Bcache Linux

Hi Coly,

I hope this email finds you well.

I wanted to express my appreciation for your work.  I was curious if
you've had a chance to submit the patch yet? If so, would you mind
sharing the link to the Git commit?

The reason I ask is that some downstream Linux distributions are eager
to incorporate this fix into their upcoming releases once it lands.
Any information you can provide would be greatly helpful in
coordinating those efforts.

Thank you again for your assistance and for your contribution to this project.

Best regards,
Robert

On Fri, May 17, 2024 at 9:06 AM Coly Li <colyli@suse.de> wrote:
>
> And to all contributors (including Dongsheng, Mingzhe, Robert, Eric and others),
>
> At this moment I see it works fine on my server. I am about to submit it to Jens next week, if no other issue pops up.
>
> Thanks.
>
> Coly Li

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-24  7:14                                           ` Robert Pang
@ 2024-05-27 18:14                                             ` Coly Li
  2024-05-28  5:50                                               ` Robert Pang
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-05-27 18:14 UTC (permalink / raw)
  To: Robert Pang
  Cc: Eric Wheeler, Dongsheng Yang, 邹明哲,
	Bcache Linux



> On 2024-05-24 15:14, Robert Pang <robertpang@google.com> wrote:
> 
> Hi Coly,
> 
> I hope this email finds you well.
> 
> I wanted to express my appreciation for your work.  I was curious if
> you've had a chance to submit the patch yet? If so, would you mind
> sharing the link to the Git commit?
> 

The fix from me was posted on the linux-bcache mailing list just a moment ago.


> The reason I ask is that some downstream Linux distributions are eager
> to incorporate this fix into their upcoming releases once it lands.

Can I know which Linux distributions are waiting for this? I just wonder, and would like to know more Linux distributions that officially support bcache.

> Any information you can provide would be greatly helpful in
> coordinating those efforts.


The test and code review from my side are done. It is in my for-next branch, and I will submit it upstream soon if there is no complaint from the kernel test robot.

Thanks.

Coly Li





^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-27 18:14                                             ` Coly Li
@ 2024-05-28  5:50                                               ` Robert Pang
  2024-05-29 16:24                                                 ` Coly Li
  0 siblings, 1 reply; 32+ messages in thread
From: Robert Pang @ 2024-05-28  5:50 UTC (permalink / raw)
  To: Coly Li
  Cc: Eric Wheeler, Dongsheng Yang, 邹明哲,
	Bcache Linux

On Mon, May 27, 2024 at 11:14 AM Coly Li <colyli@suse.de> wrote:
>
> > On May 24, 2024, at 15:14, Robert Pang <robertpang@google.com> wrote:
> >
> > Hi Coly,
> >
> > I hope this email finds you well.
> >
> > I wanted to express my appreciation for your work.  I was curious if
> > you've had a chance to submit the patch yet? If so, would you mind
> > sharing the link to the Git commit?
> >
>
> The fix from me was posted on the linux-bcache mailing list just a moment ago.

Thank you for that fix also. I appreciate your diligence in resolving
this stuck bypass.

> > The reason I ask is that some downstream Linux distributions are eager
> > to incorporate this fix into their upcoming releases once it lands.
>
> Can I know which Linux distributions are waiting for this? I am just wondering, and would like to know which Linux distributions officially support bcache.

It is the Container-Optimized OS.

https://cloud.google.com/container-optimized-os/docs/legacy-release-notes#gci-dev-54-8711-0-0

> > Any information you can provide would be greatly helpful in
> > coordinating those efforts.
>
>
> The test and code review from my side are done. It is in my for-next branch, and I will submit it upstream soon if there is no complaint from the kernel test robot.

Great to hear that. Do you have an estimate of when the testing will
finish and the patch can be submitted?

Best regards
Robert

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-28  5:50                                               ` Robert Pang
@ 2024-05-29 16:24                                                 ` Coly Li
  2024-06-03  7:04                                                   ` Robert Pang
  0 siblings, 1 reply; 32+ messages in thread
From: Coly Li @ 2024-05-29 16:24 UTC (permalink / raw)
  To: Robert Pang
  Cc: Eric Wheeler, Dongsheng Yang, 邹明哲,
	Bcache Linux



> On May 28, 2024, at 13:50, Robert Pang <robertpang@google.com> wrote:
> 
> On Mon, May 27, 2024 at 11:14 AM Coly Li <colyli@suse.de> wrote:
>> 
>>> On May 24, 2024, at 15:14, Robert Pang <robertpang@google.com> wrote:
>>> 
>>> Hi Coly,
>>> 
>>> I hope this email finds you well.
>>> 
>>> I wanted to express my appreciation for your work.  I was curious if
>>> you've had a chance to submit the patch yet? If so, would you mind
>>> sharing the link to the Git commit?
>>> 
>> 
>> The fix from me was posted on the linux-bcache mailing list just a moment ago.
> 
> Thank you for that fix also. Appreciate your diligence in resolving
> this stuck bypass.
> 
>>> The reason I ask is that some downstream Linux distributions are eager
>>> to incorporate this fix into their upcoming releases once it lands.
>> 
>> Can I know which Linux distributions are waiting for this? I am just wondering, and would like to know which Linux distributions officially support bcache.
> 
> It is the Container-Optimized OS.
> 
> https://cloud.google.com/container-optimized-os/docs/legacy-release-notes#gci-dev-54-8711-0-0
> 
>>> Any information you can provide would be greatly helpful in
>>> coordinating those efforts.
>> 
>> 
>> The test and code review from my side are done. It is in my for-next branch, and I will submit it upstream soon if there is no complaint from the kernel test robot.
> 
> Great to hear that. Any estimate when the test will finish and the
> patch can submit?


It is in linux-block already, and it will be in the next -rc quite soon, as expected.

Thanks.

Coly Li


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-29 16:24                                                 ` Coly Li
@ 2024-06-03  7:04                                                   ` Robert Pang
  0 siblings, 0 replies; 32+ messages in thread
From: Robert Pang @ 2024-06-03  7:04 UTC (permalink / raw)
  To: Coly Li
  Cc: Eric Wheeler, Dongsheng Yang, 邹明哲,
	Bcache Linux

Hi Coly

I am pleased to see 6.10-rc2 released today with this patch. I really
want to thank you and Dongsheng for this patch and your contributions
to bcache. Much appreciated.

Best regards
Robert

On Wed, May 29, 2024 at 6:24 AM Coly Li <colyli@suse.de> wrote:
>
> > On May 28, 2024, at 13:50, Robert Pang <robertpang@google.com> wrote:
> >
> > Great to hear that. Do you have an estimate of when the testing will
> > finish and the patch can be submitted?
>
> It is in linux-block already, and it will be in the next -rc quite soon, as expected.
>
> Thanks.
> Coly Li

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2024-06-03  7:04 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-10 11:21 [PATCH] bcache: allow allocator to invalidate bucket in gc Dongsheng Yang
2020-09-10 11:28 ` [PATCH v2] " Dongsheng Yang
2020-09-18  9:53   ` Coly Li
2024-03-15 22:45     ` Robert Pang
2024-03-16  2:48       ` Coly Li
2024-03-17  5:41         ` Robert Pang
2024-03-17 13:59           ` Coly Li
2024-03-18  6:16             ` Robert Pang
2024-03-28 18:05               ` Robert Pang
2024-03-29 13:00                 ` Coly Li
2024-04-11  6:44                   ` Robert Pang
2024-05-03 18:23                     ` Coly Li
2024-05-03 18:28                       ` Coly Li
2024-05-04  2:04                         ` Robert Pang
2024-05-04  3:08                           ` Coly Li
2024-05-08  2:34                             ` Dongsheng Yang
2024-05-12  5:43                               ` Robert Pang
2024-05-12  9:41                                 ` Kernel error with 6.8.9 Pierre Juhen (IMAP)
2024-05-13  7:57                                   ` Coly Li
2024-05-17  0:34                                     ` Eric Wheeler
2024-05-17 15:57                                       ` Coly Li
2024-05-13  7:43                                 ` [PATCH v2] bcache: allow allocator to invalidate bucket in gc Coly Li
2024-05-14  5:15                                   ` Robert Pang
2024-05-14 23:39                                     ` Coly Li
2024-05-17  0:30                                       ` Eric Wheeler
2024-05-17 16:06                                         ` Coly Li
2024-05-17 21:47                                           ` Eric Wheeler
2024-05-24  7:14                                           ` Robert Pang
2024-05-27 18:14                                             ` Coly Li
2024-05-28  5:50                                               ` Robert Pang
2024-05-29 16:24                                                 ` Coly Li
2024-06-03  7:04                                                   ` Robert Pang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).