* [PATCH v4 1/2] bcache: Separate bch_moving_gc() from bch_btree_gc()
  From: Mingzhe Zou <zoumingzhe@qq.com>
  Date: 2023-07-10 7:07 UTC
  To: colyli, linux-bcache
  Cc: bcache, zoumingzhe

Moving gc uses cache->heap to defragment the disk. Unlike btree gc, moving gc
only takes up part of the disk bandwidth. The heap size is constant, but the
number of buckets released by each moving gc is limited, so bch_moving_gc()
needs to be called multiple times. If bch_gc_thread() always calls
bch_btree_gc(), it will block I/O requests. This patch allows bch_gc_thread()
to call only bch_moving_gc() when there are many fragments.

Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
---
 drivers/md/bcache/bcache.h   |  4 +-
 drivers/md/bcache/btree.c    | 73 ++++++++++++++++++++++++++++++++++--
 drivers/md/bcache/movinggc.c |  7 +++-
 drivers/md/bcache/super.c    |  2 +
 4 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5a79bb3c272f..155deff0ce05 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -461,7 +461,8 @@ struct cache {
	 * until a gc finishes - otherwise we could pointlessly burn a ton of
	 * cpu
	 */
-	unsigned int		invalidate_needs_gc;
+	unsigned int		invalidate_needs_gc:1;
+	unsigned int		only_moving_gc:1;

	bool			discard; /* Get rid of? */

@@ -629,6 +630,7 @@ struct cache_set {
	struct gc_stat		gc_stats;
	size_t			nbuckets;
	size_t			avail_nbuckets;
+	size_t			fragment_nbuckets;

	struct task_struct	*gc_thread;
	/* Where in the btree gc currently is */
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index fd121a61f17c..475ae69b1916 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -88,6 +88,7 @@
 * Test module load/unload
 */

+#define COPY_GC_PERCENT	5
 #define MAX_NEED_GC		64
 #define MAX_SAVE_PRIO		72
 #define MAX_GC_TIMES		100
@@ -1726,6 +1727,7 @@ static void btree_gc_start(struct cache_set *c)

	mutex_lock(&c->bucket_lock);

+	set_gc_sectors(c);
	c->gc_mark_valid = 0;
	c->gc_done = ZERO_KEY;

@@ -1846,8 +1848,58 @@ static void bch_btree_gc(struct cache_set *c)
	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));

	trace_bcache_gc_end(c);
+}
+
+extern unsigned int bch_cutoff_writeback;
+extern unsigned int bch_cutoff_writeback_sync;
+
+static bool moving_gc_should_run(struct cache_set *c)
+{
+	struct bucket *b;
+	struct cache *ca = c->cache;
+	size_t moving_gc_threshold = ca->sb.bucket_size >> 2, frag_percent;
+	unsigned long used_buckets = 0, frag_buckets = 0, move_buckets = 0;
+	unsigned long dirty_sectors = 0, frag_sectors = 0, used_sectors = 0;
+
+	mutex_lock(&c->bucket_lock);
+	for_each_bucket(b, ca) {
+		if (GC_MOVE(b) || GC_MARK(b) != GC_MARK_DIRTY)
+			continue;
+
+		used_buckets++;
+
+		used_sectors = GC_SECTORS_USED(b);
+		dirty_sectors += used_sectors;
+
+		if (used_sectors < ca->sb.bucket_size)
+			frag_buckets++;
+
+		if (used_sectors <= moving_gc_threshold)
+			move_buckets++;
+	}
+	mutex_unlock(&c->bucket_lock);
+
+	c->fragment_nbuckets = frag_buckets;

-	bch_moving_gc(c);
+	if (used_buckets < c->nbuckets * bch_cutoff_writeback / 100)
+		return false;
+
+	if (move_buckets > ca->heap.size)
+		return true;
+
+	frag_sectors = used_buckets * ca->sb.bucket_size - dirty_sectors;
+	frag_percent = div_u64(frag_sectors * 100, ca->sb.bucket_size * c->nbuckets);
+
+	if (frag_percent >= COPY_GC_PERCENT)
+		return true;
+
+	if (used_buckets > c->nbuckets * bch_cutoff_writeback_sync / 100)
+		return true;
+
+	if (c->gc_stats.in_use > bch_cutoff_writeback_sync && frag_buckets > 0)
+		return true;
+
+	return false;
 }

 static bool gc_should_run(struct cache_set *c)
@@ -1860,6 +1912,19 @@ static bool gc_should_run(struct cache_set *c)
	if (atomic_read(&c->sectors_to_gc) < 0)
		return true;

+	/*
+	 * Moving gc uses cache->heap to defragment the disk. Unlike btree
+	 * gc, moving gc only takes up part of the disk bandwidth. The heap
+	 * size is constant, but the number of buckets released by each
+	 * moving gc is limited, so bch_moving_gc() needs to be called
+	 * multiple times. If bch_gc_thread() always calls bch_btree_gc(),
+	 * it will block I/O requests.
+	 */
+	if (c->copy_gc_enabled && moving_gc_should_run(c)) {
+		ca->only_moving_gc = 1;
+		return true;
+	}
+
	return false;
 }

@@ -1877,8 +1942,10 @@ static int bch_gc_thread(void *arg)
		    test_bit(CACHE_SET_IO_DISABLE, &c->flags))
			break;

-		set_gc_sectors(c);
-		bch_btree_gc(c);
+		if (!c->cache->only_moving_gc)
+			bch_btree_gc(c);
+
+		bch_moving_gc(c);
	}

	wait_for_kthread_stop();
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 9f32901fdad1..93a449226f36 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -200,6 +200,8 @@ void bch_moving_gc(struct cache_set *c)
	struct bucket *b;
	unsigned long sectors_to_move, reserve_sectors;

+	c->cache->only_moving_gc = 0;
+
	if (!c->copy_gc_enabled)
		return;

@@ -212,7 +214,7 @@ void bch_moving_gc(struct cache_set *c)
	ca->heap.used = 0;

	for_each_bucket(b, ca) {
-		if (GC_MARK(b) == GC_MARK_METADATA ||
+		if (GC_MOVE(b) || GC_MARK(b) == GC_MARK_METADATA ||
		    !GC_SECTORS_USED(b) ||
		    GC_SECTORS_USED(b) == ca->sb.bucket_size ||
		    atomic_read(&b->pin))
@@ -235,6 +237,9 @@ void bch_moving_gc(struct cache_set *c)
		sectors_to_move -= GC_SECTORS_USED(b);
	}

+	pr_info("moving gc: on set %pU, %lu sectors from %zu buckets",
+		c->sb.set_uuid, sectors_to_move, ca->heap.used);
+
	while (heap_pop(&ca->heap, b, bucket_cmp))
		SET_GC_MOVE(b, 1);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 0ae2b3676293..7e556bc0ec04 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2112,6 +2112,8 @@ static int run_cache_set(struct cache_set *c)
	if (bch_gc_thread_start(c))
		goto err;

+	force_wake_up_gc(c);
+
	closure_sync(&cl);
	c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
	bcache_write_super(c);
--
2.17.1.windows.2
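
For readers who want to experiment with the new trigger heuristic without
building the kernel, here is a minimal userspace model of the decision logic
in moving_gc_should_run(). It is a sketch, not driver code: the struct and all
of its field values (bucket counts, sector totals, the 30/70 cutoff
percentages standing in for bch_cutoff_writeback and bch_cutoff_writeback_sync,
the heap size) are hypothetical example numbers, not values read from a real
cache set.

/* gc_model.c - userspace sketch of the moving-gc trigger heuristic */
#include <stdbool.h>
#include <stdio.h>

#define COPY_GC_PERCENT 5

struct model {
        unsigned long nbuckets;             /* total buckets in the cache */
        unsigned long bucket_size;          /* sectors per bucket */
        unsigned long heap_size;            /* entries in cache->heap */
        unsigned long used_buckets;         /* buckets marked dirty */
        unsigned long dirty_sectors;        /* sum of used sectors over dirty buckets */
        unsigned long move_buckets;         /* dirty buckets at most 1/4 full */
        unsigned long frag_buckets;         /* dirty buckets that are not completely full */
        unsigned int cutoff_writeback;      /* hypothetical: 30 (%) */
        unsigned int cutoff_writeback_sync; /* hypothetical: 70 (%) */
        unsigned int in_use;                /* gc_stats.in_use (%) */
};

static bool moving_gc_should_run(const struct model *m)
{
        unsigned long frag_sectors, frag_percent;

        /* Too little dirty data for moving gc to be worth running. */
        if (m->used_buckets < m->nbuckets * m->cutoff_writeback / 100)
                return false;

        /* Enough sparsely used buckets to fill the whole heap. */
        if (m->move_buckets > m->heap_size)
                return true;

        /* Internal fragmentation reaches COPY_GC_PERCENT of the cache. */
        frag_sectors = m->used_buckets * m->bucket_size - m->dirty_sectors;
        frag_percent = frag_sectors * 100 / (m->bucket_size * m->nbuckets);
        if (frag_percent >= COPY_GC_PERCENT)
                return true;

        /* Dirty buckets approach the writeback-sync cutoff. */
        if (m->used_buckets > m->nbuckets * m->cutoff_writeback_sync / 100)
                return true;

        /* Cache nearly full and at least one fragmented bucket exists. */
        if (m->in_use > m->cutoff_writeback_sync && m->frag_buckets > 0)
                return true;

        return false;
}

int main(void)
{
        struct model m = {
                .nbuckets = 100000, .bucket_size = 1024, .heap_size = 1000,
                .used_buckets = 40000, .dirty_sectors = 32000000,
                .move_buckets = 500, .frag_buckets = 6000,
                .cutoff_writeback = 30, .cutoff_writeback_sync = 70,
                .in_use = 45,
        };

        printf("run moving gc only: %s\n",
               moving_gc_should_run(&m) ? "yes" : "no");
        return 0;
}

Compile with "cc -o gc_model gc_model.c" and adjust the numbers to see which
of the five conditions fires first for a given fill level and fragmentation.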
* [PATCH v4 2/2] bcache: only copy dirty data during moving gc
  From: Mingzhe Zou <zoumingzhe@qq.com>
  Date: 2023-07-10 7:07 UTC
  To: colyli, linux-bcache
  Cc: bcache, zoumingzhe
  In reply to: [PATCH v4 1/2] bcache: Separate bch_moving_gc() from bch_btree_gc()

When we want to shorten the moving gc interval, we must consider its impact on
performance and cache life. SSD and NVMe devices usually rate their lifespan
by write cycles. Copying only dirty data during moving gc reduces the amount
of data written, which improves moving gc speed and extends cache life.

Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
---
 drivers/md/bcache/alloc.c    |  2 ++
 drivers/md/bcache/bcache.h   |  3 ++-
 drivers/md/bcache/btree.c    | 12 ++++++++++--
 drivers/md/bcache/movinggc.c | 16 ++++++++--------
 4 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ce13c272c387..b6215cddef5b 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -447,6 +447,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
	BUG_ON(atomic_read(&b->pin) != 1);

	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
+	SET_GC_SECTORS_DIRTY(b, 0);

	if (reserve <= RESERVE_PRIO) {
		SET_GC_MARK(b, GC_MARK_METADATA);
@@ -470,6 +471,7 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b)
 {
	SET_GC_MARK(b, 0);
	SET_GC_SECTORS_USED(b, 0);
+	SET_GC_SECTORS_DIRTY(b, 0);

	if (ca->set->avail_nbuckets < ca->set->nbuckets) {
		ca->set->avail_nbuckets++;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 155deff0ce05..a215d89761ba 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -199,7 +199,7 @@ struct bucket {
	uint16_t	prio;
	uint8_t		gen;
	uint8_t		last_gc; /* Most out of date gen in the btree */
-	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
+	uint32_t	gc_mark; /* Bitfield used by GC. See below for field */
 };

 /*
@@ -215,6 +215,7 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
 #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))
 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
 BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
+BITMASK(GC_SECTORS_DIRTY, struct bucket, gc_mark, 16, GC_SECTORS_USED_SIZE);

 #include "journal.h"
 #include "stats.h"
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 475ae69b1916..4e7e66c9f542 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1250,8 +1250,13 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,

		if (level)
			SET_GC_MARK(g, GC_MARK_METADATA);
-		else if (KEY_DIRTY(k))
+		else if (KEY_DIRTY(k)) {
+			SET_GC_SECTORS_DIRTY(g, GC_SECTORS_DIRTY(g) + KEY_SIZE(k));
			SET_GC_MARK(g, GC_MARK_DIRTY);
+
+			BUG_ON(GC_SECTORS_DIRTY(g) < KEY_SIZE(k) ||
+			       GC_SECTORS_DIRTY(g) > c->cache->sb.bucket_size);
+		}
		else if (!GC_MARK(g))
			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);

@@ -1260,7 +1265,8 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
			GC_SECTORS_USED(g) + KEY_SIZE(k),
			MAX_GC_SECTORS_USED));

-		BUG_ON(!GC_SECTORS_USED(g));
+		BUG_ON(GC_SECTORS_USED(g) < KEY_SIZE(k) ||
+		       GC_SECTORS_USED(g) > c->cache->sb.bucket_size);
	}

	return stale;
@@ -1738,6 +1744,7 @@ static void btree_gc_start(struct cache_set *c)
			SET_GC_MARK(b, 0);
			SET_GC_SECTORS_USED(b, 0);
		}
+		SET_GC_SECTORS_DIRTY(b, 0);
	}

	mutex_unlock(&c->bucket_lock);
@@ -1800,6 +1807,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
			continue;

		BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+		BUG_ON(!GC_MARK(b) && GC_SECTORS_DIRTY(b));

		if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
			c->avail_nbuckets++;
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 93a449226f36..ad54cdde2554 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -27,7 +27,7 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i) &&
-		    GC_MOVE(PTR_BUCKET(c, k, i)))
+		    GC_MOVE(PTR_BUCKET(c, k, i)) && KEY_DIRTY(k))
			return true;

	return false;
@@ -184,14 +184,14 @@ err:		if (!IS_ERR_OR_NULL(w->private))

 static bool bucket_cmp(struct bucket *l, struct bucket *r)
 {
-	return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
+	return GC_SECTORS_DIRTY(l) < GC_SECTORS_DIRTY(r);
 }

 static unsigned int bucket_heap_top(struct cache *ca)
 {
	struct bucket *b;

-	return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
+	return (b = heap_peek(&ca->heap)) ? GC_SECTORS_DIRTY(b) : 0;
 }

 void bch_moving_gc(struct cache_set *c)
@@ -215,17 +215,17 @@ void bch_moving_gc(struct cache_set *c)

	for_each_bucket(b, ca) {
		if (GC_MOVE(b) || GC_MARK(b) == GC_MARK_METADATA ||
-		    !GC_SECTORS_USED(b) ||
-		    GC_SECTORS_USED(b) == ca->sb.bucket_size ||
+		    !GC_SECTORS_DIRTY(b) ||
+		    GC_SECTORS_DIRTY(b) == ca->sb.bucket_size ||
		    atomic_read(&b->pin))
			continue;

		if (!heap_full(&ca->heap)) {
-			sectors_to_move += GC_SECTORS_USED(b);
+			sectors_to_move += GC_SECTORS_DIRTY(b);
			heap_add(&ca->heap, b, bucket_cmp);
		} else if (bucket_cmp(b, heap_peek(&ca->heap))) {
			sectors_to_move -= bucket_heap_top(ca);
-			sectors_to_move += GC_SECTORS_USED(b);
+			sectors_to_move += GC_SECTORS_DIRTY(b);

			ca->heap.data[0] = b;
			heap_sift(&ca->heap, 0, bucket_cmp);
@@ -234,7 +234,7 @@

	while (sectors_to_move > reserve_sectors) {
		heap_pop(&ca->heap, b, bucket_cmp);
-		sectors_to_move -= GC_SECTORS_USED(b);
+		sectors_to_move -= GC_SECTORS_DIRTY(b);
	}

	pr_info("moving gc: on set %pU, %lu sectors from %zu buckets",
--
2.17.1.windows.2
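
The reason this patch widens bucket->gc_mark from uint16_t to uint32_t is that
the new GC_SECTORS_DIRTY field starts at bit 16 and is as wide as
GC_SECTORS_USED. The standalone sketch below shows the resulting layout in
userspace; its BITMASK() macro is a local re-implementation in the style of
the bcache helper (not the kernel's definition), and it assumes
GC_SECTORS_USED_SIZE is 13 as in current bcache sources. The sector counts in
main() are hypothetical.

/* gc_mark_layout.c - userspace sketch of the widened gc_mark bitfield */
#include <stdint.h>
#include <stdio.h>

#define GC_SECTORS_USED_SIZE 13

struct bucket {
        uint32_t gc_mark;       /* was uint16_t before this patch */
};

/* Generate get/set helpers for a bitfield inside bucket->gc_mark. */
#define BITMASK(name, offset, size)                                     \
static inline uint64_t name(const struct bucket *b)                     \
{ return (b->gc_mark >> (offset)) & ~(~0ULL << (size)); }               \
static inline void SET_##name(struct bucket *b, uint64_t v)             \
{                                                                       \
        b->gc_mark &= ~(~(~0ULL << (size)) << (offset));                \
        b->gc_mark |= (v & ~(~0ULL << (size))) << (offset);             \
}

/* Layout: bits 0-1 mark, 2-14 sectors used, 15 move, 16-28 sectors dirty. */
BITMASK(GC_MARK, 0, 2)
BITMASK(GC_SECTORS_USED, 2, GC_SECTORS_USED_SIZE)
BITMASK(GC_MOVE, 15, 1)
BITMASK(GC_SECTORS_DIRTY, 16, GC_SECTORS_USED_SIZE)

int main(void)
{
        struct bucket b = { 0 };

        SET_GC_SECTORS_USED(&b, 1024);  /* hypothetical sector counts */
        SET_GC_SECTORS_DIRTY(&b, 768);
        SET_GC_MOVE(&b, 1);

        /* The dirty count occupies bits 16-28, beyond what a u16 can hold. */
        printf("gc_mark=0x%08x used=%llu dirty=%llu move=%llu\n",
               (unsigned)b.gc_mark,
               (unsigned long long)GC_SECTORS_USED(&b),
               (unsigned long long)GC_SECTORS_DIRTY(&b),
               (unsigned long long)GC_MOVE(&b));
        return 0;
}

With a 13-bit dirty count placed at offset 16 the field ends at bit 28, which
is why a 16-bit gc_mark can no longer hold all four fields.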