* [Bcache v13 05/16] Export get_random_int()
From: Kent Overstreet @ 2012-05-10 3:09 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
This is needed for bcache - it needs a fast source of random numbers for
the throttling code in order to fuzz IO sizes, but the random numbers
don't need to be any good (bcache immediately converts them to
binomially distributed random numbers with popcount anyway).
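As a rough illustration only (none of this code is in the patch; the helper
name and the jitter arithmetic are made up), the popcount trick the text above
refers to looks something like this - the popcount of a uniform 32-bit value
is binomially distributed with n = 32, p = 1/2:

#include <linux/bitops.h>	/* hweight32() */
#include <linux/kernel.h>	/* max_t() */
#include <linux/random.h>	/* get_random_int() */

static unsigned int fuzz_io_size(unsigned int base_sectors)
{
	/* hweight32() of a uniform random word: 0..32, mean 16 */
	int jitter = hweight32(get_random_int()) - 16;

	/* wobble the IO size around its base, never below one sector */
	return max_t(int, 1, (int) base_sectors + jitter);
}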
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/char/random.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 54ca8b2..ec4dd79 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1357,6 +1357,7 @@ unsigned int get_random_int(void)
return ret;
}
+EXPORT_SYMBOL(get_random_int);
/*
* randomize_range() returns a start address such that
--
1.7.9.rc2
* [Bcache v13 01/16] Only clone bio vecs that are in use
From: Kent Overstreet @ 2012-05-10 3:08 UTC (permalink / raw)
To: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA
Cc: tejun-hpIqsD4AKlfQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA
Bcache creates large bios internally, and then splits them according to
the requirements of the underlying device. If the underlying device then
needs to clone the bio, the clone will fail if the original bio had more
than 256 segments - even if the number of segments actually in use
(bi_vcnt - bi_idx) was smaller.
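As a rough sketch of the idea (illustrative only - this helper does not exist
in the patch): size the clone by the bvecs actually in use, where
bio_segments() is bi_vcnt - bi_idx in this era, rather than by the allocated
array size bi_max_vecs:

#include <linux/bio.h>

static struct bio *clone_in_use_vecs(struct bio *bio_src, gfp_t gfp_mask,
				     struct bio_set *bs)
{
	/* allocate only as many bvecs as the source bio still has in flight */
	struct bio *clone = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);

	if (clone)
		__bio_clone(clone, bio_src);

	return clone;
}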
Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
block/blk-core.c | 2 +-
drivers/block/rbd.c | 2 +-
drivers/md/dm.c | 27 ++++++++++-----------------
drivers/md/md.c | 3 ++-
fs/bio.c | 46 +++++++++++++++++++++++-----------------------
include/linux/bio.h | 7 ++++++-
6 files changed, 43 insertions(+), 44 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 3a78b00..87fd3f1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2659,7 +2659,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
blk_rq_init(NULL, rq);
__rq_for_each_bio(bio_src, rq_src) {
- bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
+ bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
if (!bio)
goto free_and_out;
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a6278e7..d34e859 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -699,7 +699,7 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
}
while (old_chain && (total < len)) {
- tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+ tmp = bio_kmalloc(gfpmask, bio_segments(old_chain));
if (!tmp)
goto err_out;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b89c548..0785fab 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1078,28 +1078,22 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
* Creates a bio that consists of range of complete bvecs.
*/
static struct bio *clone_bio(struct bio *bio, sector_t sector,
- unsigned short idx, unsigned short bv_count,
+ unsigned short bv_count,
unsigned int len, struct bio_set *bs)
{
struct bio *clone;
- clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
- __bio_clone(clone, bio);
- clone->bi_destructor = dm_bio_destructor;
+ clone = bio_clone_bioset(bio, GFP_NOIO, bs);
clone->bi_sector = sector;
- clone->bi_idx = idx;
- clone->bi_vcnt = idx + bv_count;
+ clone->bi_vcnt = bv_count;
clone->bi_size = to_bytes(len);
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
-
- if (bio_integrity(bio)) {
- bio_integrity_clone(clone, bio, GFP_NOIO, bs);
-
+#if 0
+ if (bio_integrity(bio))
if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
bio_integrity_trim(clone,
bio_sector_offset(bio, idx, 0), len);
- }
-
+#endif
return clone;
}
@@ -1128,9 +1122,8 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
* and discard, so no need for concern about wasted bvec allocations.
*/
- clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
- __bio_clone(clone, ci->bio);
- clone->bi_destructor = dm_bio_destructor;
+ clone = bio_clone_bioset(ci->bio, GFP_NOIO, ci->md->bs);
+
if (len) {
clone->bi_sector = ci->sector;
clone->bi_size = to_bytes(len);
@@ -1169,7 +1162,7 @@ static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
struct dm_target_io *tio;
tio = alloc_tio(ci, ti);
- clone = clone_bio(bio, ci->sector, ci->idx,
+ clone = clone_bio(bio, ci->sector,
bio->bi_vcnt - ci->idx, ci->sector_count,
ci->md->bs);
__map_bio(ti, clone, tio);
@@ -1248,7 +1241,7 @@ static int __clone_and_map(struct clone_info *ci)
}
tio = alloc_tio(ci, ti);
- clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
+ clone = clone_bio(bio, ci->sector, i - ci->idx, len,
ci->md->bs);
__map_bio(ti, clone, tio);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ce88755..961c995 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -194,7 +194,8 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
if (!mddev || !mddev->bio_set)
return bio_clone(bio, gfp_mask);
- b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
+ b = bio_alloc_bioset(gfp_mask,
+ bio_segments(bio),
mddev->bio_set);
if (!b)
return NULL;
diff --git a/fs/bio.c b/fs/bio.c
index b980ecd..a965b89 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -53,6 +53,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
* IO code that does not need private memory pools.
*/
struct bio_set *fs_bio_set;
+EXPORT_SYMBOL(fs_bio_set);
/*
* Our slab pool management
@@ -313,11 +314,6 @@ err_free:
}
EXPORT_SYMBOL(bio_alloc_bioset);
-static void bio_fs_destructor(struct bio *bio)
-{
- bio_free(bio, fs_bio_set);
-}
-
/**
* bio_alloc - allocate a new bio, memory pool backed
* @gfp_mask: allocation mask to use
@@ -341,8 +337,10 @@ struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
- if (bio)
- bio->bi_destructor = bio_fs_destructor;
+ if (bio) {
+ bio->bi_flags |= 1 << BIO_HAS_POOL;
+ bio->bi_destructor = (void *) fs_bio_set;
+ }
return bio;
}
@@ -434,18 +432,19 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
EXPORT_SYMBOL(bio_phys_segments);
/**
- * __bio_clone - clone a bio
- * @bio: destination bio
- * @bio_src: bio to clone
+ * __bio_clone - clone a bio
+ * @bio: destination bio
+ * @bio_src: bio to clone
*
* Clone a &bio. Caller will own the returned bio, but not
* the actual data it points to. Reference count of returned
- * bio will be one.
+ * bio will be one.
*/
void __bio_clone(struct bio *bio, struct bio *bio_src)
{
- memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
- bio_src->bi_max_vecs * sizeof(struct bio_vec));
+ memcpy(bio->bi_io_vec,
+ bio_iovec(bio_src),
+ bio_segments(bio_src) * sizeof(struct bio_vec));
/*
* most users will be overriding ->bi_bdev with a new target,
@@ -455,33 +454,34 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
bio->bi_bdev = bio_src->bi_bdev;
bio->bi_flags |= 1 << BIO_CLONED;
bio->bi_rw = bio_src->bi_rw;
- bio->bi_vcnt = bio_src->bi_vcnt;
+ bio->bi_vcnt = bio_segments(bio_src);
bio->bi_size = bio_src->bi_size;
- bio->bi_idx = bio_src->bi_idx;
}
EXPORT_SYMBOL(__bio_clone);
/**
- * bio_clone - clone a bio
+ * bio_clone_bioset - clone a bio
* @bio: bio to clone
* @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
*
- * Like __bio_clone, only also allocates the returned bio
+ * Like __bio_clone, only also allocates the returned bio
*/
-struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
+ struct bio_set *bs)
{
- struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
-
+ struct bio *b = bio_alloc_bioset(gfp_mask, bio_segments(bio), bs);
if (!b)
return NULL;
- b->bi_destructor = bio_fs_destructor;
__bio_clone(b, bio);
+ b->bi_flags |= 1 << BIO_HAS_POOL;
+ b->bi_destructor = (void *) bs;
if (bio_integrity(bio)) {
int ret;
- ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
+ ret = bio_integrity_clone(b, bio, gfp_mask, bs);
if (ret < 0) {
bio_put(b);
@@ -491,7 +491,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
return b;
}
-EXPORT_SYMBOL(bio_clone);
+EXPORT_SYMBOL(bio_clone_bioset);
/**
* bio_get_nr_vecs - return approx number of vecs
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 129a9c0..f549b54 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -222,7 +222,7 @@ struct request_queue;
extern int bio_phys_segments(struct request_queue *, struct bio *);
extern void __bio_clone(struct bio *, struct bio *);
-extern struct bio *bio_clone(struct bio *, gfp_t);
+extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
extern void bio_init(struct bio *);
@@ -297,6 +297,11 @@ struct biovec_slab {
extern struct bio_set *fs_bio_set;
+static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+ return bio_clone_bioset(bio, gfp_mask, fs_bio_set);
+}
+
/*
* a small number of entries is fine, not going to be performance critical.
* basically we just need to survive
--
1.7.9.rc2
* [Bcache v13 02/16] Bio pool freeing
From: Kent Overstreet @ 2012-05-10 3:08 UTC (permalink / raw)
To: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA
Cc: tejun-hpIqsD4AKlfQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA
When you allocate a bio from a bio pool, you have to know where it came
from in order to free it; this adds a flag which, if set, means
bi_destructor is actually a pointer to the pool the bio was allocated
from, so bio_put() can do the right thing.
This is used in bcache, so we can cleanly use per-device bio pools.
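Roughly, usage looks like this (illustrative sketch only, the helper name is
made up): a driver that owns a bio_set can hand out bios that a plain
bio_put() knows how to free, with bi_destructor doubling as the pool pointer
instead of being a function:

#include <linux/bio.h>

static struct bio *alloc_from_my_pool(struct bio_set *bs, gfp_t gfp,
				      unsigned int nr_vecs)
{
	struct bio *bio = bio_alloc_bioset(gfp, nr_vecs, bs);

	if (bio) {
		bio->bi_flags |= 1 << BIO_HAS_POOL;
		/* not a destructor function - bio_put() sees the flag and
		 * treats this as the bio_set to free back into */
		bio->bi_destructor = (void *) bs;
	}

	return bio;
}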
Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
fs/bio.c | 9 +++++++--
include/linux/blk_types.h | 2 ++
2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/fs/bio.c b/fs/bio.c
index a965b89..6a967fc 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -235,7 +235,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
{
void *p;
- if (bio_has_allocated_vec(bio))
+ if (bio_flagged(bio, BIO_HAS_VEC))
bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
if (bio_integrity(bio))
@@ -301,6 +301,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
goto err_free;
nr_iovecs = bvec_nr_vecs(idx);
+ bio->bi_flags |= 1 << BIO_HAS_VEC;
}
out_set:
bio->bi_flags |= idx << BIO_POOL_OFFSET;
@@ -417,7 +418,11 @@ void bio_put(struct bio *bio)
*/
if (atomic_dec_and_test(&bio->bi_cnt)) {
bio->bi_next = NULL;
- bio->bi_destructor(bio);
+
+ if (bio_flagged(bio, BIO_HAS_POOL))
+ bio_free(bio, (void *) bio->bi_destructor);
+ else
+ bio->bi_destructor(bio);
}
}
EXPORT_SYMBOL(bio_put);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4053cbd..a0be8b3 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -95,6 +95,8 @@ struct bio {
#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */
#define BIO_QUIET 10 /* Make BIO Quiet */
#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
+#define BIO_HAS_POOL 12 /* bi_destructor points to bio_pool */
+#define BIO_HAS_VEC 13 /* bio_free() should free bvec */
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
/*
--
1.7.9.rc2
* [Bcache v13 03/16] Revert "rw_semaphore: remove up/down_read_non_owner"
From: Kent Overstreet @ 2012-05-10 3:08 UTC (permalink / raw)
To: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA
Cc: tejun-hpIqsD4AKlfQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA
This reverts commit 11b80f459adaf91a712f95e7734a17655a36bf30.
Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
include/linux/rwsem.h | 10 ++++++++++
kernel/rwsem.c | 16 ++++++++++++++++
2 files changed, 26 insertions(+), 0 deletions(-)
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 63d4065..5f0dc75 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -126,9 +126,19 @@ extern void downgrade_write(struct rw_semaphore *sem);
*/
extern void down_read_nested(struct rw_semaphore *sem, int subclass);
extern void down_write_nested(struct rw_semaphore *sem, int subclass);
+/*
+ * Take/release a lock when not the owner will release it.
+ *
+ * [ This API should be avoided as much as possible - the
+ * proper abstraction for this case is completions. ]
+ */
+extern void down_read_non_owner(struct rw_semaphore *sem);
+extern void up_read_non_owner(struct rw_semaphore *sem);
#else
# define down_read_nested(sem, subclass) down_read(sem)
# define down_write_nested(sem, subclass) down_write(sem)
+# define down_read_non_owner(sem) down_read(sem)
+# define up_read_non_owner(sem) up_read(sem)
#endif
#endif /* _LINUX_RWSEM_H */
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b152f74..66de558 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -117,6 +117,15 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_read_nested);
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
@@ -127,6 +136,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_write_nested);
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
#endif
--
1.7.9.rc2
* [Bcache v13 04/16] Fix ratelimit macro to compile in c99 mode
From: Kent Overstreet @ 2012-05-10 3:09 UTC (permalink / raw)
To: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA
Cc: tejun-hpIqsD4AKlfQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA
Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
include/linux/ratelimit.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index e11ccb4..9ad57d3 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -20,7 +20,7 @@ struct ratelimit_state {
#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) \
\
struct ratelimit_state name = { \
- .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
+ .lock = __RAW_SPIN_LOCK_INITIALIZER(name.lock),\
.interval = interval_init, \
.burst = burst_init, \
}
--
1.7.9.rc2
* [Bcache v13 06/16] Export blk_fill_rwbs()
From: Kent Overstreet @ 2012-05-10 3:09 UTC (permalink / raw)
To: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA
Cc: tejun-hpIqsD4AKlfQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA
Used by bcache's tracepoints.
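Roughly the sort of call bcache's tracepoint code makes (illustrative only -
the function name and buffer size here are assumptions, not taken from the
patch), which is why the symbol needs exporting for modular code:

#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include <linux/kernel.h>

static void trace_bio_example(struct bio *bio)
{
	char rwbs[8];

	/* render the bio's direction/flags the same way blktrace does */
	blk_fill_rwbs(rwbs, bio->bi_rw, bio->bi_size);
	pr_debug("bcache: submitting %s bio\n", rwbs);
}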
Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
kernel/trace/blktrace.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cdea7b5..e26a350 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1820,6 +1820,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i] = '\0';
}
+EXPORT_SYMBOL_GPL(blk_fill_rwbs);
#endif /* CONFIG_EVENT_TRACING */
--
1.7.9.rc2
* [Bcache v13 16/16] bcache: Debug and tracing code
From: Kent Overstreet @ 2012-05-10 3:11 UTC (permalink / raw)
To: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA
Cc: tejun-hpIqsD4AKlfQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA
Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
drivers/block/bcache/debug.c | 574 ++++++++++++++++++++++++++++++++++++++++++
drivers/block/bcache/debug.h | 53 ++++
drivers/block/bcache/trace.c | 26 ++
3 files changed, 653 insertions(+), 0 deletions(-)
create mode 100644 drivers/block/bcache/debug.c
create mode 100644 drivers/block/bcache/debug.h
create mode 100644 drivers/block/bcache/trace.c
diff --git a/drivers/block/bcache/debug.c b/drivers/block/bcache/debug.c
new file mode 100644
index 0000000..a9a8369
--- /dev/null
+++ b/drivers/block/bcache/debug.c
@@ -0,0 +1,574 @@
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *debug;
+
+/* Various debug code */
+
+const char *ptr_status(struct cache_set *c, const struct bkey *k)
+{
+ for (unsigned i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i)) {
+ struct cache *ca = PTR_CACHE(c, k, i);
+ size_t bucket = PTR_BUCKET_NR(c, k, i);
+ size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+ if (KEY_SIZE(k) + r > c->sb.bucket_size)
+ return "bad, length too big";
+ if (bucket < ca->sb.first_bucket)
+ return "bad, short offset";
+ if (bucket >= ca->sb.nbuckets)
+ return "bad, offset past end of device";
+ if (ptr_stale(c, k, i))
+ return "stale";
+ }
+
+ if (!bkey_cmp(k, &ZERO_KEY))
+ return "bad, null key";
+ if (!KEY_PTRS(k))
+ return "bad, no pointers";
+ if (!KEY_SIZE(k))
+ return "zeroed key";
+ return "";
+}
+
+static bool skipped_backwards(struct btree *b, struct bkey *k)
+{
+ return bkey_cmp(k, (!b->level) ? &START_KEY(next(k)) : next(k)) > 0;
+}
+
+static void dump_bset(struct btree *b, struct bset *i)
+{
+ for (struct bkey *k = i->start; k < end(i); k = next(k)) {
+ printk(KERN_ERR "block %zu key %zu/%i: %s", index(i, b),
+ (uint64_t *) k - i->d, i->keys, pkey(k));
+
+ for (unsigned j = 0; j < KEY_PTRS(k); j++) {
+ size_t n = PTR_BUCKET_NR(b->c, k, j);
+ printk(" bucket %zu", n);
+
+ if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
+ printk(" prio %i",
+ PTR_BUCKET(b->c, k, j)->prio);
+ }
+
+ printk(" %s\n", ptr_status(b->c, k));
+
+ if (next(k) < end(i) &&
+ skipped_backwards(b, k))
+ printk(KERN_ERR "Key skipped backwards\n");
+ }
+}
+
+static void vdump_bucket_and_panic(struct btree *b, const char *m, va_list args)
+{
+ struct bset *i;
+
+ console_lock();
+
+ for_each_sorted_set(b, i)
+ dump_bset(b, i);
+
+ vprintk(m, args);
+
+ console_unlock();
+
+ panic("at %s\n", pbtree(b));
+}
+
+static void dump_bucket_and_panic(struct btree *b, const char *m, ...)
+{
+ va_list args;
+ va_start(args, m);
+ vdump_bucket_and_panic(b, m, args);
+ va_end(args);
+}
+
+static void __maybe_unused
+dump_key_and_panic(struct btree *b, struct bset *i, int j)
+{
+ long bucket = PTR_BUCKET_NR(b->c, node(i, j), 0);
+ long r = PTR_OFFSET(node(i, j), 0) & ~(~0 << b->c->bucket_bits);
+
+ printk(KERN_ERR "level %i block %zu key %i/%i: %s "
+ "bucket %llu offset %li into bucket\n",
+ b->level, index(i, b), j, i->keys, pkey(node(i, j)),
+ (uint64_t) bucket, r);
+ dump_bucket_and_panic(b, "");
+}
+
+struct keyprint_hack bcache_pkey(const struct bkey *k)
+{
+ unsigned i = 0;
+ struct keyprint_hack r;
+ char *out = r.s, *end = r.s + KEYHACK_SIZE;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+ p("%llu:%llu len %llu -> [", KEY_DEV(k), k->key, KEY_SIZE(k));
+
+ if (KEY_PTRS(k))
+ while (1) {
+ p("%llu:%llu gen %llu",
+ PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
+
+ if (++i == KEY_PTRS(k))
+ break;
+
+ p(", ");
+ }
+
+ p("]");
+
+ if (KEY_DIRTY(k))
+ p(" dirty");
+ if (KEY_CSUM(k))
+ p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
+#undef p
+ return r;
+}
+
+struct keyprint_hack bcache_pbtree(const struct btree *b)
+{
+ struct keyprint_hack r;
+
+ snprintf(r.s, 40, "%li level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0),
+ b->level, b->c->root ? b->c->root->level : -1);
+ return r;
+}
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void btree_verify(struct btree *b, struct bset *new)
+{
+ struct btree *v = b->c->verify_data;
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ if (!b->c->verify)
+ return;
+
+ closure_wait_event(&b->io.wait, &cl,
+ atomic_read(&b->io.cl.remaining) == -1);
+
+ mutex_lock(&b->c->verify_lock);
+
+ bkey_copy(&v->key, &b->key);
+ v->written = 0;
+ v->level = b->level;
+
+ btree_read(v);
+ closure_wait_event(&v->io.wait, &cl,
+ atomic_read(&b->io.cl.remaining) == -1);
+
+ if (new->keys != v->sets[0].data->keys ||
+ memcmp(new->start,
+ v->sets[0].data->start,
+ (void *) end(new) - (void *) new->start)) {
+ struct bset *i;
+ unsigned j;
+
+ console_lock();
+
+ printk(KERN_ERR "*** original memory node:\n");
+ for_each_sorted_set(b, i)
+ dump_bset(b, i);
+
+ printk(KERN_ERR "*** sorted memory node:\n");
+ dump_bset(b, new);
+
+ printk(KERN_ERR "*** on disk node:\n");
+ dump_bset(v, v->sets[0].data);
+
+ for (j = 0; j < new->keys; j++)
+ if (new->d[j] != v->sets[0].data->d[j])
+ break;
+
+ console_unlock();
+ panic("verify failed at %u\n", j);
+ }
+
+ mutex_unlock(&b->c->verify_lock);
+}
+
+static void data_verify_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ closure_put(cl);
+}
+
+void data_verify(struct search *s)
+{
+ char name[BDEVNAME_SIZE];
+ struct cached_dev *dc = container_of(s->op.d, struct cached_dev, disk);
+ struct closure *cl = &s->cl;
+ struct bio *check;
+ struct bio_vec *bv;
+ int i;
+
+ if (!s->unaligned_bvec)
+ bio_for_each_segment(bv, s->orig_bio, i)
+ bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
+
+ check = bio_clone(s->orig_bio, GFP_NOIO);
+ if (!check)
+ return;
+
+ if (bio_alloc_pages(check, GFP_NOIO))
+ goto out_put;
+
+ check->bi_rw = READ_SYNC;
+ check->bi_private = cl;
+ check->bi_end_io = data_verify_endio;
+
+ closure_bio_submit(check, cl, s->op.d->c->bio_split);
+ closure_sync(cl);
+
+ bio_for_each_segment(bv, s->orig_bio, i) {
+ void *p1 = kmap(bv->bv_page);
+ void *p2 = kmap(check->bi_io_vec[i].bv_page);
+
+ if (memcmp(p1 + bv->bv_offset,
+ p2 + bv->bv_offset,
+ bv->bv_len))
+ printk(KERN_ERR "bcache (%s): verify failed"
+ " at sector %llu\n",
+ bdevname(dc->bdev, name),
+ (uint64_t) s->orig_bio->bi_sector);
+
+ kunmap(bv->bv_page);
+ kunmap(check->bi_io_vec[i].bv_page);
+ }
+
+ __bio_for_each_segment(bv, check, i, 0)
+ __free_page(bv->bv_page);
+out_put:
+ bio_put(check);
+}
+
+#endif
+
+#ifdef CONFIG_BCACHE_EDEBUG
+
+unsigned count_data(struct btree *b)
+{
+ unsigned ret = 0;
+ struct bkey *k;
+
+ if (!b->level)
+ for_each_key(b, k)
+ ret += KEY_SIZE(k);
+ return ret;
+}
+
+void check_key_order_msg(struct btree *b, struct bset *i, const char *m, ...)
+{
+ if (!i->keys)
+ return;
+
+ for (struct bkey *k = i->start; next(k) < end(i); k = next(k))
+ if (skipped_backwards(b, k)) {
+ va_list args;
+ va_start(args, m);
+
+ vdump_bucket_and_panic(b, m, args);
+ va_end(args);
+ }
+}
+
+void check_keys(struct btree *b, const char *m, ...)
+{
+ va_list args;
+ struct bkey *k, *p;
+ struct btree_iter iter;
+
+ if (b->level)
+ return;
+
+ btree_iter_init(b, &iter, NULL);
+
+ do
+ p = btree_iter_next(&iter);
+ while (p && ptr_invalid(b, p));
+
+ while ((k = btree_iter_next(&iter))) {
+ if (bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) {
+ printk(KERN_ERR "Keys out of order:\n");
+ goto bug;
+ }
+
+ if (ptr_invalid(b, k))
+ continue;
+
+ if (bkey_cmp(p, &START_KEY(k)) > 0) {
+ printk(KERN_ERR "Overlapping keys:\n");
+ goto bug;
+ }
+ p = k;
+ }
+ return;
+bug:
+ va_start(args, m);
+ vdump_bucket_and_panic(b, m, args);
+ va_end(args);
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+
+static int btree_dump(struct btree *b, struct btree_op *op, struct seq_file *f,
+ const char *tabs, uint64_t *prev, uint64_t *sectors)
+{
+ struct bkey *k;
+ char buf[30];
+ uint64_t last, biggest = 0;
+
+ for_each_key(b, k) {
+ int j = (uint64_t *) k - _t->data->d;
+ if (!j)
+ last = *prev;
+
+ if (last > k->key)
+ seq_printf(f, "Key skipped backwards\n");
+
+ if (!b->level && j &&
+ last != KEY_START(k))
+ seq_printf(f, "<hole>\n");
+ else if (b->level && !ptr_bad(b, k))
+ btree(dump, k, b, op, f, tabs - 1, &last, sectors);
+
+ seq_printf(f, "%s%zi %4i: %s %s\n",
+ tabs, _t - b->sets, j, pkey(k), buf);
+
+ if (!b->level && !buf[0])
+ *sectors += KEY_SIZE(k);
+
+ last = k->key;
+ biggest = max(biggest, last);
+ }
+ *prev = biggest;
+
+ return 0;
+}
+
+static int debug_seq_show(struct seq_file *f, void *data)
+{
+ static const char *tabs = "\t\t\t\t\t";
+ uint64_t last = 0, sectors = 0;
+ struct cache *ca = f->private;
+ struct cache_set *c = ca->set;
+
+ struct btree_op op;
+ btree_op_init_stack(&op);
+
+ btree_root(dump, c, &op, f, &tabs[4], &last, &sectors);
+
+ seq_printf(f, "%s\n" "%llu Mb found\n",
+ pkey(&c->root->key), sectors / 2048);
+
+ closure_sync(&op.cl);
+ return 0;
+}
+
+static int debug_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, debug_seq_show, inode->i_private);
+}
+
+static const struct file_operations cache_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = debug_seq_open,
+ .read = seq_read,
+ .release = single_release
+};
+
+void bcache_debug_init_cache(struct cache *c)
+{
+ if (!IS_ERR_OR_NULL(debug)) {
+ char b[BDEVNAME_SIZE];
+ bdevname(c->bdev, b);
+
+ c->debug = debugfs_create_file(b, 0400, debug, c,
+ &cache_debug_ops);
+ }
+}
+
+#endif
+
+#ifdef CONFIG_BCACHE_DEBUG
+static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
+ const char *buffer, size_t size)
+{
+ void dump(struct btree *b)
+ {
+ for (struct bset *i = b->sets[0].data;
+ index(i, b) < btree_blocks(b) &&
+ i->seq == b->sets[0].data->seq;
+ i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
+ dump_bset(b, i);
+ }
+
+ struct cache_sb *sb;
+ struct cache_set *c;
+ struct btree *all[3], *b, *fill, *orig;
+
+ struct btree_op op;
+ btree_op_init_stack(&op);
+
+ sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
+ if (!sb)
+ return -ENOMEM;
+
+ sb->bucket_size = 128;
+ sb->block_size = 4;
+
+ c = cache_set_alloc(sb);
+ if (!c)
+ return -ENOMEM;
+
+ for (int i = 0; i < 3; i++) {
+ BUG_ON(list_empty(&c->btree_cache));
+ all[i] = list_first_entry(&c->btree_cache, struct btree, list);
+ list_del_init(&all[i]->list);
+
+ all[i]->key = KEY(0, 0, c->sb.bucket_size);
+ bkey_copy_key(&all[i]->key, &MAX_KEY);
+ }
+
+ b = all[0];
+ fill = all[1];
+ orig = all[2];
+
+ while (1) {
+ for (int i = 0; i < 3; i++)
+ all[i]->written = all[i]->nsets = 0;
+
+ bset_init_next(b);
+
+ while (1) {
+ struct bset *i = write_block(b);
+ struct bkey *k = op.keys.top;
+
+ k->key = get_random_int();
+
+ op.type = k->key & 1
+ ? BTREE_INSERT
+ : BTREE_REPLACE;
+ k->key >>= 1;
+
+ k->header = KEY_HEADER(bucket_remainder(c, k->key), 0);
+ k->key >>= c->bucket_bits;
+ k->key &= 1024 * 512 - 1;
+ k->key += c->sb.bucket_size;
+#if 0
+ SET_KEY_PTRS(k, 1);
+#endif
+ keylist_push(&op.keys);
+ bcache_btree_insert_keys(b, &op);
+
+ if (should_split(b) ||
+ set_blocks(i, b->c) !=
+ __set_blocks(i, i->keys + 15, b->c)) {
+ i->csum = csum_set(i);
+
+ memcpy(write_block(fill),
+ i, set_bytes(i));
+
+ b->written += set_blocks(i, b->c);
+ fill->written = b->written;
+ if (b->written == btree_blocks(b))
+ break;
+
+ btree_sort_lazy(b);
+ bset_init_next(b);
+ }
+ }
+
+ memcpy(orig->sets[0].data,
+ fill->sets[0].data,
+ btree_bytes(c));
+
+ btree_sort(b);
+ fill->written = 0;
+ btree_read_done(&fill->io.cl);
+
+ if (b->sets[0].data->keys != fill->sets[0].data->keys ||
+ memcmp(b->sets[0].data->start,
+ fill->sets[0].data->start,
+ b->sets[0].data->keys * sizeof(uint64_t))) {
+ struct bset *i = b->sets[0].data;
+
+ for (struct bkey *k = i->start,
+ *j = fill->sets[0].data->start;
+ k < end(i);
+ k = next(k), j = next(j))
+ if (bkey_cmp(k, j) ||
+ KEY_SIZE(k) != KEY_SIZE(j))
+ printk(KERN_ERR "key %zi differs: %s "
+ "!= %s\n", (uint64_t *) k - i->d,
+ pkey(k), pkey(j));
+
+ for (int i = 0; i < 3; i++) {
+ printk(KERN_ERR "**** Set %i ****\n", i);
+ dump(all[i]);
+ }
+ panic("\n");
+ }
+
+ printk(KERN_DEBUG "bcache: fuzz complete: %i keys\n",
+ b->sets[0].data->keys);
+ }
+}
+
+kobj_attribute_write(fuzz, btree_fuzz);
+#endif
+
+#ifdef CONFIG_BCACHE_LATENCY_DEBUG
+static ssize_t show(struct kobject *k, struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%i\n", latency_warn_ms);
+}
+
+static ssize_t store(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+{
+ return strtoul_safe(buffer, latency_warn_ms) ?: (ssize_t) size;
+}
+
+kobj_attribute_rw(latency_warn_ms, show, store);
+#endif
+
+void bcache_debug_exit(void)
+{
+ if (!IS_ERR_OR_NULL(debug))
+ debugfs_remove_recursive(debug);
+}
+
+int __init bcache_debug_init(struct kobject *kobj)
+{
+ int ret = 0;
+#ifdef CONFIG_BCACHE_DEBUG
+ ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
+ if (ret)
+ return ret;
+#endif
+
+#ifdef CONFIG_BCACHE_LATENCY_DEBUG
+ ret = sysfs_create_file(kobj, &ksysfs_latency_warn_ms.attr);
+ if (ret)
+ return ret;
+#endif
+
+ debug = debugfs_create_dir("bcache", NULL);
+ return ret;
+}
diff --git a/drivers/block/bcache/debug.h b/drivers/block/bcache/debug.h
new file mode 100644
index 0000000..9f91a2a
--- /dev/null
+++ b/drivers/block/bcache/debug.h
@@ -0,0 +1,53 @@
+#ifndef _BCACHE_DEBUG_H
+#define _BCACHE_DEBUG_H
+
+/* Btree/bkey debug printing */
+
+#define KEYHACK_SIZE 80
+struct keyprint_hack {
+ char s[KEYHACK_SIZE];
+};
+
+struct keyprint_hack bcache_pkey(const struct bkey *k);
+struct keyprint_hack bcache_pbtree(const struct btree *b);
+#define pkey(k) (bcache_pkey(k).s)
+#define pbtree(b) (bcache_pbtree(b).s)
+
+#ifdef CONFIG_BCACHE_EDEBUG
+
+unsigned count_data(struct btree *);
+void check_key_order_msg(struct btree *, struct bset *, const char *, ...);
+void check_keys(struct btree *, const char *, ...);
+
+#define check_key_order(b, i) check_key_order_msg(b, i, "keys out of order")
+#define EBUG_ON(cond) BUG_ON(cond)
+
+#else /* EDEBUG */
+
+#define count_data(b) 0
+#define check_key_order(b, i) do {} while (0)
+#define check_key_order_msg(b, i, ...) do {} while (0)
+#define check_keys(b, ...) do {} while (0)
+#define EBUG_ON(cond) do {} while (0)
+
+#endif
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void btree_verify(struct btree *, struct bset *);
+void data_verify(struct search *);
+
+#else /* DEBUG */
+
+static inline void btree_verify(struct btree *b, struct bset *i) {}
+static inline void data_verify(struct search *s) {};
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+void bcache_debug_init_cache(struct cache *);
+#else
+static inline void bcache_debug_init_cache(struct cache *c) {}
+#endif
+
+#endif
diff --git a/drivers/block/bcache/trace.c b/drivers/block/bcache/trace.c
new file mode 100644
index 0000000..983f9bb
--- /dev/null
+++ b/drivers/block/bcache/trace.c
@@ -0,0 +1,26 @@
+#include "bcache.h"
+#include "btree.h"
+#include "request.h"
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bcache.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
--
1.7.9.rc2
* Re: [Bcache v13 00/16]
From: Dan Williams @ 2012-05-10 18:34 UTC (permalink / raw)
To: Kent Overstreet
Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tejun-hpIqsD4AKlfQT0dZR+AlfA,
Alasdair G Kergon, linux-raid
On Wed, May 9, 2012 at 8:07 PM, Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> bcache: a cache for arbitrary block devices using an SSD.
>
> Short overview:
> Bcache does both writethrough and writeback caching. It presents itself as a
> new block device, a bit like say md. You can cache an arbitrary number of
> block devices with a single cache device, and attach and detach things at
> runtime - it's quite flexible.
>
> It's very fast. It uses a b+ tree for the index, along with a journal to
> coalesce index updates, and a bunch of other cool tricks like auxiliary binary
> search trees with software floating point keys for searching within btree
> nodes.
>
> Bcache is solid, production ready code. There are still bugs being found that
> affect specific configurations, but there haven't been any major issues found
> in awhile - it's well past time I started working on getting it into mainline.
>
> It's a lot of code - I tried to split it out so that it'd make some sort of
> sense for reviewing. Let me know if there's anything else I can do to make
> review easier.
>
> TODO/known issues:
>
> __up_write() needs to be exported for bcache to compile as a module - it's
> used for bypassing lockdep when traversing the btree during garbage
> collection. If someone else knows a better solution, please let me know.
>
> The userspace interface is going to change before it goes in. The general
> consensus at LSF was that we don't want yet another interface for
> probing/managing block devices, and dm exists so we may as well use that. I
> don't think anyone's started on that yet, though.
Might as well mention I've started work on an md conversion. Nearing
the point of having RFC patches ready to post.
I'm doing it in such a way that it does not preclude a dm target from
coming along as well. Like I mentioned to Alasdair at LSF, there's no
reason this could not be treated the same as the shared raid
personalities between dm and md.
--
Dan
* Re: [Bcache v13 00/16]
From: Arnd Bergmann @ 2012-05-18 10:06 UTC (permalink / raw)
To: Kent Overstreet
Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tejun-hpIqsD4AKlfQT0dZR+AlfA,
agk-H+wXaHxf7aLQT0dZR+AlfA
On Thursday 10 May 2012, Kent Overstreet wrote:
> TODO/known issues:
>
> __up_write() needs to be exported for bcache to compile as a module - it's
> used for bypassing lockdep when traversing the btree during garbage
> collection. If someone else knows a better solution, please let me know.
>
> The userspace interface is going to change before it goes in. The general
> consensus at LSF was that we don't want yet another interface for
> probing/managing block devices, and dm exists so we may as well use that. I
> don't think anyone's started on that yet, though.
>
> Documentation needs to be updated. That's being actively worked on, though.
Hi Kent,
Sorry for jumping in late in the discussion. I believe we discussed the
requirements for the low-end media when you posted v12 and it seemed that
there are issues with a lot of the low-end media you were planning to
support. I have seen devices with 12MB or 16MB erase block size, which you
didn't handle well, and many devices are severely limited on the number of
buckets (erase blocks that a device can write to concurrently), typically
3 to 8 for an SD card and slightly more for a USB drive.
Are you still planning to support those devices or are you focusing now
on other hardware? If you plan to support them, what are you current
limits on the bucket size and the number of buckets?
FWIW, Tixy has written a tool named flashsim[1] to simulate the behavior
of all kinds of flash drives, so you can use a blocktrace file as
input and it will tell you the write amplification factor that a given
drive would suffer given that workload. You can use it to find out how
your algorithms would interact with devices that can only support a
smaller number of buckets than you would actually want.
Arnd
[1] http://yxit.co.uk/public/flash-performance/
* Re: [Bcache v13 00/16]
From: Tejun Heo @ 2012-05-30 8:29 UTC (permalink / raw)
To: Kent Overstreet
Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA
Hello, Kent.
Here are my additional impressions after trying to read a bit more
from make_request.
* Single char variable names suck. It's difficult to tell what type
they are, what they're used for and makes it difficult to track
where the variable is used in the function - try to highlight the
variable name in the editor.
While not a strict rule, there are reasons why people tend to use
single char variable names mostly for specific things - e.g. loop
variables, transient integral variables or in numerical context.
They're either localized to small scope of code so keeping track of
them is easy and/or people are familiar with such usages.
So, I would *much* prefer if I don't have to keep trying to track
what the hell c, d, k, j, l mean and where they're used.
* Due to the various style issues, lack of documentation and other
abundant idiosyncrasies in the code, at least I find the code almost
aggravating. It's complicated, difficult to read and full of
unnecessary differences and smart tricks (smart is not a positive
word here). I don't think we want code like this in the kernel.
Hell, I would get pretty upset if I encounter this type of code
while trying to update some block API.
* Maybe I haven't seen enough of it but my feeling about closure
hasn't gone up. It likely has gone further down. It doesn't
actually seem to solve the pain points of async programming while
adding ample headaches. The usages that I followed could be easily
served by either domain-specific async sequencer or the existing ref
/ async / whatever mechanism / convention. If you have good example
usage in bcache, please feel free to explain it.
So, I don't know. If this is a driver for some super obscure device
that would fall out of use in some years and doesn't interact with /
affect the rest of the kernel, maybe we can put it in with big giant
blinking red warnings about the dragons inside, but as it currently
stands I don't think I can ack the code base and am afraid that it
would need non-trivial updates to be upstreamable.
I'm gonna stop reading and, for now
NACKED-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Thanks.
--
tejun
* Re: [Bcache v13 00/16]
From: Zhi Yong Wu @ 2012-05-30 8:54 UTC (permalink / raw)
To: Kent Overstreet
Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tejun-hpIqsD4AKlfQT0dZR+AlfA,
agk-H+wXaHxf7aLQT0dZR+AlfA
On Thu, May 10, 2012 at 11:07 AM, Kent Overstreet
<koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> bcache: a cache for arbitrary block devices using an SSD.
>
> Short overview:
> Bcache does both writethrough and writeback caching. It presents itself as a
> new block device, a bit like say md. You can cache an arbitrary number of
> block devices with a single cache device, and attach and detach things at
> runtime - it's quite flexible.
>
> It's very fast. It uses a b+ tree for the index, along with a journal to
> coalesce index updates, and a bunch of other cool tricks like auxiliary binary
> search trees with software floating point keys for searching within btree
> nodes.
>
> Bcache is solid, production ready code. There are still bugs being found that
> affect specific configurations, but there haven't been any major issues found
> in awhile - it's well past time I started working on getting it into mainline.
>
> It's a lot of code - I tried to split it out so that it'd make some sort of
> sense for reviewing. Let me know if there's anything else I can do to make
This feature is very interesting. Do you have a clean public git tree
for this patchset, so that I can pull it and play with it?
> review easier.
>
> TODO/known issues:
>
> __up_write() needs to be exported for bcache to compile as a module - it's
> used for bypassing lockdep when traversing the btree during garbage
> collection. If someone else knows a better solution, please let me know.
>
> The userspace interface is going to change before it goes in. The general
> consensus at LSF was that we don't want yet another interface for
> probing/managing block devices, and dm exists so we may as well use that. I
> don't think anyone's started on that yet, though.
>
> Documentation needs to be updated. That's being actively worked on, though.
>
> Kent Overstreet (16):
> Only clone bio vecs that are in use
> Bio pool freeing
> Revert "rw_semaphore: remove up/down_read_non_owner"
> Fix ratelimit macro to compile in c99 mode
> Export get_random_int()
> Export blk_fill_rwbs()
> Closures
> bcache: Documentation, and changes to generic code
> Bcache: generic utility code
> bcache: Superblock/initialization/sysfs code
> bcache: Core btree code
> bcache: Bset code (lookups within a btree node)
> bcache: Journalling
> bcache: Request, io and allocation code
> bcache: Writeback
> bcache: Debug and tracing code
>
> Documentation/ABI/testing/sysfs-block-bcache | 156 ++
> Documentation/bcache.txt | 255 +++
> block/blk-core.c | 2 +-
> drivers/block/Kconfig | 2 +
> drivers/block/Makefile | 1 +
> drivers/block/bcache/Kconfig | 42 +
> drivers/block/bcache/Makefile | 8 +
> drivers/block/bcache/alloc.c | 591 +++++++
> drivers/block/bcache/bcache.h | 839 ++++++++++
> drivers/block/bcache/bset.c | 1149 +++++++++++++
> drivers/block/bcache/bset.h | 218 +++
> drivers/block/bcache/btree.c | 2249 ++++++++++++++++++++++++++
> drivers/block/bcache/btree.h | 272 ++++
> drivers/block/bcache/debug.c | 574 +++++++
> drivers/block/bcache/debug.h | 53 +
> drivers/block/bcache/io.c | 198 +++
> drivers/block/bcache/journal.c | 722 +++++++++
> drivers/block/bcache/journal.h | 113 ++
> drivers/block/bcache/request.c | 1470 +++++++++++++++++
> drivers/block/bcache/request.h | 58 +
> drivers/block/bcache/stats.c | 243 +++
> drivers/block/bcache/stats.h | 58 +
> drivers/block/bcache/super.c | 2000 +++++++++++++++++++++++
> drivers/block/bcache/sysfs.c | 802 +++++++++
> drivers/block/bcache/sysfs.h | 99 ++
> drivers/block/bcache/trace.c | 26 +
> drivers/block/bcache/util.c | 572 +++++++
> drivers/block/bcache/util.h | 657 ++++++++
> drivers/block/bcache/writeback.c | 518 ++++++
> drivers/block/rbd.c | 2 +-
> drivers/char/random.c | 1 +
> drivers/md/dm.c | 27 +-
> drivers/md/md.c | 3 +-
> fs/bio.c | 55 +-
> include/linux/bio.h | 7 +-
> include/linux/blk_types.h | 2 +
> include/linux/cgroup_subsys.h | 6 +
> include/linux/closure.h | 614 +++++++
> include/linux/ratelimit.h | 2 +-
> include/linux/rwsem.h | 10 +
> include/linux/sched.h | 4 +
> include/trace/events/bcache.h | 257 +++
> kernel/fork.c | 4 +
> kernel/rwsem.c | 16 +
> kernel/trace/blktrace.c | 1 +
> lib/Kconfig | 3 +
> lib/Kconfig.debug | 9 +
> lib/Makefile | 2 +
> lib/closure.c | 363 +++++
> 49 files changed, 15288 insertions(+), 47 deletions(-)
> create mode 100644 Documentation/ABI/testing/sysfs-block-bcache
> create mode 100644 Documentation/bcache.txt
> create mode 100644 drivers/block/bcache/Kconfig
> create mode 100644 drivers/block/bcache/Makefile
> create mode 100644 drivers/block/bcache/alloc.c
> create mode 100644 drivers/block/bcache/bcache.h
> create mode 100644 drivers/block/bcache/bset.c
> create mode 100644 drivers/block/bcache/bset.h
> create mode 100644 drivers/block/bcache/btree.c
> create mode 100644 drivers/block/bcache/btree.h
> create mode 100644 drivers/block/bcache/debug.c
> create mode 100644 drivers/block/bcache/debug.h
> create mode 100644 drivers/block/bcache/io.c
> create mode 100644 drivers/block/bcache/journal.c
> create mode 100644 drivers/block/bcache/journal.h
> create mode 100644 drivers/block/bcache/request.c
> create mode 100644 drivers/block/bcache/request.h
> create mode 100644 drivers/block/bcache/stats.c
> create mode 100644 drivers/block/bcache/stats.h
> create mode 100644 drivers/block/bcache/super.c
> create mode 100644 drivers/block/bcache/sysfs.c
> create mode 100644 drivers/block/bcache/sysfs.h
> create mode 100644 drivers/block/bcache/trace.c
> create mode 100644 drivers/block/bcache/util.c
> create mode 100644 drivers/block/bcache/util.h
> create mode 100644 drivers/block/bcache/writeback.c
> create mode 100644 include/linux/closure.h
> create mode 100644 include/trace/events/bcache.h
> create mode 100644 lib/closure.c
>
> --
> 1.7.9.rc2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Regards,
Zhi Yong Wu
* [Bcache v13 07/16] Closures
From: Kent Overstreet @ 2012-05-10 3:09 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
Closures are asynchronous refcounty things based on workqueues, used
extensively in bcache.
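A minimal usage sketch, based only on the API documented in the patch below
(the function names here are made up for illustration): wait on two bios with
a single closure, then continue asynchronously out of a workqueue once both
have completed.

#include <linux/bio.h>
#include <linux/closure.h>
#include <linux/fs.h>		/* submit_bio(), READ */
#include <linux/workqueue.h>

static void two_bios_endio(struct bio *bio, int error)
{
	/* drop the ref taken before submitting this bio */
	closure_put(bio->bi_private);
}

static void two_bios_done(struct closure *cl)
{
	/* runs out of system_wq once both bios have completed */
	continue_at(cl, NULL, NULL);	/* "return" up the closure stack */
}

static void submit_two(struct closure *parent, struct closure *cl,
		       struct bio *a, struct bio *b)
{
	closure_init(cl, parent);	/* refcount starts at 1, owned by us */

	closure_get(cl);
	a->bi_private = cl;
	a->bi_end_io = two_bios_endio;
	submit_bio(READ, a);

	closure_get(cl);
	b->bi_private = cl;
	b->bi_end_io = two_bios_endio;
	submit_bio(READ, b);

	/* releases our running ref; two_bios_done() runs once both complete */
	continue_at(cl, two_bios_done, system_wq);
}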
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
include/linux/closure.h | 614 +++++++++++++++++++++++++++++++++++++++++++++++
lib/Kconfig | 3 +
lib/Kconfig.debug | 9 +
lib/Makefile | 2 +
lib/closure.c | 363 ++++++++++++++++++++++++++++
5 files changed, 991 insertions(+), 0 deletions(-)
create mode 100644 include/linux/closure.h
create mode 100644 lib/closure.c
diff --git a/include/linux/closure.h b/include/linux/closure.h
new file mode 100644
index 0000000..c55116e
--- /dev/null
+++ b/include/linux/closure.h
@@ -0,0 +1,614 @@
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ * continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, is a macro that returns the calling function.
+ * There's good reason for this.
+ *
+ * To safely use closures asynchronously, they must always have a refcount while
+ * they are running owned by the thread that is running them. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio, int error)
+ * {
+ * closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_end_io = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_end_io = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * For a closure to wait on an arbitrary event, we need to introduce waitlists:
+ *
+ * closure_list_t list;
+ * closure_wait_event(list, cl, condition);
+ * closure_wake_up(wait_list);
+ *
+ * These work analogously to wait_event() and wake_up() - except that instead of
+ * operating on the current thread (for wait_event()) and lists of threads, they
+ * operate on an explicit closure and lists of closures.
+ *
+ * Because it's a closure we can now wait either synchronously or
+ * asynchronously. closure_wait_event() returns the current value of the
+ * condition, and if it returned false continue_at() or closure_sync() can be
+ * used to wait for it to become true.
+ *
+ * It's useful for waiting on things when you can't sleep in the context in
+ * which you must check the condition (perhaps a spinlock held, or you might be
+ * beneath generic_make_request() - in which case you can't sleep on IO).
+ *
+ * closure_wait_event() will wait either synchronously or asynchronously,
+ * depending on whether the closure is in blocking mode or not. You can pick a
+ * mode explicitly with closure_wait_event_sync() and
+ * closure_wait_event_async(), which do just what you might expect.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ *
+ * Locking:
+ *
+ * Closures are based on work items but they can be thought of as more like
+ * threads - in that like threads and unlike work items they have a well
+ * defined lifetime; they are created (with closure_init()) and eventually
+ * complete after a continue_at(cl, NULL, NULL).
+ *
+ * Suppose you've got some larger structure with a closure embedded in it that's
+ * used for periodically doing garbage collection. You only want one garbage
+ * collection happening at a time, so the natural thing to do is protect it with
+ * a lock. However, it's difficult to use a lock protecting a closure correctly
+ * because the unlock should come after the last continue_at() (additionally, if
+ * you're using the closure asynchronously a mutex won't work since a mutex has
+ * to be unlocked by the same process that locked it).
+ *
+ * So to make it less error prone and more efficient, we also have the ability
+ * to use closures as locks:
+ *
+ * closure_init_unlocked();
+ * closure_trylock();
+ *
+ * That's all we need for trylock() - the last closure_put() implicitly unlocks
+ * it for you. But for closure_lock(), we also need a wait list:
+ *
+ * struct closure_with_waitlist frobnicator_cl;
+ *
+ * closure_init_unlocked(&frobnicator_cl);
+ * closure_lock(&frobnicator_cl);
+ *
+ * A closure_with_waitlist embeds a closure and a wait list - much like struct
+ * delayed_work embeds a work item and a timer_list. The important thing is, use
+ * it exactly like you would a regular closure and closure_put() will magically
+ * handle everything for you.
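+ *
+ * So, roughly (continuing the example above):
+ *
+ *	closure_lock(&frobnicator_cl, NULL);
+ *	... frobnicate ...
+ *	closure_return(&frobnicator_cl.cl);
+ *
+ * The final put unlocks the closure and wakes up anyone sleeping in
+ * closure_lock().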
+ *
+ * We've got closures that embed timers, too. They're called, appropriately
+ * enough:
+ * struct closure_with_timer;
+ *
+ * This gives you access to closure_sleep(). It takes a refcount for a specified
+ * number of jiffies - you could then call closure_sync() (for a slightly
+ * convoluted version of msleep()) or continue_at() - which gives you the same
+ * effect as using a delayed work item, except you can reuse the work_struct
+ * already embedded in struct closure.
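+ *
+ * For example, given a struct closure_with_timer *w (names illustrative), a
+ * delayed-work style pattern looks like:
+ *
+ *	closure_sleep(w, HZ / 10);
+ *	continue_at(&w->cl, flush_writes, my_wq);
+ *
+ * flush_writes() then runs out of my_wq roughly a tenth of a second later, when
+ * the timer drops the ref that closure_sleep() took.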
+ *
+ * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
+ * probably expect, if you happen to need the features of both. (You don't
+ * really want to know how all this is implemented, but if I've done my job
+ * right you shouldn't have to care).
+ */
+
+struct closure;
+typedef void (closure_fn) (struct closure *);
+
+typedef struct llist_head closure_list_t;
+
+struct closure {
+ union {
+ struct {
+ struct workqueue_struct *wq;
+ struct task_struct *task;
+ struct llist_node list;
+ closure_fn *fn;
+ };
+ struct work_struct work;
+ };
+
+ struct closure *parent;
+
+#define CLOSURE_REMAINING_MASK (~(~0 << 20))
+#define CLOSURE_GUARD_MASK \
+ ((1 << 20)|(1 << 22)|(1 << 24)|(1 << 26)|(1 << 28)|(1 << 30))
+
+ /*
+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+ * closure_init() and when closure_put() runs the next function), and
+ * must be cleared before remaining hits 0. Primarily to help guard
+ * against incorrect usage and accidentally transferring references.
+ * continue_at() and closure_return() clear it for you, if you're doing
+ * something unusual you can use closure_set_dead() which also helps
+ * annotate where references are being transferred.
+ *
+ * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
+ * waiting asynchronously
+ *
+ * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
+ * closure with this flag set
+ *
+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+ * the thread that owns the closure, and cleared by the thread that's
+ * waking up the closure.
+ *
+ * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
+ * - indicates that cl->task is valid and closure_put() may wake it up.
+ * Only set or cleared by the thread that owns the closure.
+ *
+ * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
+ * has an outstanding timer. Must be set by the thread that owns the
+ * closure, and cleared by the timer function when the timer goes off.
+ */
+
+#define CLOSURE_RUNNING (1 << 21)
+#define CLOSURE_BLOCKING (1 << 23)
+#define CLOSURE_STACK (1 << 25)
+#define CLOSURE_WAITING (1 << 27)
+#define CLOSURE_SLEEPING (1 << 29)
+#define CLOSURE_TIMER (1 << 31)
+ atomic_t remaining;
+
+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
+
+#define TYPE_closure 0U
+#define TYPE_closure_with_waitlist 1U
+#define TYPE_closure_with_timer 2U
+#define TYPE_closure_with_waitlist_and_timer 3U
+#define MAX_CLOSURE_TYPE 3U
+ unsigned type;
+
+#ifdef CONFIG_DEBUG_CLOSURES
+#define CLOSURE_MAGIC_DEAD 0xc1054e0dead
+#define CLOSURE_MAGIC_ALIVE 0xc1054e0a11e
+
+ unsigned long magic;
+ struct list_head all;
+ unsigned long ip;
+ unsigned long waiting_on;
+#endif
+};
+
+struct closure_with_waitlist {
+ struct closure cl;
+ closure_list_t wait;
+};
+
+struct closure_with_timer {
+ struct closure cl;
+ struct timer_list timer;
+};
+
+struct closure_with_waitlist_and_timer {
+ struct closure cl;
+ closure_list_t wait;
+ struct timer_list timer;
+};
+
+extern unsigned invalid_closure_type(void);
+
+#define __CL_TYPE(cl, _t) \
+ __builtin_types_compatible_p(typeof(cl), struct _t) \
+ ? TYPE_ ## _t : \
+
+#define __closure_type(cl) \
+( \
+ __CL_TYPE(cl, closure) \
+ __CL_TYPE(cl, closure_with_waitlist) \
+ __CL_TYPE(cl, closure_with_timer) \
+ __CL_TYPE(cl, closure_with_waitlist_and_timer) \
+ invalid_closure_type() \
+)
+
+void closure_sub(struct closure *cl, int v);
+void closure_put(struct closure *cl);
+void closure_queue(struct closure *cl);
+void __closure_wake_up(closure_list_t *list);
+bool closure_wait(closure_list_t *list, struct closure *cl);
+void closure_sync(struct closure *cl);
+
+bool closure_trylock(struct closure *cl, struct closure *parent);
+void __closure_lock(struct closure *cl, struct closure *parent,
+ closure_list_t *wait_list);
+
+void do_closure_timer_init(struct closure *cl);
+bool __closure_sleep(struct closure *cl, unsigned long delay,
+ struct timer_list *timer);
+void __closure_flush(struct closure *cl, struct timer_list *timer);
+void __closure_flush_sync(struct closure *cl, struct timer_list *timer);
+
+#ifdef CONFIG_DEBUG_CLOSURES
+
+void closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ BUG_ON((atomic_inc_return(&cl->remaining) &
+ CLOSURE_REMAINING_MASK) <= 1);
+#else
+ atomic_inc(&cl->remaining);
+#endif
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+ atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline bool closure_is_stopped(struct closure *cl)
+{
+ return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING);
+}
+
+static inline bool closure_is_unlocked(struct closure *cl)
+{
+ return atomic_read(&cl->remaining) == -1;
+}
+
+static inline void do_closure_init(struct closure *cl, struct closure *parent,
+ bool running)
+{
+ switch (cl->type) {
+ case TYPE_closure_with_timer:
+ case TYPE_closure_with_waitlist_and_timer:
+ do_closure_timer_init(cl);
+ }
+
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_DEBUG_OBJECTS_WORK)
+ INIT_WORK(&cl->work, NULL);
+#endif
+ cl->parent = parent;
+ if (parent)
+ closure_get(parent);
+
+ if (running) {
+ closure_debug_create(cl);
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+ } else
+ atomic_set(&cl->remaining, -1);
+}
+
+/*
+ * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
+ * the result of __closure_type() is thrown away, it's used merely for type
+ * checking.
+ */
+#define __to_internal_closure(cl) \
+({ \
+ BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \
+ (struct closure *) cl; \
+})
+
+#define closure_init_type(cl, parent, running, memset) \
+do { \
+ struct closure *_cl = __to_internal_closure(cl); \
+ _cl->type = __closure_type(*(cl)); \
+ closure_set_ip(_cl); \
+ do_closure_init(_cl, parent, running); \
+} while (0)
+
+/**
+ * __closure_init() - Initialize a closure, skipping the memset()
+ *
+ * May be used instead of closure_init() when memory has already been zeroed.
+ */
+#define __closure_init(cl, parent) \
+ closure_init_type(cl, parent, true, false)
+
+/**
+ * closure_init() - Initialize a closure, setting the refcount to 1
+ * @cl: closure to initialize
+ * @parent: parent of the new closure. cl will take a refcount on it for its
+ * lifetime; may be NULL.
+ */
+#define closure_init(cl, parent) \
+ closure_init_type(cl, parent, true, true)
+
+static inline void closure_init_stack(struct closure *cl)
+{
+ memset(cl, 0, sizeof(struct closure));
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|
+ CLOSURE_BLOCKING|CLOSURE_STACK);
+}
+
+/**
+ * closure_init_unlocked() - Initialize a closure but leave it unlocked.
+ * @cl: closure to initialize
+ *
+ * For when the closure will be used as a lock. The closure may not be used
+ * until after a closure_lock() or closure_trylock().
+ */
+#define closure_init_unlocked(cl) \
+ closure_init_type(cl, NULL, false, true)
+
+/**
+ * closure_lock() - lock and initialize a closure.
+ * @cl: the closure to lock
+ * @parent: the new parent for this closure
+ *
+ * The closure must be of one of the types that has a waitlist (otherwise we
+ * wouldn't be able to sleep on contention).
+ *
+ * @parent has exactly the same meaning as in closure_init(); if non null, the
+ * closure will take a reference on @parent which will be released when it is
+ * unlocked.
+ */
+#define closure_lock(cl, parent) \
+ __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
+
+/**
+ * closure_sleep() - asynchronous sleep
+ * @cl: the closure that will sleep
+ * @delay: the delay in jiffies
+ *
+ * Takes a refcount on @cl which will be released after @delay jiffies; this may
+ * be used to have a function run after a delay with continue_at(), or
+ * closure_sync() may be used for a convoluted version of msleep().
+ */
+#define closure_sleep(cl, delay) \
+ __closure_sleep(__to_internal_closure(cl), delay, &(cl)->timer)
+
+#define closure_flush(cl) \
+ __closure_flush(__to_internal_closure(cl), &(cl)->timer)
+
+#define closure_flush_sync(cl) \
+ __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
+
+static inline void __closure_end_sleep(struct closure *cl)
+{
+ __set_current_state(TASK_RUNNING);
+
+ if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
+ atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+static inline void __closure_start_sleep(struct closure *cl)
+{
+ closure_set_ip(cl);
+ cl->task = current;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+ atomic_add(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+/**
+ * closure_blocking() - returns true if the closure is in blocking mode.
+ *
+ * If a closure is in blocking mode, closure_wait_event() will sleep until the
+ * condition is true instead of waiting asynchronously.
+ */
+static inline bool closure_blocking(struct closure *cl)
+{
+ return atomic_read(&cl->remaining) & CLOSURE_BLOCKING;
+}
+
+/**
+ * set_closure_blocking() - put a closure in blocking mode.
+ *
+ * If a closure is in blocking mode, closure_wait_event() will sleep until the
+ * condition is true instead of waiting asynchronously.
+ *
+ * Not thread safe - can only be called by the thread running the closure.
+ */
+static inline void set_closure_blocking(struct closure *cl)
+{
+ if (!closure_blocking(cl))
+ atomic_add(CLOSURE_BLOCKING, &cl->remaining);
+}
+
+/*
+ * Not thread safe - can only be called by the thread running the closure.
+ */
+static inline void clear_closure_blocking(struct closure *cl)
+{
+ if (closure_blocking(cl))
+ atomic_sub(CLOSURE_BLOCKING, &cl->remaining);
+}
+
+/**
+ * closure_wake_up() - wake up all closures on a wait list.
+ */
+static inline void closure_wake_up(closure_list_t *list)
+{
+ smp_mb();
+ __closure_wake_up(list);
+}
+
+/*
+ * Wait on an event, synchronously or asynchronously - analogous to wait_event()
+ * but for closures.
+ *
+ * The loop is oddly structured so as to avoid a race; we must check the
+ * condition again after we've added ourselves to the waitlist. We know if we were
+ * already on the waitlist because closure_wait() returns false; thus, we only
+ * schedule or break if closure_wait() returns false. If it returns true, we
+ * just loop again - rechecking the condition.
+ *
+ * The __closure_wake_up() is necessary because we may race with the event
+ * becoming true; i.e. we see event false -> wait -> recheck condition, but the
+ * thread that made the event true may have called closure_wake_up() before we
+ * added ourselves to the wait list.
+ *
+ * We have to call closure_sync() at the end instead of just
+ * __closure_end_sleep() because a different thread might've called
+ * closure_wake_up() before us and gotten preempted before they dropped the
+ * refcount on our closure. If this was a stack allocated closure, that would be
+ * bad.
+ */
+#define __closure_wait_event(list, cl, condition, _block) \
+({ \
+ __label__ out; \
+ bool block = _block; \
+ typeof(condition) ret; \
+ \
+ while (!(ret = (condition))) { \
+ if (block) \
+ __closure_start_sleep(cl); \
+ if (!closure_wait(list, cl)) { \
+ if (!block) \
+ goto out; \
+ schedule(); \
+ } \
+ } \
+ __closure_wake_up(list); \
+ if (block) \
+ closure_sync(cl); \
+out: \
+ ret; \
+})
+
+/**
+ * closure_wait_event() - wait on a condition, synchronously or asynchronously.
+ * @list: the wait list to wait on
+ * @cl: the closure that is doing the waiting
+ * @condition: a C expression for the event to wait for
+ *
+ * If the closure is in blocking mode, sleeps until the @condition evaluates to
+ * true - exactly like wait_event().
+ *
+ * If the closure is not in blocking mode, waits asynchronously; if the
+ * condition is currently false the @cl is put onto @list and returns. @list
+ * owns a refcount on @cl; closure_sync() or continue_at() may be used later to
+ * wait for another thread to wake up @list, which drops the refcount on @cl.
+ *
+ * Returns the value of @condition; @cl will be on @list iff @condition was
+ * false.
+ *
+ * closure_wake_up(@list) must be called after changing any variable that could
+ * cause @condition to become true.
+ */
+#define closure_wait_event(list, cl, condition) \
+ __closure_wait_event(list, cl, condition, closure_blocking(cl))
+
+#define closure_wait_event_async(list, cl, condition) \
+ __closure_wait_event(list, cl, condition, false)
+
+#define closure_wait_event_sync(list, cl, condition) \
+ __closure_wait_event(list, cl, condition, true)
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+ struct workqueue_struct *wq)
+{
+ cl->fn = fn;
+ cl->wq = wq;
+ /* order the fn/wq stores before the atomic_dec() in closure_put() */
+ smp_mb__before_atomic_dec();
+}
+
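+/*
+ * continue_at() must be the last thing a closure function does: it sets the
+ * next function to run (out of @_wq if @_wq is non NULL, else called directly),
+ * clears CLOSURE_RUNNING and drops this thread's ref, and then returns from the
+ * enclosing function. It must not be used on stack allocated closures.
+ * closure_return() is continue_at() with a NULL function - i.e. "return up the
+ * closure stack".
+ */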
+#define continue_at(_cl, _fn, _wq, ...) \
+do { \
+ BUG_ON(!(_cl) || object_is_on_stack(_cl)); \
+ closure_set_ip(_cl); \
+ set_closure_fn(_cl, _fn, _wq); \
+ closure_sub(_cl, CLOSURE_RUNNING + 1); \
+ return __VA_ARGS__; \
+} while (0)
+
+#define closure_return(_cl) continue_at((_cl), NULL, NULL)
+
+#endif /* _LINUX_CLOSURE_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 028aba9..87abaef 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -316,4 +316,7 @@ config SIGNATURE
Digital signature verification. Currently only RSA is supported.
Implementation is done using GnuPG MPI library
+config CLOSURES
+ bool
+
endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8745ac7..ba9ede8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -374,6 +374,15 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
help
Debug objects boot parameter default value
+config DEBUG_CLOSURES
+ bool "Debug closures"
+ depends on CLOSURES
+ select DEBUG_FS
+ ---help---
+ Keeps all active closures in a linked list and provides a debugfs
+ interface to list them, which makes it possible to see asynchronous
+ operations that get stuck.
+
config DEBUG_SLAB
bool "Debug slab memory allocations"
depends on DEBUG_KERNEL && SLAB && !KMEMCHECK
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..1e9354e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -123,6 +123,8 @@ obj-$(CONFIG_SIGNATURE) += digsig.o
obj-$(CONFIG_CLZ_TAB) += clz_tab.o
+obj-$(CONFIG_CLOSURES) += closure.o
+
hostprogs-y := gen_crc32table
clean-files := crc32table.h
diff --git a/lib/closure.c b/lib/closure.c
new file mode 100644
index 0000000..5e9fd98
--- /dev/null
+++ b/lib/closure.c
@@ -0,0 +1,363 @@
+
+#include <linux/closure.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+/*
+ * Closure-like things
+ * See include/linux/closure.h for full documentation
+ */
+
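+/*
+ * Run the closure's next function: queue cl->work on cl->wq if a workqueue was
+ * given, otherwise call cl->fn() directly in the current context.
+ */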
+void closure_queue(struct closure *cl)
+{
+ struct workqueue_struct *wq = cl->wq;
+ if (wq) {
+ cl->work.data = (atomic_long_t) WORK_DATA_INIT();
+ INIT_LIST_HEAD(&cl->work.entry);
+ BUG_ON(!queue_work(wq, &cl->work));
+ } else
+ cl->fn(cl);
+}
+EXPORT_SYMBOL_GPL(closure_queue);
+
+static void closure_wake_up_after_xchg(struct llist_node *);
+
+#define CL_FIELD(type, field) \
+ case TYPE_ ## type: \
+ return &container_of(cl, struct type, cl)->field
+
+static closure_list_t *closure_waitlist(struct closure *cl)
+{
+ switch (cl->type) {
+ CL_FIELD(closure_with_waitlist, wait);
+ CL_FIELD(closure_with_waitlist_and_timer, wait);
+ }
+ return NULL;
+}
+
+static struct timer_list *closure_timer(struct closure *cl)
+{
+ switch (cl->type) {
+ CL_FIELD(closure_with_timer, timer);
+ CL_FIELD(closure_with_waitlist_and_timer, timer);
+ }
+ return NULL;
+}
+
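+/*
+ * Called with the value of cl->remaining after the subtraction: wakes the
+ * owning thread if it's sleeping and only its own ref remains, runs the next
+ * function if one was set with continue_at(), or otherwise finishes the
+ * closure - waking its waitlist and putting its parent.
+ */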
+static void closure_put_after_sub(struct closure *cl, int r)
+{
+ BUG_ON(r & CLOSURE_GUARD_MASK);
+ /* CLOSURE_BLOCKING is the only flag that's allowed when r hits 0 */
+ BUG_ON((r & CLOSURE_REMAINING_MASK) == 0 &&
+ (r & ~CLOSURE_BLOCKING));
+
+ /* Must deliver precisely one wakeup */
+ if ((r & CLOSURE_REMAINING_MASK) == 1 &&
+ (r & CLOSURE_SLEEPING)) {
+ smp_mb__after_atomic_dec();
+ wake_up_process(cl->task);
+ }
+
+ if ((r & CLOSURE_REMAINING_MASK) == 0) {
+ smp_mb__after_atomic_dec();
+
+ if (cl->fn) {
+ /* CLOSURE_BLOCKING might be set - clear it */
+ atomic_set(&cl->remaining,
+ CLOSURE_REMAINING_INITIALIZER);
+ closure_queue(cl);
+ } else {
+ struct closure *parent = cl->parent;
+ closure_list_t *wait = closure_waitlist(cl);
+
+ closure_debug_destroy(cl);
+
+ smp_wmb();
+ /* mb between last use of closure and unlocking it */
+ atomic_set(&cl->remaining, -1);
+
+ if (wait)
+ closure_wake_up(wait);
+
+ if (parent)
+ closure_put(parent);
+ }
+ }
+}
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
+{
+ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+}
+EXPORT_SYMBOL_GPL(closure_sub);
+
+void closure_put(struct closure *cl)
+{
+ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+}
+EXPORT_SYMBOL_GPL(closure_put);
+
+static void set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+ cl->waiting_on = f;
+#endif
+}
+
+/*
+ * Broken up because closure_put() has to do the xchg() and grab the wait list
+ * before unlocking the closure, but the wakeup has to come after unlocking the
+ * closure.
+ */
+static void closure_wake_up_after_xchg(struct llist_node *list)
+{
+ struct closure *cl;
+ struct llist_node *reverse = NULL;
+
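+	/*
+	 * llist_add() pushes to the head, so first reverse the list to wake
+	 * closures up in the order they called closure_wait():
+	 */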
+ while (list) {
+ struct llist_node *t = list;
+ list = llist_next(list);
+
+ t->next = reverse;
+ reverse = t;
+ }
+
+ while (reverse) {
+ cl = container_of(reverse, struct closure, list);
+ reverse = llist_next(reverse);
+
+ set_waiting(cl, 0);
+ closure_sub(cl, CLOSURE_WAITING + 1);
+ }
+}
+
+void __closure_wake_up(closure_list_t *list)
+{
+ closure_wake_up_after_xchg(llist_del_all(list));
+}
+EXPORT_SYMBOL_GPL(__closure_wake_up);
+
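+/*
+ * Put @cl on @list and take a ref on it - unless @cl is already on a wait
+ * list, in which case nothing is done and false is returned.
+ */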
+bool closure_wait(closure_list_t *list, struct closure *cl)
+{
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+ return false;
+
+ set_waiting(cl, _RET_IP_);
+ atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+ llist_add(&cl->list, list);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(closure_wait);
+
+/**
+ * closure_sync() - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+void closure_sync(struct closure *cl)
+{
+ while (1) {
+ __closure_start_sleep(cl);
+ closure_set_ret_ip(cl);
+
+ if ((atomic_read(&cl->remaining) &
+ CLOSURE_REMAINING_MASK) == 1)
+ break;
+
+ schedule();
+ }
+
+ __closure_end_sleep(cl);
+}
+EXPORT_SYMBOL_GPL(closure_sync);
+
+/**
+ * closure_trylock() - try to acquire the closure, without waiting
+ * @cl: closure to lock
+ *
+ * Returns true if the closure was successfully locked.
+ */
+bool closure_trylock(struct closure *cl, struct closure *parent)
+{
+ if (atomic_cmpxchg(&cl->remaining, -1,
+ CLOSURE_REMAINING_INITIALIZER) != -1)
+ return false;
+
+ closure_set_ret_ip(cl);
+
+ smp_mb();
+ cl->parent = parent;
+ if (parent)
+ closure_get(parent);
+
+ closure_debug_create(cl);
+ return true;
+}
+EXPORT_SYMBOL_GPL(closure_trylock);
+
+void __closure_lock(struct closure *cl, struct closure *parent,
+ closure_list_t *wait_list)
+{
+ struct closure wait;
+ closure_init_stack(&wait);
+
+ while (1) {
+ if (closure_trylock(cl, parent))
+ return;
+
+ closure_wait_event_sync(wait_list, &wait,
+ atomic_read(&cl->remaining) == -1);
+ }
+}
+EXPORT_SYMBOL_GPL(__closure_lock);
+
+static void closure_sleep_timer_fn(unsigned long data)
+{
+ struct closure *cl = (struct closure *) data;
+ closure_sub(cl, CLOSURE_TIMER + 1);
+}
+
+void do_closure_timer_init(struct closure *cl)
+{
+ struct timer_list *timer = closure_timer(cl);
+
+ init_timer(timer);
+ timer->data = (unsigned long) cl;
+ timer->function = closure_sleep_timer_fn;
+}
+EXPORT_SYMBOL_GPL(do_closure_timer_init);
+
+bool __closure_sleep(struct closure *cl, unsigned long delay,
+ struct timer_list *timer)
+{
+ if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
+ return false;
+
+ BUG_ON(timer_pending(timer));
+
+ timer->expires = jiffies + delay;
+
+ atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
+ add_timer(timer);
+ return true;
+}
+EXPORT_SYMBOL_GPL(__closure_sleep);
+
+void __closure_flush(struct closure *cl, struct timer_list *timer)
+{
+ if (del_timer(timer))
+ closure_sub(cl, CLOSURE_TIMER + 1);
+}
+EXPORT_SYMBOL_GPL(__closure_flush);
+
+void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
+{
+ if (del_timer_sync(timer))
+ closure_sub(cl, CLOSURE_TIMER + 1);
+}
+EXPORT_SYMBOL_GPL(__closure_flush_sync);
+
+#ifdef CONFIG_DEBUG_CLOSURES
+
+static LIST_HEAD(closure_list);
+static DEFINE_SPINLOCK(closure_list_lock);
+
+void closure_debug_create(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_ALIVE;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_add(&cl->all, &closure_list);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(closure_debug_create);
+
+void closure_debug_destroy(struct closure *cl)
+{
+ unsigned long flags;
+
+ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
+ cl->magic = CLOSURE_MAGIC_DEAD;
+
+ spin_lock_irqsave(&closure_list_lock, flags);
+ list_del(&cl->all);
+ spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(closure_debug_destroy);
+
+static struct dentry *debug;
+
+#define work_data_bits(work) ((unsigned long *)(&(work)->data))
+
+static int debug_seq_show(struct seq_file *f, void *data)
+{
+ struct closure *cl;
+ spin_lock_irq(&closure_list_lock);
+
+ list_for_each_entry(cl, &closure_list, all) {
+ int r = atomic_read(&cl->remaining);
+
+ seq_printf(f, "%p: %pF -> %pf p %p r %i ",
+ cl, (void *) cl->ip, cl->fn, cl->parent,
+ r & CLOSURE_REMAINING_MASK);
+
+ if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&cl->work)))
+ seq_printf(f, "Q");
+
+ if (r & CLOSURE_RUNNING)
+ seq_printf(f, "R");
+
+ if (r & CLOSURE_BLOCKING)
+ seq_printf(f, "B");
+
+ if (r & CLOSURE_STACK)
+ seq_printf(f, "S");
+
+ if (r & CLOSURE_SLEEPING)
+ seq_printf(f, "Sl");
+
+ if (r & CLOSURE_TIMER)
+ seq_printf(f, "T");
+
+ if (r & CLOSURE_WAITING)
+ seq_printf(f, " W %pF\n",
+ (void *) cl->waiting_on);
+
+ seq_printf(f, "\n");
+ }
+
+ spin_unlock_irq(&closure_list_lock);
+ return 0;
+}
+
+static int debug_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, debug_seq_show, NULL);
+}
+
+static const struct file_operations debug_ops = {
+ .owner = THIS_MODULE,
+ .open = debug_seq_open,
+ .read = seq_read,
+ .release = single_release
+};
+
+int __init closure_debug_init(void)
+{
+ debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+ return 0;
+}
+
+module_init(closure_debug_init);
+
+#endif
+
+MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
+MODULE_LICENSE("GPL");
--
1.7.9.rc2
^ permalink raw reply related [flat|nested] 87+ messages in thread
* [Bcache v13 08/16] bcache: Documentation, and changes to generic code
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (2 preceding siblings ...)
2012-05-10 3:09 ` [Bcache v13 07/16] Closures Kent Overstreet
@ 2012-05-10 3:10 ` Kent Overstreet
2012-05-10 3:10 ` [Bcache v13 09/16] Bcache: generic utility code Kent Overstreet
` (9 subsequent siblings)
13 siblings, 0 replies; 87+ messages in thread
From: Kent Overstreet @ 2012-05-10 3:10 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
Documentation/ABI/testing/sysfs-block-bcache | 156 ++++++++++++++++
Documentation/bcache.txt | 255 +++++++++++++++++++++++++
drivers/block/Kconfig | 2 +
drivers/block/Makefile | 1 +
drivers/block/bcache/Kconfig | 42 +++++
drivers/block/bcache/Makefile | 8 +
include/linux/cgroup_subsys.h | 6 +
include/linux/sched.h | 4 +
include/trace/events/bcache.h | 257 ++++++++++++++++++++++++++
kernel/fork.c | 4 +
10 files changed, 735 insertions(+), 0 deletions(-)
create mode 100644 Documentation/ABI/testing/sysfs-block-bcache
create mode 100644 Documentation/bcache.txt
create mode 100644 drivers/block/bcache/Kconfig
create mode 100644 drivers/block/bcache/Makefile
create mode 100644 include/trace/events/bcache.h
diff --git a/Documentation/ABI/testing/sysfs-block-bcache b/Documentation/ABI/testing/sysfs-block-bcache
new file mode 100644
index 0000000..9e4bbc5
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-block-bcache
@@ -0,0 +1,156 @@
+What: /sys/block/<disk>/bcache/unregister
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ A write to this file causes the backing device or cache to be
+ unregistered. If a backing device had dirty data in the cache,
+ writeback mode is automatically disabled and all dirty data is
+ flushed before the device is unregistered. Caches unregister
+ all associated backing devices before unregistering themselves.
+
+What: /sys/block/<disk>/bcache/clear_stats
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ Writing to this file resets all the statistics for the device.
+
+What: /sys/block/<disk>/bcache/cache
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For a backing device that has cache, a symlink to
+ the bcache/ dir of that cache.
+
+What: /sys/block/<disk>/bcache/cache_hits
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For backing devices: integer number of full cache hits,
+ counted per bio. A partial cache hit counts as a miss.
+
+What: /sys/block/<disk>/bcache/cache_misses
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For backing devices: integer number of cache misses.
+
+What: /sys/block/<disk>/bcache/cache_hit_ratio
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For backing devices: cache hits as a percentage.
+
+What: /sys/block/<disk>/bcache/sequential_cutoff
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For backing devices: Threshold past which sequential IO will
+ skip the cache. Read and written as bytes in human readable
+ units (i.e. echo 10M > sequential_cutoff).
+
+What: /sys/block/<disk>/bcache/bypassed
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ Sum of all reads and writes that have bypassed the cache (due
+ to the sequential cutoff). Expressed as bytes in human
+ readable units.
+
+What: /sys/block/<disk>/bcache/writeback
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For backing devices: When on, writeback caching is enabled and
+ writes will be buffered in the cache. When off, caching is in
+ writethrough mode; reads and writes will be added to the
+ cache but no write buffering will take place.
+
+What: /sys/block/<disk>/bcache/writeback_running
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For backing devices: when off, dirty data will not be written
+ from the cache to the backing device. The cache will still be
+ used to buffer writes until it is mostly full, at which point
+ writes transparently revert to writethrough mode. Intended only
+ for benchmarking/testing.
+
+What: /sys/block/<disk>/bcache/writeback_delay
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For backing devices: In writeback mode, when dirty data is
+ written to the cache and the cache held no dirty data for that
+ backing device, writeback from cache to backing device starts
+ after this delay, expressed as an integer number of seconds.
+
+What: /sys/block/<disk>/bcache/writeback_percent
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For backing devices: If nonzero, writeback from cache to
+ backing device only takes place when more than this percentage
+ of the cache is used, allowing more write coalescing to take
+ place and reducing total number of writes sent to the backing
+ device. Integer between 0 and 40.
+
+What: /sys/block/<disk>/bcache/synchronous
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For a cache, a boolean that allows synchronous mode to be
+ switched on and off. In synchronous mode all writes are ordered
+ such that the cache can reliably recover from unclean shutdown;
+ if disabled, bcache will not generally wait for writes to
+ complete, but if the cache is not shut down cleanly all data
+ will be discarded from the cache. Should not be turned off with
+ writeback caching enabled.
+
+What: /sys/block/<disk>/bcache/discard
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For a cache, a boolean allowing discard/TRIM to be turned off
+ or back on if the device supports it.
+
+What: /sys/block/<disk>/bcache/bucket_size
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For a cache, bucket size in human readable units, as set at
+ cache creation time; should match the erase block size of the
+ SSD for optimal performance.
+
+What: /sys/block/<disk>/bcache/nbuckets
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For a cache, the number of usable buckets.
+
+What: /sys/block/<disk>/bcache/tree_depth
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For a cache, height of the btree excluding leaf nodes (i.e. a
+ one node tree will have a depth of 0).
+
+What: /sys/block/<disk>/bcache/btree_cache_size
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ Number of btree buckets/nodes that are currently cached in
+ memory; cache dynamically grows and shrinks in response to
+ memory pressure from the rest of the system.
+
+What: /sys/block/<disk>/bcache/written
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For a cache, total amount of data in human readable units
+ written to the cache, excluding all metadata.
+
+What: /sys/block/<disk>/bcache/btree_written
+Date: November 2010
+Contact: Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+ For a cache, sum of all btree writes in human readable units.
diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt
new file mode 100644
index 0000000..270c734
--- /dev/null
+++ b/Documentation/bcache.txt
@@ -0,0 +1,255 @@
+Say you've got a big slow raid 6, and an X-25E or three. Wouldn't it be
+nice if you could use them as cache... Hence bcache.
+
+Userspace tools and a wiki are at:
+ git://evilpiepirate.org/~kent/bcache-tools.git
+ http://bcache.evilpiepirate.org
+
+It's designed around the performance characteristics of SSDs - it only allocates
+in erase block sized buckets, and it uses a hybrid btree/log to track cached
+extents (which can be anywhere from a single sector to the bucket size). It's
+designed to avoid random writes at all costs; it fills up an erase block
+sequentially, then issues a discard before reusing it.
+
+Both writethrough and writeback caching are supported. Writeback defaults to
+off, but can be switched on and off arbitrarily at runtime. Bcache goes to
+great lengths to protect your data - it reliably handles unclean shutdown. (It
+doesn't even have a notion of a clean shutdown; bcache simply doesn't return
+writes as completed until they're on stable storage).
+
+Writeback caching can use most of the cache for buffering writes - writing
+dirty data to the backing device is always done sequentially, scanning from the
+start to the end of the index.
+
+Since random IO is what SSDs excel at, there generally won't be much benefit
+to caching large sequential IO. Bcache detects sequential IO and skips it;
+it also keeps a rolling average of the IO sizes per task, and as long as the
+average is above the cutoff it will skip all IO from that task - instead of
+caching the first 512k after every seek. Backups and large file copies should
+thus entirely bypass the cache.
+
+In the event of a data IO error on the flash it will try to recover by reading
+from disk or invalidating cache entries. For unrecoverable errors (metadata
+or dirty data), caching is automatically disabled; if dirty data was present
+in the cache it first disables writeback caching and waits for all dirty data
+to be flushed.
+
+Getting started:
+You'll need make-bcache from the bcache-tools repository. Both the cache device
+and backing device must be formatted before use.
+ make-bcache -B /dev/sdb
+ make-bcache -C -w2k -b1M -j64 /dev/sdc
+
+To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register:
+ echo /dev/sdb > /sys/fs/bcache/register
+ echo /dev/sdc > /sys/fs/bcache/register
+
+To register your bcache devices automatically, you could add something like
+this to an init script:
+ echo /dev/sd* > /sys/fs/bcache/register_quiet
+
+It'll look for bcache superblocks and ignore everything that doesn't have one.
+
+When you register a backing device, you'll get a new /dev/bcache# device:
+ mkfs.ext4 /dev/bcache0
+ mount /dev/bcache0 /mnt
+
+Cache devices are managed as sets; multiple caches per set isn't supported yet
+but will allow for mirroring of metadata and dirty data in the future. Your new
+cache set shows up as /sys/fs/bcache/<UUID>
+
+To enable caching, you need to attach the backing device to the cache set by
+specifying the UUID:
+ echo <UUID> > /sys/block/sdb/bcache/attach
+
+The cache set with that UUID need not be registered to attach to it - the UUID
+will be saved to the backing device's superblock and it'll start being cached
+when the cache set does show up.
+
+This only has to be done once. The next time you reboot, just reregister all
+your bcache devices. If a backing device has data in a cache somewhere, the
+/dev/bcache# device won't be created until the cache shows up - particularly
+important if you have writeback caching turned on.
+
+If you're booting up and your cache device is gone and never coming back, you
+can force run the backing device:
+ echo 1 > /sys/block/sdb/bcache/running
+
+The backing device will still use that cache set if it shows up in the future,
+but all the cached data will be invalidated. If there was dirty data in the
+cache, don't expect the filesystem to be recoverable - you will have massive
+filesystem corruption, though ext4's fsck does work miracles.
+
+
+Other sysfs files for the backing device:
+
+ bypassed
+ Sum of all IO, reads and writes, that have bypassed the cache
+
+ cache_hits
+ cache_misses
+ cache_hit_ratio
+ Hits and misses are counted per individual IO as bcache sees them; a
+ partial hit is counted as a miss.
+
+ cache_miss_collisions
+ Count of times a read completes but the data is already in the cache and
+ is therefore redundant. This is usually caused by readahead while a
+ read to the same location occurs.
+
+ cache_readaheads
+ Count of times readahead occurred.
+
+ clear_stats
+ Writing to this file resets all the statistics.
+
+ flush_delay_ms
+ flush_delay_ms_sync
+ Optional delay for btree writes to allow for more coalescing of updates to
+ the index. Defaults to 0.
+
+ label
+ Name of underlying device.
+
+ readahead
+ Size of readahead that should be performed. Defaults to 0. If set to e.g.
+ 1M, it will round cache miss reads up to that size, but without overlapping
+ existing cache entries.
+
+ running
+ 1 if bcache is running.
+
+ sequential_cutoff
+ A sequential IO will bypass the cache once it passes this threshold; the
+ most recent 128 IOs are tracked so sequential IO can be detected even when
+ it isn't all done at once.
+
+ sequential_cutoff_average
+ If the weighted average from a client is higher than this cutoff, we bypass
+ all IO.
+
+ unregister
+ Writing to this file disables caching on that device
+
+ writeback
+ Boolean, if off only writethrough caching is done
+
+ writeback_delay
+ When dirty data is written to the cache and it previously did not contain
+ any, waits some number of seconds before initiating writeback. Defaults to
+ 30.
+
+ writeback_percent
+ To allow for more buffering of random writes, writeback only proceeds when
+ more than this percentage of the cache is unavailable. Defaults to 0.
+
+ writeback_running
+ If off, writeback of dirty data will not take place at all. Dirty data will
+ still be added to the cache until it is mostly full; only meant for
+ benchmarking. Defaults to on.
+
+For the cache set:
+ active_journal_entries
+ Number of journal entries that are newer than the index.
+
+ average_key_size
+ Average data per key in the btree.
+
+ average_seconds_between_gc
+ How often garbage collection is occurring.
+
+ block_size
+ Block size of the virtual device.
+
+ btree_avg_keys_written
+ Average number of keys per write to the btree when a node wasn't being
+ rewritten - indicates how much coalescing is taking place.
+
+ btree_cache_size
+ Number of btree buckets currently cached in memory
+
+ btree_nodes
+ Total nodes in the btree.
+
+ btree_used_percent
+ Average fraction of btree in use.
+
+ bucket_size
+ Size of buckets.
+
+ bypassed
+ Sum of all IO, reads and writes, that have bypassed the cache
+
+ cache_available_percent
+ Percentage of cache device free.
+
+ clear_stats
+ Clears the statistics associated with this cache
+
+ dirty_data
+ How much dirty data is in the cache.
+
+ gc_ms_max
+ Longest garbage collection, in milliseconds.
+
+ internal/bset_tree_stats
+ internal/btree_cache_max_chain
+ Internal. Statistics about the bset tree and chain length. Likely to be
+ hidden soon.
+
+ io_error_halflife
+ io_error_limit
+ These determine how many errors we accept before disabling the cache.
+ Each error is decayed by the half life (in # ios). If the decaying count
+ reaches io_error_limit, dirty data is written out and the cache is disabled.
+
+ root_usage_percent
+ Percentage of the root btree node in use. If this gets too high the node
+ will split, increasing the tree depth.
+
+ seconds_since_gc
+ Seconds since the last garbage collection ran.
+
+ synchronous
+ Boolean; when on, all writes to the cache are strictly ordered such that it
+ can recover from unclean shutdown. If off it will not generally wait for
+ writes to complete, but the entire cache contents will be invalidated on
+ unclean shutdown. Not recommended that it be turned off when writeback is
+ on.
+
+ tree_depth
+ Depth of the btree.
+
+ trigger_gc
+ Force garbage collection to run now.
+
+ unregister
+ Closes the cache device and all devices being cached; if dirty data is
+ present it will disable writeback caching and wait for it to be flushed.
+
+
+For each cache within a cache set:
+ btree_written
+ Sum of all btree writes, in (kilo/mega/giga) bytes
+
+ discard
+ Boolean; if on a discard/TRIM will be issued to each bucket before it is
+ reused. Defaults to on if supported.
+
+ io_errors
+ Number of errors that have occurred, decayed by io_error_halflife.
+
+ metadata_written
+ Total metadata written (btree + other metadata).
+
+ nbuckets
+ Total buckets in this cache
+
+ priority_stats
+ Statistics about how recently data in the cache has been accessed. This can
+ reveal your working set size.
+
+ written
+ Sum of all data that has been written to the cache; comparison with
+ btree_written gives the amount of write inflation in bcache.
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 4e4c8a4..d872600 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -526,6 +526,8 @@ config VIRTIO_BLK
This is the virtual block driver for virtio. It can be used with
lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
+source "drivers/block/bcache/Kconfig"
+
config BLK_DEV_HD
bool "Very old hard disk (MFM/RLL/IDE) driver"
depends on HAVE_IDE
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 5b79505..19520e1 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,5 +41,6 @@ obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
+obj-$(CONFIG_BCACHE) += bcache/
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/bcache/Kconfig b/drivers/block/bcache/Kconfig
new file mode 100644
index 0000000..019f133
--- /dev/null
+++ b/drivers/block/bcache/Kconfig
@@ -0,0 +1,42 @@
+
+config BCACHE
+ tristate "Block device as cache"
+ select COMPACTION
+ select CLOSURES
+ ---help---
+ Allows a block device to be used as cache for other devices; uses
+ a btree for indexing and the layout is optimized for SSDs.
+
+ See Documentation/bcache.txt for details.
+
+config BCACHE_DEBUG
+ bool "Bcache debugging"
+ depends on BCACHE
+ ---help---
+ Don't select this option unless you're a developer
+
+ Enables extra debugging tools (primarily a fuzz tester)
+
+config BCACHE_EDEBUG
+ bool "Extended runtime checks"
+ depends on BCACHE
+ ---help---
+ Don't select this option unless you're a developer
+
+ Enables extra runtime checks which significantly affect performance
+
+config BCACHE_LATENCY_DEBUG
+ bool "Latency tracing for bcache"
+ depends on BCACHE
+ ---help---
+ Hacky latency tracing that has nevertheless been useful in the past:
+ adds a global variable accessible via /sys/fs/bcache/latency_warn_ms,
+ which defaults to 0. If nonzero, any timed operation that takes longer
+ emits a printk.
+
+config CGROUP_BCACHE
+ bool "Cgroup controls for bcache"
+ depends on BCACHE && BLK_CGROUP
+ ---help---
+ TODO
+
diff --git a/drivers/block/bcache/Makefile b/drivers/block/bcache/Makefile
new file mode 100644
index 0000000..84302f9
--- /dev/null
+++ b/drivers/block/bcache/Makefile
@@ -0,0 +1,8 @@
+
+ccflags-y += -std=gnu99
+obj-$(CONFIG_BCACHE) += bcache.o
+
+bcache-y := alloc.o btree.o bset.o io.o journal.o\
+ writeback.o request.o super.o debug.o util.o trace.o stats.o
+
+CFLAGS_request.o += -Iblock
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0bd390c..d698634 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -72,3 +72,9 @@ SUBSYS(net_prio)
#endif
/* */
+
+#ifdef CONFIG_CGROUP_BCACHE
+SUBSYS(bcache)
+#endif
+
+/* */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0657368..63014ba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1590,6 +1590,10 @@ struct task_struct {
#ifdef CONFIG_HAVE_HW_BREAKPOINT
atomic_t ptrace_bp_refcnt;
#endif
+#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
+ unsigned int sequential_io;
+ unsigned int sequential_io_avg;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
new file mode 100644
index 0000000..229ffe1
--- /dev/null
+++ b/include/trace/events/bcache.h
@@ -0,0 +1,257 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bcache
+
+#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BCACHE_H
+
+#include <linux/tracepoint.h>
+
+struct btree_op;
+
+DECLARE_EVENT_CLASS(bcache_request,
+
+ TP_PROTO(struct btree_op *op, struct bio *bio),
+
+ TP_ARGS(op, bio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(unsigned int, orig_major )
+ __field(unsigned int, orig_minor )
+ __field(sector_t, sector )
+ __field(dev_t, orig_sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ __array(char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->orig_major = op->d->disk->major;
+ __entry->orig_minor = op->d->disk->first_minor;
+ __entry->sector = bio->bi_sector;
+ __entry->orig_sector = bio->bi_sector - 16;
+ __entry->nr_sector = bio->bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs,
+ (unsigned long long)__entry->sector,
+ __entry->nr_sector, __entry->comm,
+ __entry->orig_major, __entry->orig_minor,
+ (unsigned long long)__entry->orig_sector)
+);
+
+DEFINE_EVENT(bcache_request, bcache_request_start,
+
+ TP_PROTO(struct btree_op *op, struct bio* bio),
+
+ TP_ARGS(op, bio)
+);
+
+DEFINE_EVENT(bcache_request, bcache_request_end,
+
+ TP_PROTO(struct btree_op *op, struct bio* bio),
+
+ TP_ARGS(op, bio)
+);
+
+DECLARE_EVENT_CLASS(bcache_bio,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(sector_t, sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ __array(char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs,
+ (unsigned long long)__entry->sector,
+ __entry->nr_sector, __entry->comm)
+);
+
+
+DEFINE_EVENT(bcache_bio, bcache_passthrough,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_cache_hit,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_cache_miss,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_read_retry,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_writethrough,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_writeback,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_write_skip,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_btree_read,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_btree_write,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_write_dirty,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_read_dirty,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_journal_write,
+
+ TP_PROTO(struct bio *bio),
+
+ TP_ARGS(bio)
+);
+
+DECLARE_EVENT_CLASS(bcache_cache_bio,
+
+ TP_PROTO(struct bio *bio,
+ sector_t orig_sector,
+ struct block_device* orig_bdev),
+
+ TP_ARGS(bio, orig_sector, orig_bdev),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(dev_t, orig_dev )
+ __field(sector_t, sector )
+ __field(sector_t, orig_sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ __array(char, comm, TASK_COMM_LEN )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev->bd_dev;
+ __entry->orig_dev = orig_bdev->bd_dev;
+ __entry->sector = bio->bi_sector;
+ __entry->orig_sector = orig_sector;
+ __entry->nr_sector = bio->bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+ memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rwbs,
+ (unsigned long long)__entry->sector,
+ __entry->nr_sector, __entry->comm,
+ MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev),
+ (unsigned long long)__entry->orig_sector)
+);
+
+DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert,
+
+ TP_PROTO(struct bio *bio,
+ sector_t orig_sector,
+ struct block_device *orig_bdev),
+
+ TP_ARGS(bio, orig_sector, orig_bdev)
+);
+
+DECLARE_EVENT_CLASS(bcache_gc,
+
+ TP_PROTO(uint8_t *uuid),
+
+ TP_ARGS(uuid),
+
+ TP_STRUCT__entry(
+ __field(uint8_t *, uuid)
+ ),
+
+ TP_fast_assign(
+ __entry->uuid = uuid;
+ ),
+
+ TP_printk("%pU", __entry->uuid)
+);
+
+
+DEFINE_EVENT(bcache_gc, bcache_gc_start,
+
+ TP_PROTO(uint8_t *uuid),
+
+ TP_ARGS(uuid)
+);
+
+DEFINE_EVENT(bcache_gc, bcache_gc_end,
+
+ TP_PROTO(uint8_t *uuid),
+
+ TP_ARGS(uuid)
+);
+
+#endif /* _TRACE_BCACHE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 26a7a67..9b83c81 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1255,6 +1255,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->memcg_batch.do_batch = 0;
p->memcg_batch.memcg = NULL;
#endif
+#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
+ p->sequential_io = 0;
+ p->sequential_io_avg = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
--
1.7.9.rc2
^ permalink raw reply related [flat|nested] 87+ messages in thread
* [Bcache v13 09/16] Bcache: generic utility code
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (3 preceding siblings ...)
2012-05-10 3:10 ` [Bcache v13 08/16] bcache: Documentation, and changes to generic code Kent Overstreet
@ 2012-05-10 3:10 ` Kent Overstreet
[not found] ` <c3f0ca2a499f532253d4c16a30837d43e237266a.1336619038.git.koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-05-10 3:10 ` [Bcache v13 10/16] bcache: Superblock/initialization/sysfs code Kent Overstreet
` (8 subsequent siblings)
13 siblings, 1 reply; 87+ messages in thread
From: Kent Overstreet @ 2012-05-10 3:10 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
Much of this code should be moved out of drivers/block/bcache, but it
was originally written for bcache.
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/block/bcache/util.c | 572 +++++++++++++++++++++++++++++++++++++
drivers/block/bcache/util.h | 657 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 1229 insertions(+), 0 deletions(-)
create mode 100644 drivers/block/bcache/util.c
create mode 100644 drivers/block/bcache/util.h
diff --git a/drivers/block/bcache/util.c b/drivers/block/bcache/util.c
new file mode 100644
index 0000000..200c523
--- /dev/null
+++ b/drivers/block/bcache/util.c
@@ -0,0 +1,572 @@
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+
+#include <linux/dynamic_fault.h>
+
+#include "util.h"
+
+#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
+#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
+
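+/*
+ * Generates strtoint_h(), strtouint_h(), strtoll_h() and strtoull_h(): parse
+ * an integer with an optional human readable suffix (k, m, g, t, p, e, z, y -
+ * each a further factor of 1024), rejecting anything that would overflow the
+ * result type. E.g. "512k" parses to 524288.
+ */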
+#define STRTO_H(name, type) \
+int name ## _h(const char *cp, type *res) \
+{ \
+ int u = 0; \
+ char *e; \
+ type i = simple_ ## name(cp, &e, 10); \
+ \
+ switch (tolower(*e)) { \
+ default: \
+ return -EINVAL; \
+ case 'y': \
+ case 'z': \
+ u++; \
+ case 'e': \
+ u++; \
+ case 'p': \
+ u++; \
+ case 't': \
+ u++; \
+ case 'g': \
+ u++; \
+ case 'm': \
+ u++; \
+ case 'k': \
+ u++; \
+ if (e++ == cp) \
+ return -EINVAL; \
+ case '\n': \
+ case '\0': \
+ if (*e == '\n') \
+ e++; \
+ } \
+ \
+ if (*e) \
+ return -EINVAL; \
+ \
+ while (u--) { \
+ if ((type) ~0 > 0 && \
+ (type) ~0 / 1024 <= i) \
+ return -EINVAL; \
+ if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
+ (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
+ return -EINVAL; \
+ i *= 1024; \
+ } \
+ \
+ *res = i; \
+ return 0; \
+} \
+EXPORT_SYMBOL_GPL(name ## _h);
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
+
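+/*
+ * Inverse of the parsers above: print a 64 bit integer as a human readable
+ * size, base 1024, with a k/M/G/T/P/E/Z/Y suffix.
+ */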
+ssize_t hprint(char *buf, int64_t v)
+{
+ static const char units[] = "?kMGTPEZY";
+ char dec[3] = "";
+ int u, t = 0;
+
+ for (u = 0; v >= 1024 || v <= -1024; u++) {
+ t = v & ~(~0 << 10);
+ v >>= 10;
+ }
+
+ if (!u)
+ return sprintf(buf, "%llu", v);
+
+ if (v < 100 && v > -100)
+ sprintf(dec, ".%i", t / 100);
+
+ return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+}
+EXPORT_SYMBOL_GPL(hprint);
+
+ssize_t sprint_string_list(char *buf, const char * const list[],
+ size_t selected)
+{
+ char *out = buf;
+
+ for (size_t i = 0; list[i]; i++)
+ out += sprintf(out, i == selected ? "[%s] " : "%s ", list[i]);
+
+ out[-1] = '\n';
+ return out - buf;
+}
+EXPORT_SYMBOL_GPL(sprint_string_list);
+
+ssize_t read_string_list(const char *buf, const char * const list[])
+{
+ size_t i;
+ char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ s = strim(d);
+
+ for (i = 0; list[i]; i++)
+ if (!strcmp(list[i], s))
+ break;
+
+ kfree(d);
+
+ if (!list[i])
+ return -EINVAL;
+
+ return i;
+}
+EXPORT_SYMBOL_GPL(read_string_list);
+
+bool is_zero(const char *p, size_t n)
+{
+ for (size_t i = 0; i < n; i++)
+ if (p[i])
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(is_zero);
+
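+/*
+ * Parse a UUID given as hex digits (dashes and colons are skipped) into 16
+ * binary bytes; returns the number of characters consumed.
+ */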
+int parse_uuid(const char *s, char *uuid)
+{
+ size_t i, j, x;
+ memset(uuid, 0, 16);
+
+ for (i = 0, j = 0;
+ i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32;
+ i++) {
+ x = s[i] | 32;
+
+ switch (x) {
+ case '0'...'9':
+ x -= '0';
+ break;
+ case 'a'...'f':
+ x -= 'a' - 10;
+ break;
+ default:
+ continue;
+ }
+
+ if (!(j & 1))
+ x <<= 4;
+ uuid[j++ >> 1] |= x;
+ }
+ return i;
+}
+EXPORT_SYMBOL_GPL(parse_uuid);
+
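+/*
+ * Record one completed operation that started at @start_time (a local_clock()
+ * timestamp): updates the maximum duration, an EWMA of the duration, and an
+ * EWMA of the interval between successive calls.
+ */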
+void time_stats_update(struct time_stats *stats, uint64_t start_time)
+{
+ uint64_t now = local_clock();
+ uint64_t duration = time_after64(now, start_time)
+ ? now - start_time : 0;
+ uint64_t last = time_after64(now, stats->last)
+ ? now - stats->last : 0;
+
+ stats->max_duration = max(stats->max_duration, duration);
+
+ if (stats->last) {
+ ewma_add(stats->average_duration, duration, 8, 8);
+
+ if (stats->average_frequency)
+ ewma_add(stats->average_frequency, last, 8, 8);
+ else
+ stats->average_frequency = last << 8;
+ } else
+ stats->average_duration = duration << 8;
+
+ stats->last = now ?: 1;
+}
+EXPORT_SYMBOL_GPL(time_stats_update);
+
+#ifdef CONFIG_BCACHE_LATENCY_DEBUG
+unsigned latency_warn_ms;
+#endif
+
+#ifdef CONFIG_BCACHE_EDEBUG
+
+static void check_bio(struct bio *bio)
+{
+ unsigned i, size = 0;
+ struct bio_vec *bv;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ BUG_ON(!bio->bi_vcnt);
+ BUG_ON(!bio->bi_size);
+
+ bio_for_each_segment(bv, bio, i)
+ size += bv->bv_len;
+
+ BUG_ON(size != bio->bi_size);
+ BUG_ON(size > queue_max_sectors(q) << 9);
+
+ blk_recount_segments(q, bio);
+ BUG_ON(bio->bi_phys_segments > queue_max_segments(q));
+}
+
+#else /* EDEBUG */
+
+#define check_bio(bio) do {} while (0)
+
+#endif
+
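+/*
+ * Reinitialize a bio for reuse: reruns bio_init() but preserves the bio_vec
+ * array, bi_max_vecs and the destructor, and starts the refcount at 2.
+ */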
+void bio_reset(struct bio *bio)
+{
+ struct bio_vec *bv = bio->bi_io_vec;
+ unsigned max_vecs = bio->bi_max_vecs;
+ bio_destructor_t *destructor = bio->bi_destructor;
+
+ bio_init(bio);
+ atomic_set(&bio->bi_cnt, 2);
+ bio->bi_max_vecs = max_vecs;
+ bio->bi_io_vec = bv;
+ bio->bi_destructor = destructor;
+}
+EXPORT_SYMBOL_GPL(bio_reset);
+
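+/*
+ * Point the bio's inline bvecs at the (virtually contiguous) buffer @base,
+ * covering bio->bi_size bytes a page at a time. If @base is NULL only the
+ * lengths and offsets are filled in and the caller supplies the pages.
+ */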
+void bio_map(struct bio *bio, void *base)
+{
+ size_t size = bio->bi_size;
+ struct bio_vec *bv = bio->bi_inline_vecs;
+
+ BUG_ON(!bio->bi_size);
+ bio->bi_vcnt = 0;
+ bio->bi_io_vec = bv;
+
+ bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
+ goto start;
+
+ for (; size; bio->bi_vcnt++, bv++) {
+ bv->bv_offset = 0;
+start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
+ size);
+ if (base) {
+ bv->bv_page = is_vmalloc_addr(base)
+ ? vmalloc_to_page(base)
+ : virt_to_page(base);
+
+ base += bv->bv_len;
+ }
+
+ size -= bv->bv_len;
+ }
+}
+EXPORT_SYMBOL_GPL(bio_map);
+
+#undef bio_alloc_pages
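+/* Allocate a page for each bvec in the bio, undoing the allocations on failure */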
+int bio_alloc_pages(struct bio *bio, gfp_t gfp)
+{
+ int i;
+ struct bio_vec *bv;
+ bio_for_each_segment(bv, bio, i) {
+ bv->bv_page = alloc_page(gfp);
+ if (!bv->bv_page) {
+ while (bv-- != bio->bi_io_vec + bio->bi_idx)
+ __free_page(bv->bv_page);
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_pages);
+
+struct bio *bio_split_front(struct bio *bio, int sectors, bio_alloc_fn *_alloc,
+ gfp_t gfp, struct bio_set *bs)
+{
+ unsigned idx, vcnt = 0, nbytes = sectors << 9;
+ struct bio_vec *bv;
+ struct bio *ret = NULL;
+
+ struct bio *alloc(int n)
+ {
+ if (bs)
+ return bio_alloc_bioset(gfp, n, bs);
+ else if (_alloc)
+ return _alloc(gfp, n);
+ else
+ return bio_kmalloc(gfp, n);
+ }
+
+ if (current->bio_list)
+ bs = NULL;
+
+ BUG_ON(sectors <= 0);
+
+ if (nbytes >= bio->bi_size)
+ return bio;
+
+ bio_for_each_segment(bv, bio, idx) {
+ vcnt = idx - bio->bi_idx;
+
+ if (!nbytes) {
+ ret = alloc(0);
+ if (!ret)
+ return NULL;
+
+ ret->bi_io_vec = bio_iovec(bio);
+ ret->bi_flags |= 1 << BIO_CLONED;
+ break;
+ } else if (nbytes < bv->bv_len) {
+ ret = alloc(++vcnt);
+ if (!ret)
+ return NULL;
+
+ memcpy(ret->bi_io_vec, bio_iovec(bio),
+ sizeof(struct bio_vec) * vcnt);
+
+ ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
+ bv->bv_offset += nbytes;
+ bv->bv_len -= nbytes;
+ break;
+ }
+
+ nbytes -= bv->bv_len;
+ }
+
+ ret->bi_bdev = bio->bi_bdev;
+ ret->bi_sector = bio->bi_sector;
+ ret->bi_size = sectors << 9;
+ ret->bi_rw = bio->bi_rw;
+ ret->bi_vcnt = vcnt;
+ ret->bi_max_vecs = vcnt;
+ ret->bi_end_io = bio->bi_end_io;
+ ret->bi_private = bio->bi_private;
+
+ if (ret && ret != bio && bs) {
+ ret->bi_flags |= 1 << BIO_HAS_POOL;
+ ret->bi_destructor = (void *) bs;
+ }
+
+ bio->bi_sector += sectors;
+ bio->bi_size -= sectors << 9;
+ bio->bi_idx = idx;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bio_split_front);
+
+unsigned __bio_max_sectors(struct bio *bio, struct block_device *bdev,
+ sector_t sector)
+{
+ unsigned ret = bio_sectors(bio);
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct bio_vec *end = bio_iovec(bio) +
+ min_t(int, bio_segments(bio), queue_max_segments(q));
+
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bdev,
+ .bi_sector = sector,
+ .bi_size = 0,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (bio_segments(bio) > queue_max_segments(q) ||
+ q->merge_bvec_fn) {
+ ret = 0;
+
+ for (struct bio_vec *bv = bio_iovec(bio); bv < end; bv++) {
+ if (q->merge_bvec_fn &&
+ q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
+ break;
+
+ ret += bv->bv_len >> 9;
+ bvm.bi_size += bv->bv_len;
+ }
+ }
+
+ ret = min(ret, queue_max_sectors(q));
+
+ WARN_ON(!ret);
+ ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__bio_max_sectors);
+
+int bio_submit_split(struct bio *bio, atomic_t *i, struct bio_set *bs)
+{
+ struct bio *n;
+
+ do {
+ n = bio_split_front(bio, bio_max_sectors(bio),
+ NULL, GFP_NOIO, bs);
+ if (!n)
+ return -ENOMEM;
+ else if (n != bio)
+ atomic_inc(i);
+
+ check_bio(n);
+ generic_make_request(n);
+ } while (n != bio);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bio_submit_split);
+
+/*
+ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
+ * use permitted, subject to terms of PostgreSQL license; see.)
+
+ * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
+ * usual sort of implementation. (See Ross Williams' excellent introduction
+ * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
+ * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
+ * If we have no working 64-bit type, then fake it with two 32-bit registers.
+ *
+ * The present implementation is a normal (not "reflected", in Williams'
+ * terms) 64-bit CRC, using initial all-ones register contents and a final
+ * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
+ * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+*/
+
+static const uint64_t crc_table[256] = {
+ 0x0000000000000000, 0x42F0E1EBA9EA3693, 0x85E1C3D753D46D26,
+ 0xC711223CFA3E5BB5, 0x493366450E42ECDF, 0x0BC387AEA7A8DA4C,
+ 0xCCD2A5925D9681F9, 0x8E224479F47CB76A, 0x9266CC8A1C85D9BE,
+ 0xD0962D61B56FEF2D, 0x17870F5D4F51B498, 0x5577EEB6E6BB820B,
+ 0xDB55AACF12C73561, 0x99A54B24BB2D03F2, 0x5EB4691841135847,
+ 0x1C4488F3E8F96ED4, 0x663D78FF90E185EF, 0x24CD9914390BB37C,
+ 0xE3DCBB28C335E8C9, 0xA12C5AC36ADFDE5A, 0x2F0E1EBA9EA36930,
+ 0x6DFEFF5137495FA3, 0xAAEFDD6DCD770416, 0xE81F3C86649D3285,
+ 0xF45BB4758C645C51, 0xB6AB559E258E6AC2, 0x71BA77A2DFB03177,
+ 0x334A9649765A07E4, 0xBD68D2308226B08E, 0xFF9833DB2BCC861D,
+ 0x388911E7D1F2DDA8, 0x7A79F00C7818EB3B, 0xCC7AF1FF21C30BDE,
+ 0x8E8A101488293D4D, 0x499B3228721766F8, 0x0B6BD3C3DBFD506B,
+ 0x854997BA2F81E701, 0xC7B97651866BD192, 0x00A8546D7C558A27,
+ 0x4258B586D5BFBCB4, 0x5E1C3D753D46D260, 0x1CECDC9E94ACE4F3,
+ 0xDBFDFEA26E92BF46, 0x990D1F49C77889D5, 0x172F5B3033043EBF,
+ 0x55DFBADB9AEE082C, 0x92CE98E760D05399, 0xD03E790CC93A650A,
+ 0xAA478900B1228E31, 0xE8B768EB18C8B8A2, 0x2FA64AD7E2F6E317,
+ 0x6D56AB3C4B1CD584, 0xE374EF45BF6062EE, 0xA1840EAE168A547D,
+ 0x66952C92ECB40FC8, 0x2465CD79455E395B, 0x3821458AADA7578F,
+ 0x7AD1A461044D611C, 0xBDC0865DFE733AA9, 0xFF3067B657990C3A,
+ 0x711223CFA3E5BB50, 0x33E2C2240A0F8DC3, 0xF4F3E018F031D676,
+ 0xB60301F359DBE0E5, 0xDA050215EA6C212F, 0x98F5E3FE438617BC,
+ 0x5FE4C1C2B9B84C09, 0x1D14202910527A9A, 0x93366450E42ECDF0,
+ 0xD1C685BB4DC4FB63, 0x16D7A787B7FAA0D6, 0x5427466C1E109645,
+ 0x4863CE9FF6E9F891, 0x0A932F745F03CE02, 0xCD820D48A53D95B7,
+ 0x8F72ECA30CD7A324, 0x0150A8DAF8AB144E, 0x43A04931514122DD,
+ 0x84B16B0DAB7F7968, 0xC6418AE602954FFB, 0xBC387AEA7A8DA4C0,
+ 0xFEC89B01D3679253, 0x39D9B93D2959C9E6, 0x7B2958D680B3FF75,
+ 0xF50B1CAF74CF481F, 0xB7FBFD44DD257E8C, 0x70EADF78271B2539,
+ 0x321A3E938EF113AA, 0x2E5EB66066087D7E, 0x6CAE578BCFE24BED,
+ 0xABBF75B735DC1058, 0xE94F945C9C3626CB, 0x676DD025684A91A1,
+ 0x259D31CEC1A0A732, 0xE28C13F23B9EFC87, 0xA07CF2199274CA14,
+ 0x167FF3EACBAF2AF1, 0x548F120162451C62, 0x939E303D987B47D7,
+ 0xD16ED1D631917144, 0x5F4C95AFC5EDC62E, 0x1DBC74446C07F0BD,
+ 0xDAAD56789639AB08, 0x985DB7933FD39D9B, 0x84193F60D72AF34F,
+ 0xC6E9DE8B7EC0C5DC, 0x01F8FCB784FE9E69, 0x43081D5C2D14A8FA,
+ 0xCD2A5925D9681F90, 0x8FDAB8CE70822903, 0x48CB9AF28ABC72B6,
+ 0x0A3B7B1923564425, 0x70428B155B4EAF1E, 0x32B26AFEF2A4998D,
+ 0xF5A348C2089AC238, 0xB753A929A170F4AB, 0x3971ED50550C43C1,
+ 0x7B810CBBFCE67552, 0xBC902E8706D82EE7, 0xFE60CF6CAF321874,
+ 0xE224479F47CB76A0, 0xA0D4A674EE214033, 0x67C58448141F1B86,
+ 0x253565A3BDF52D15, 0xAB1721DA49899A7F, 0xE9E7C031E063ACEC,
+ 0x2EF6E20D1A5DF759, 0x6C0603E6B3B7C1CA, 0xF6FAE5C07D3274CD,
+ 0xB40A042BD4D8425E, 0x731B26172EE619EB, 0x31EBC7FC870C2F78,
+ 0xBFC9838573709812, 0xFD39626EDA9AAE81, 0x3A28405220A4F534,
+ 0x78D8A1B9894EC3A7, 0x649C294A61B7AD73, 0x266CC8A1C85D9BE0,
+ 0xE17DEA9D3263C055, 0xA38D0B769B89F6C6, 0x2DAF4F0F6FF541AC,
+ 0x6F5FAEE4C61F773F, 0xA84E8CD83C212C8A, 0xEABE6D3395CB1A19,
+ 0x90C79D3FEDD3F122, 0xD2377CD44439C7B1, 0x15265EE8BE079C04,
+ 0x57D6BF0317EDAA97, 0xD9F4FB7AE3911DFD, 0x9B041A914A7B2B6E,
+ 0x5C1538ADB04570DB, 0x1EE5D94619AF4648, 0x02A151B5F156289C,
+ 0x4051B05E58BC1E0F, 0x87409262A28245BA, 0xC5B073890B687329,
+ 0x4B9237F0FF14C443, 0x0962D61B56FEF2D0, 0xCE73F427ACC0A965,
+ 0x8C8315CC052A9FF6, 0x3A80143F5CF17F13, 0x7870F5D4F51B4980,
+ 0xBF61D7E80F251235, 0xFD913603A6CF24A6, 0x73B3727A52B393CC,
+ 0x31439391FB59A55F, 0xF652B1AD0167FEEA, 0xB4A25046A88DC879,
+ 0xA8E6D8B54074A6AD, 0xEA16395EE99E903E, 0x2D071B6213A0CB8B,
+ 0x6FF7FA89BA4AFD18, 0xE1D5BEF04E364A72, 0xA3255F1BE7DC7CE1,
+ 0x64347D271DE22754, 0x26C49CCCB40811C7, 0x5CBD6CC0CC10FAFC,
+ 0x1E4D8D2B65FACC6F, 0xD95CAF179FC497DA, 0x9BAC4EFC362EA149,
+ 0x158E0A85C2521623, 0x577EEB6E6BB820B0, 0x906FC95291867B05,
+ 0xD29F28B9386C4D96, 0xCEDBA04AD0952342, 0x8C2B41A1797F15D1,
+ 0x4B3A639D83414E64, 0x09CA82762AAB78F7, 0x87E8C60FDED7CF9D,
+ 0xC51827E4773DF90E, 0x020905D88D03A2BB, 0x40F9E43324E99428,
+ 0x2CFFE7D5975E55E2, 0x6E0F063E3EB46371, 0xA91E2402C48A38C4,
+ 0xEBEEC5E96D600E57, 0x65CC8190991CB93D, 0x273C607B30F68FAE,
+ 0xE02D4247CAC8D41B, 0xA2DDA3AC6322E288, 0xBE992B5F8BDB8C5C,
+ 0xFC69CAB42231BACF, 0x3B78E888D80FE17A, 0x7988096371E5D7E9,
+ 0xF7AA4D1A85996083, 0xB55AACF12C735610, 0x724B8ECDD64D0DA5,
+ 0x30BB6F267FA73B36, 0x4AC29F2A07BFD00D, 0x08327EC1AE55E69E,
+ 0xCF235CFD546BBD2B, 0x8DD3BD16FD818BB8, 0x03F1F96F09FD3CD2,
+ 0x41011884A0170A41, 0x86103AB85A2951F4, 0xC4E0DB53F3C36767,
+ 0xD8A453A01B3A09B3, 0x9A54B24BB2D03F20, 0x5D45907748EE6495,
+ 0x1FB5719CE1045206, 0x919735E51578E56C, 0xD367D40EBC92D3FF,
+ 0x1476F63246AC884A, 0x568617D9EF46BED9, 0xE085162AB69D5E3C,
+ 0xA275F7C11F7768AF, 0x6564D5FDE549331A, 0x279434164CA30589,
+ 0xA9B6706FB8DFB2E3, 0xEB46918411358470, 0x2C57B3B8EB0BDFC5,
+ 0x6EA7525342E1E956, 0x72E3DAA0AA188782, 0x30133B4B03F2B111,
+ 0xF7021977F9CCEAA4, 0xB5F2F89C5026DC37, 0x3BD0BCE5A45A6B5D,
+ 0x79205D0E0DB05DCE, 0xBE317F32F78E067B, 0xFCC19ED95E6430E8,
+ 0x86B86ED5267CDBD3, 0xC4488F3E8F96ED40, 0x0359AD0275A8B6F5,
+ 0x41A94CE9DC428066, 0xCF8B0890283E370C, 0x8D7BE97B81D4019F,
+ 0x4A6ACB477BEA5A2A, 0x089A2AACD2006CB9, 0x14DEA25F3AF9026D,
+ 0x562E43B4931334FE, 0x913F6188692D6F4B, 0xD3CF8063C0C759D8,
+ 0x5DEDC41A34BBEEB2, 0x1F1D25F19D51D821, 0xD80C07CD676F8394,
+ 0x9AFCE626CE85B507
+};
+
+uint64_t crc64_update(uint64_t crc, const void *_data, size_t len)
+{
+ const unsigned char *data = _data;
+
+ while (len--) {
+ int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
+ crc = crc_table[i] ^ (crc << 8);
+ }
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_update);
+
+uint64_t crc64(const void *data, size_t len)
+{
+ uint64_t crc = 0xffffffffffffffff;
+
+ crc = crc64_update(crc, data, len);
+
+ return crc ^ 0xffffffffffffffff;
+}
+EXPORT_SYMBOL(crc64);
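+
+/*
+ * Usage sketch (illustrative only; buf/len stand for any buffer): the two
+ * entry points give the same result, since crc64() just wraps crc64_update()
+ * with the initial and final inversion:
+ *
+ *	uint64_t a = crc64(buf, len);
+ *	uint64_t b = ~crc64_update(~0ULL, buf, len);	// b == a
+ */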
+
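+/*
+ * Branch-free population count (the classic SWAR method): each step sums
+ * neighbouring bit fields in parallel - 1-bit, then 2-bit, then 4-bit groups -
+ * and the multiply by 0x0101... adds the per-byte counts into the top byte.
+ * e.g. popcount_64(0xf0f0) returns 8.
+ */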
+unsigned popcount_64(uint64_t x)
+{
+ static const uint64_t m1 = 0x5555555555555555LLU;
+ static const uint64_t m2 = 0x3333333333333333LLU;
+ static const uint64_t m4 = 0x0f0f0f0f0f0f0f0fLLU;
+ static const uint64_t h01 = 0x0101010101010101LLU;
+
+ x -= (x >> 1) & m1;
+ x = (x & m2) + ((x >> 2) & m2);
+ x = (x + (x >> 4)) & m4;
+ return (x * h01) >> 56;
+}
+EXPORT_SYMBOL(popcount_64);
+
+unsigned popcount_32(uint32_t x)
+{
+ static const uint32_t m1 = 0x55555555;
+ static const uint32_t m2 = 0x33333333;
+ static const uint32_t m4 = 0x0f0f0f0f;
+ static const uint32_t h01 = 0x01010101;
+
+ x -= (x >> 1) & m1;
+ x = (x & m2) + ((x >> 2) & m2);
+ x = (x + (x >> 4)) & m4;
+ return (x * h01) >> 24;
+}
+EXPORT_SYMBOL(popcount_32);
diff --git a/drivers/block/bcache/util.h b/drivers/block/bcache/util.h
new file mode 100644
index 0000000..0c14cd7
--- /dev/null
+++ b/drivers/block/bcache/util.h
@@ -0,0 +1,657 @@
+
+#ifndef _BCACHE_UTIL_H
+#define _BCACHE_UTIL_H
+
+#include <linux/closure.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/llist.h>
+#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#ifndef USHRT_MAX
+#define USHRT_MAX ((u16)(~0U))
+#define SHRT_MAX ((s16)(USHRT_MAX>>1))
+#endif
+
+#ifndef REQ_WRITE
+
+#define REQ_WRITE WRITE
+#define REQ_UNPLUG (1U << BIO_RW_UNPLUG)
+#define REQ_SYNC ((1U << BIO_RW_SYNCIO)|REQ_UNPLUG)
+#define REQ_META (1U << BIO_RW_META)
+#define REQ_RAHEAD (1U << BIO_RW_AHEAD)
+#define REQ_FLUSH (1U << BIO_RW_BARRIER)
+
+#define console_lock() acquire_console_sem()
+#define console_unlock() release_console_sem()
+
+#define blkdev_put(...) close_bdev_exclusive(__VA_ARGS__)
+#define blkdev_get_by_path(...) open_bdev_exclusive(__VA_ARGS__)
+
+#else
+
+#define REQ_UNPLUG 0U
+#define BIO_RW_DISCARD __REQ_DISCARD
+#define current_is_writer(x) true
+
+#endif
+
+extern struct workqueue_struct *system_wq;
+
+#define PAGE_SECTORS (PAGE_SIZE / 512)
+
+struct closure;
+
+#include <trace/events/bcache.h>
+
+#ifdef CONFIG_BCACHE_EDEBUG
+
+#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
+#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
+
+#else /* EDEBUG */
+
+#define atomic_dec_bug(v) atomic_dec(v)
+#define atomic_inc_bug(v, i) atomic_inc(v)
+
+#endif
+
+#define BITMASK(name, type, field, offset, size) \
+static inline uint64_t name(const type *k) \
+{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \
+ \
+static inline void SET_##name(type *k, uint64_t v) \
+{ \
+ k->field &= ~(~((uint64_t) ~0 << size) << offset); \
+ k->field |= v << offset; \
+}
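+
+/*
+ * e.g. BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1) defines
+ * UUID_FLASH_ONLY(u) and SET_UUID_FLASH_ONLY(u, v) accessors for a 1 bit
+ * field at bit 0 of u->flags (that instance lives in super.c).
+ */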
+
+#define DECLARE_HEAP(type, name) \
+ struct { \
+ size_t size, used; \
+ type *data; \
+ } name
+
+#define init_heap(heap, _size, gfp) \
+({ \
+ size_t _bytes; \
+ (heap)->used = 0; \
+ (heap)->size = (_size); \
+ _bytes = (heap)->size * sizeof(*(heap)->data); \
+ (heap)->data = NULL; \
+ if (_bytes < KMALLOC_MAX_SIZE) \
+ (heap)->data = kmalloc(_bytes, (gfp)); \
+ if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \
+ (heap)->data = vmalloc(_bytes); \
+ (heap)->data; \
+})
+
+#define free_heap(heap) \
+do { \
+ if (is_vmalloc_addr((heap)->data)) \
+ vfree((heap)->data); \
+ else \
+ kfree((heap)->data); \
+ (heap)->data = NULL; \
+} while (0)
+
+#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
+
+#define heap_sift(h, i, cmp) \
+do { \
+ size_t _r, _j = i; \
+ \
+ for (; _j * 2 + 1 < (h)->used; _j = _r) { \
+ _r = _j * 2 + 1; \
+ if (_r + 1 < (h)->used && \
+ cmp((h)->data[_r], (h)->data[_r + 1])) \
+ _r++; \
+ \
+ if (cmp((h)->data[_r], (h)->data[_j])) \
+ break; \
+ heap_swap(h, _r, _j); \
+ } \
+} while (0)
+
+#define heap_sift_down(h, i, cmp) \
+do { \
+ while (i) { \
+ size_t p = (i - 1) / 2; \
+ if (cmp((h)->data[i], (h)->data[p])) \
+ break; \
+ heap_swap(h, i, p); \
+ i = p; \
+ } \
+} while (0)
+
+#define heap_add(h, d, cmp) \
+({ \
+ bool _r = !heap_full(h); \
+ if (_r) { \
+ size_t _i = (h)->used++; \
+ (h)->data[_i] = d; \
+ \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_pop(h, d, cmp) \
+({ \
+ bool _r = (h)->used; \
+ if (_r) { \
+ (d) = (h)->data[0]; \
+ (h)->used--; \
+ heap_swap(h, 0, (h)->used); \
+ heap_sift(h, 0, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL)
+
+#define heap_full(h) ((h)->used == (h)->size)
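+
+/*
+ * Usage sketch (illustrative only, names made up). The comparator is called
+ * on two elements; the sift code above maintains cmp(child, parent) on every
+ * edge, so e.g. cmp = '>' keeps the smallest element at data[0]:
+ *
+ *	#define min_cmp(l, r)	((l) > (r))
+ *	DECLARE_HEAP(int, heap);
+ *	int i;
+ *
+ *	init_heap(&heap, 128, GFP_KERNEL);
+ *	heap_add(&heap, 42, min_cmp);
+ *	heap_pop(&heap, i, min_cmp);	// i == 42
+ *	free_heap(&heap);
+ */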
+
+#define DECLARE_FIFO(type, name) \
+ struct { \
+ size_t front, back, size, mask; \
+ type *data; \
+ } name
+
+#define fifo_for_each(c, fifo) \
+ for (size_t _i = (fifo)->front; \
+ c = (fifo)->data[_i], _i != (fifo)->back; \
+ _i = (_i + 1) & (fifo)->mask)
+
+#define __init_fifo(fifo, gfp) \
+({ \
+ size_t _allocated_size, _bytes; \
+ BUG_ON(!(fifo)->size); \
+ \
+ _allocated_size = roundup_pow_of_two((fifo)->size + 1); \
+ _bytes = _allocated_size * sizeof(*(fifo)->data); \
+ \
+ (fifo)->mask = _allocated_size - 1; \
+ (fifo)->front = (fifo)->back = 0; \
+ (fifo)->data = NULL; \
+ \
+ if (_bytes < KMALLOC_MAX_SIZE) \
+ (fifo)->data = kmalloc(_bytes, (gfp)); \
+ if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \
+ (fifo)->data = vmalloc(_bytes); \
+ (fifo)->data; \
+})
+
+#define init_fifo_exact(fifo, _size, gfp) \
+({ \
+ (fifo)->size = (_size); \
+ __init_fifo(fifo, gfp); \
+})
+
+#define init_fifo(fifo, _size, gfp) \
+({ \
+ (fifo)->size = (_size); \
+ if ((fifo)->size > 4) \
+ (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \
+ __init_fifo(fifo, gfp); \
+})
+
+#define free_fifo(fifo) \
+do { \
+ if (is_vmalloc_addr((fifo)->data)) \
+ vfree((fifo)->data); \
+ else \
+ kfree((fifo)->data); \
+ (fifo)->data = NULL; \
+} while (0)
+
+#define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask)
+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo) (!fifo_used(fifo))
+#define fifo_full(fifo) (!fifo_free(fifo))
+
+#define fifo_front(fifo) ((fifo)->data[(fifo)->front])
+#define fifo_back(fifo) \
+ ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask)
+
+#define fifo_push_back(fifo, i) \
+({ \
+ bool _r = !fifo_full((fifo)); \
+ if (_r) { \
+ (fifo)->data[(fifo)->back++] = (i); \
+ (fifo)->back &= (fifo)->mask; \
+ } \
+ _r; \
+})
+
+#define fifo_pop_front(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) { \
+ (i) = (fifo)->data[(fifo)->front++]; \
+ (fifo)->front &= (fifo)->mask; \
+ } \
+ _r; \
+})
+
+#define fifo_push_front(fifo, i) \
+({ \
+ bool _r = !fifo_full((fifo)); \
+ if (_r) { \
+ --(fifo)->front; \
+ (fifo)->front &= (fifo)->mask; \
+ (fifo)->data[(fifo)->front] = (i); \
+ } \
+ _r; \
+})
+
+#define fifo_pop_back(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) { \
+ --(fifo)->back; \
+ (fifo)->back &= (fifo)->mask; \
+ (i) = (fifo)->data[(fifo)->back]; \
+ } \
+ _r; \
+})
+
+#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
+
+#define fifo_swap(l, r) \
+do { \
+ swap((l)->front, (r)->front); \
+ swap((l)->back, (r)->back); \
+ swap((l)->size, (r)->size); \
+ swap((l)->mask, (r)->mask); \
+ swap((l)->data, (r)->data); \
+} while (0)
+
+#define fifo_move(dest, src) \
+do { \
+ typeof(*((dest)->data)) _t; \
+ while (!fifo_full(dest) && \
+ fifo_pop(src, _t)) \
+ fifo_push(dest, _t); \
+} while (0)
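+
+/*
+ * Usage sketch (illustrative only, names made up). Note init_fifo() adjusts
+ * the requested size to a power of two minus one:
+ *
+ *	DECLARE_FIFO(int, q);
+ *	int i;
+ *
+ *	init_fifo(&q, 8, GFP_KERNEL);
+ *	fifo_push(&q, 1);
+ *	fifo_push(&q, 2);
+ *	while (fifo_pop(&q, i))
+ *		;			// pops 1, then 2
+ *	free_fifo(&q);
+ */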
+
+/*
+ * Simple array based allocator - preallocates a number of elements and you can
+ * never allocate more than that, also has no locking.
+ *
+ * Handy because if you know you only need a fixed number of elements you don't
+ * have to worry about memory allocation failure, and sometimes a mempool isn't
+ * what you want.
+ *
+ * We treat the free elements as entries in a singly linked list, and the
+ * freelist as a stack - allocating and freeing push and pop off the freelist.
+ */
+
+#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \
+ struct { \
+ type *freelist; \
+ type data[size]; \
+ } name
+
+#define array_alloc(array) \
+({ \
+ typeof((array)->freelist) _ret = (array)->freelist; \
+ \
+ if (_ret) \
+ (array)->freelist = *((typeof((array)->freelist) *) _ret);\
+ \
+ _ret; \
+})
+
+#define array_free(array, ptr) \
+do { \
+ typeof((array)->freelist) _ptr = ptr; \
+ \
+ *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \
+ (array)->freelist = _ptr; \
+} while (0)
+
+#define array_allocator_init(array) \
+do { \
+ BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \
+ (array)->freelist = NULL; \
+ \
+ for (typeof((array)->freelist) _i = (array)->data; \
+ _i < (array)->data + ARRAY_SIZE((array)->data); \
+ _i++) \
+ array_free(array, _i); \
+} while (0)
+
+#define array_freelist_empty(array) ((array)->freelist == NULL)
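+
+/*
+ * Usage sketch (illustrative; struct io_entry is a made-up element type that
+ * just needs to be at least pointer sized):
+ *
+ *	static DECLARE_ARRAY_ALLOCATOR(struct io_entry, io_pool, 64);
+ *
+ *	array_allocator_init(&io_pool);
+ *
+ *	struct io_entry *e = array_alloc(&io_pool);	// NULL once all 64 are in use
+ *	if (e) {
+ *		...
+ *		array_free(&io_pool, e);
+ *	}
+ */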
+
+#define ANYSINT_MAX(t) \
+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+int strtoint_h(const char *, int *);
+int strtouint_h(const char *, unsigned int *);
+int strtoll_h(const char *, long long *);
+int strtoull_h(const char *, unsigned long long *);
+
+static inline int strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return strtoint_h(cp, (int *) res);
+#else
+ return strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int strtoul_h(const char *cp, unsigned long *res)
+{
+#if BITS_PER_LONG == 32
+ return strtouint_h(cp, (unsigned int *) res);
+#else
+ return strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res) \
+ (__builtin_types_compatible_p(typeof(*res), int) \
+ ? strtoint_h(cp, (void *) res) \
+ :__builtin_types_compatible_p(typeof(*res), long) \
+ ? strtol_h(cp, (void *) res) \
+ : __builtin_types_compatible_p(typeof(*res), long long) \
+ ? strtoll_h(cp, (void *) res) \
+ : __builtin_types_compatible_p(typeof(*res), unsigned int) \
+ ? strtouint_h(cp, (void *) res) \
+ : __builtin_types_compatible_p(typeof(*res), unsigned long) \
+ ? strtoul_h(cp, (void *) res) \
+ : __builtin_types_compatible_p(typeof(*res), unsigned long long)\
+ ? strtoull_h(cp, (void *) res) : -EINVAL)
+
+#define strtoul_safe(cp, var) \
+({ \
+ unsigned long _v; \
+ int _r = strict_strtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = _v; \
+ _r; \
+})
+
+#define strtoul_safe_clamp(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = strict_strtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = clamp_t(typeof(var), _v, min, max); \
+ _r; \
+})
+
+#define snprint(buf, size, var) \
+ snprintf(buf, size, \
+ __builtin_types_compatible_p(typeof(var), int) \
+ ? "%i\n" : \
+ __builtin_types_compatible_p(typeof(var), unsigned) \
+ ? "%u\n" : \
+ __builtin_types_compatible_p(typeof(var), long) \
+ ? "%li\n" : \
+ __builtin_types_compatible_p(typeof(var), unsigned long)\
+ ? "%lu\n" : \
+ __builtin_types_compatible_p(typeof(var), int64_t) \
+ ? "%lli\n" : \
+ __builtin_types_compatible_p(typeof(var), uint64_t) \
+ ? "%llu\n" : \
+ __builtin_types_compatible_p(typeof(var), const char *) \
+ ? "%s\n" : "%i\n", var)
+
+ssize_t hprint(char *buf, int64_t v);
+
+bool is_zero(const char *p, size_t n);
+int parse_uuid(const char *s, char *uuid);
+
+ssize_t sprint_string_list(char *buf, const char * const list[],
+ size_t selected);
+
+ssize_t read_string_list(const char *buf, const char * const list[]);
+
+struct time_stats {
+ /*
+ * all fields are in nanoseconds, averages are ewmas stored left shifted
+ * by 8
+ */
+ uint64_t max_duration;
+ uint64_t average_duration;
+ uint64_t average_frequency;
+ uint64_t last;
+};
+
+void time_stats_update(struct time_stats *stats, uint64_t time);
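+
+/*
+ * Typical call pattern (sketch; do_io() is a made-up operation): record the
+ * start time with local_clock() and hand it back when the operation finishes;
+ * the ewmas above are updated from the measured duration and from the gap
+ * since the previous call.
+ *
+ *	struct time_stats stats = { 0 };
+ *	uint64_t start = local_clock();
+ *
+ *	do_io();
+ *	time_stats_update(&stats, start);
+ */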
+
+#define NSEC_PER_ns 1L
+#define NSEC_PER_us NSEC_PER_USEC
+#define NSEC_PER_ms NSEC_PER_MSEC
+#define NSEC_PER_sec NSEC_PER_SEC
+
+#define __print_time_stat(stats, name, stat, units) \
+ sysfs_print(name ## _ ## stat ## _ ## units, \
+ div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
+
+#define sysfs_print_time_stats(stats, name, \
+ frequency_units, \
+ duration_units) \
+do { \
+ __print_time_stat(stats, name, \
+ average_frequency, frequency_units); \
+ __print_time_stat(stats, name, \
+ average_duration, duration_units); \
+ __print_time_stat(stats, name, \
+ max_duration, duration_units); \
+ \
+ sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
+ ? div_s64(local_clock() - (stats)->last, \
+ NSEC_PER_ ## frequency_units) \
+ : -1LL); \
+} while (0)
+
+#define sysfs_time_stats_attribute(name, \
+ frequency_units, \
+ duration_units) \
+read_attribute(name ## _average_frequency_ ## frequency_units); \
+read_attribute(name ## _average_duration_ ## duration_units); \
+read_attribute(name ## _max_duration_ ## duration_units); \
+read_attribute(name ## _last_ ## frequency_units)
+
+#define sysfs_time_stats_attribute_list(name, \
+ frequency_units, \
+ duration_units) \
+&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
+&sysfs_ ## name ## _average_duration_ ## duration_units, \
+&sysfs_ ## name ## _max_duration_ ## duration_units, \
+&sysfs_ ## name ## _last_ ## frequency_units,
+
+#define ewma_add(ewma, val, weight, factor) \
+({ \
+ (ewma) *= (weight) - 1; \
+ (ewma) += (val) << (factor); \
+ (ewma) /= (weight); \
+ (ewma) >> (factor); \
+})
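+
+/*
+ * ewma_add() keeps the average as a fixed point number, left shifted by
+ * 'factor' bits. e.g. with weight 8 and factor 0, ewma_add(e, x, 8, 0)
+ * computes e = (e * 7 + x) / 8, so each new sample carries 1/8 of the weight
+ * and old samples decay by 7/8 per update.
+ */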
+
+#define __DIV_SAFE(n, d, zero) \
+({ \
+ typeof(n) _n = (n); \
+ typeof(d) _d = (d); \
+ _d ? _n / _d : zero; \
+})
+
+#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
+
+#define container_of_or_null(ptr, type, member) \
+({ \
+ typeof(ptr) _ptr = ptr; \
+ _ptr ? container_of(_ptr, type, member) : NULL; \
+})
+
+#define RB_INSERT(root, new, member, cmp) \
+({ \
+ __label__ dup; \
+ struct rb_node **n = &(root)->rb_node, *parent = NULL; \
+ typeof(new) this; \
+ int res, ret = -1; \
+ \
+ while (*n) { \
+ parent = *n; \
+ this = container_of(*n, typeof(*(new)), member); \
+ res = cmp(new, this); \
+ if (!res) \
+ goto dup; \
+ n = res < 0 \
+ ? &(*n)->rb_left \
+ : &(*n)->rb_right; \
+ } \
+ \
+ rb_link_node(&(new)->member, parent, n); \
+ rb_insert_color(&(new)->member, root); \
+ ret = 0; \
+dup: \
+ ret; \
+})
+
+#define RB_SEARCH(root, search, member, cmp) \
+({ \
+ struct rb_node *n = (root)->rb_node; \
+ typeof(&(search)) this, ret = NULL; \
+ int res; \
+ \
+ while (n) { \
+ this = container_of(n, typeof(search), member); \
+ res = cmp(&(search), this); \
+ if (!res) { \
+ ret = this; \
+ break; \
+ } \
+ n = res < 0 \
+ ? n->rb_left \
+ : n->rb_right; \
+ } \
+ ret; \
+})
+
+#define RB_GREATER(root, search, member, cmp) \
+({ \
+ struct rb_node *n = (root)->rb_node; \
+ typeof(&(search)) this, ret = NULL; \
+ int res; \
+ \
+ while (n) { \
+ this = container_of(n, typeof(search), member); \
+ res = cmp(&(search), this); \
+ if (res < 0) { \
+ ret = this; \
+ n = n->rb_left; \
+ } else \
+ n = n->rb_right; \
+ } \
+ ret; \
+})
+
+#define RB_FIRST(root, type, member) \
+ container_of_or_null(rb_first(root), type, member)
+
+#define RB_LAST(root, type, member) \
+ container_of_or_null(rb_last(root), type, member)
+
+#define RB_NEXT(ptr, member) \
+ container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
+
+#define RB_PREV(ptr, member) \
+ container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
+
+/* Does linear interpolation between powers of two */
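+/*
+ * e.g. with fract_bits = 3 (a made-up value), fract_exp_two(20, 3) reads 20
+ * as exponent 2 plus fraction 4/8 and returns 4 + (4 * 4 / 8) = 6, halfway
+ * between 2^2 = 4 and 2^3 = 8.
+ */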
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+ unsigned fract = x & ~(~0 << fract_bits);
+
+ x >>= fract_bits;
+ x = 1 << x;
+ x += (x * fract) >> fract_bits;
+
+ return x;
+}
+
+#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
+
+void bio_reset(struct bio *bio);
+void bio_map(struct bio *bio, void *base);
+
+typedef struct bio *(bio_alloc_fn)(gfp_t, int);
+
+struct bio *bio_split_front(struct bio *, int, bio_alloc_fn *,
+ gfp_t, struct bio_set *);
+
+int bio_submit_split(struct bio *bio, atomic_t *i, struct bio_set *bs);
+unsigned __bio_max_sectors(struct bio *bio, struct block_device *bdev,
+ sector_t sector);
+
+int bio_alloc_pages(struct bio *bio, gfp_t gfp);
+
+#define bio_alloc_pages(...) \
+ (dynamic_fault() ? -ENOMEM : bio_alloc_pages(__VA_ARGS__))
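+
+/*
+ * The wrapper above presumably lets the dynamic fault injection code force
+ * -ENOMEM at this allocation site; util.c does #undef bio_alloc_pages before
+ * defining the real function so the macro doesn't expand there.
+ */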
+
+static inline unsigned bio_max_sectors(struct bio *bio)
+{
+ return __bio_max_sectors(bio, bio->bi_bdev, bio->bi_sector);
+}
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+ return bdev->bd_inode->i_size >> 9;
+}
+
+#ifdef CONFIG_BCACHE_LATENCY_DEBUG
+extern unsigned latency_warn_ms;
+
+#define latency_ms(j) jiffies_to_msecs(jiffies - (j))
+
+#define pr_latency(j, fmt, ...) \
+do { \
+ int _ms = latency_ms(j); \
+ if (j && latency_warn_ms && (_ms) > (int) latency_warn_ms) \
+ printk_ratelimited(KERN_DEBUG "bcache: %i ms latency " \
+ "called from %pf for " fmt "\n", _ms, \
+ __builtin_return_address(0), ##__VA_ARGS__); \
+} while (0)
+
+#define set_wait(f) ((f)->wait_time = jiffies)
+
+#else
+#define latency_ms(j) (0)
+#define pr_latency(...) do {} while (0)
+#define set_wait(j) do {} while (0)
+#endif
+
+#define closure_bio_submit_put(bio, cl, bs) \
+ bio_submit_split(bio, &(__to_internal_closure(cl))->remaining, bs)
+
+static inline int closure_bio_submit(struct bio *bio, struct closure *cl,
+ struct bio_set *bs)
+{
+ int ret;
+
+ closure_get(cl);
+ ret = closure_bio_submit_put(bio, cl, bs);
+ if (ret)
+ closure_put(cl);
+
+ return ret;
+}
+
+uint64_t crc64_update(uint64_t, const void *, size_t);
+uint64_t crc64(const void *, size_t);
+
+unsigned popcount_64(uint64_t);
+unsigned popcount_32(uint32_t);
+
+#endif /* _BCACHE_UTIL_H */
--
1.7.9.rc2
^ permalink raw reply related [flat|nested] 87+ messages in thread
* [Bcache v13 10/16] bcache: Superblock/initialization/sysfs code
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (4 preceding siblings ...)
2012-05-10 3:10 ` [Bcache v13 09/16] Bcache: generic utility code Kent Overstreet
@ 2012-05-10 3:10 ` Kent Overstreet
2012-05-10 3:10 ` [Bcache v13 11/16] bcache: Core btree code Kent Overstreet
` (7 subsequent siblings)
13 siblings, 0 replies; 87+ messages in thread
From: Kent Overstreet @ 2012-05-10 3:10 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/block/bcache/stats.c | 243 +++++
drivers/block/bcache/stats.h | 58 ++
drivers/block/bcache/super.c | 2000 ++++++++++++++++++++++++++++++++++++++++++
drivers/block/bcache/sysfs.c | 802 +++++++++++++++++
drivers/block/bcache/sysfs.h | 99 +++
5 files changed, 3202 insertions(+), 0 deletions(-)
create mode 100644 drivers/block/bcache/stats.c
create mode 100644 drivers/block/bcache/stats.h
create mode 100644 drivers/block/bcache/super.c
create mode 100644 drivers/block/bcache/sysfs.c
create mode 100644 drivers/block/bcache/sysfs.h
diff --git a/drivers/block/bcache/stats.c b/drivers/block/bcache/stats.c
new file mode 100644
index 0000000..9019844
--- /dev/null
+++ b/drivers/block/bcache/stats.c
@@ -0,0 +1,243 @@
+#include "bcache.h"
+#include "stats.h"
+#include "btree.h"
+#include "request.h"
+#include "sysfs.h"
+
+/* We keep absolute totals of various statistics, and additionally a set of
+ * three rolling averages.
+ *
+ * Every so often, a timer goes off and rescales the rolling averages.
+ * DAY_RESCALE, HOUR_RESCALE and FIVE_MINUTE_RESCALE are how many times the
+ * timer has to go off before we rescale each set of numbers; that gets us
+ * half lives of 5 minutes, one hour, and one day.
+ *
+ * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
+ * and accounting_weight is what we use to rescale:
+ *
+ * pow(31 / 32, 22) ~= 1/2
+ *
+ * So that we don't have to increment each set of numbers every time we (say)
+ * get a cache hit, we increment a single atomic_t in acc->collector, and when
+ * the rescale function runs it resets the atomic counter to 0 and adds its
+ * old value to each of the exported numbers.
+ *
+ * To reduce rounding error, the numbers in struct cache_stats are all
+ * stored left shifted by 16, and scaled back in the sysfs show() function.
+ */
+
+static const unsigned DAY_RESCALE = 288;
+static const unsigned HOUR_RESCALE = 12;
+static const unsigned FIVE_MINUTE_RESCALE = 1;
+static const unsigned accounting_delay = (HZ * 300) / 22;
+static const unsigned accounting_weight = 32;
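+
+/*
+ * Working through the numbers: the timer fires every 300/22 seconds, so 22
+ * fires take five minutes and (31/32)^22 ~= 1/2 gives the five minute half
+ * life. Rescaling only every 12 fires (an hour is 12 five-minute periods) or
+ * every 288 fires (a day is 288 five-minute periods) stretches the same decay
+ * out to one hour and one day.
+ */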
+
+/* sysfs reading/writing */
+
+read_attribute(cache_hits);
+read_attribute(cache_misses);
+read_attribute(cache_bypass_hits);
+read_attribute(cache_bypass_misses);
+read_attribute(cache_hit_ratio);
+read_attribute(cache_readaheads);
+read_attribute(cache_miss_collisions);
+read_attribute(bypassed);
+
+static struct attribute *accounting_files[] = {
+ &sysfs_cache_hits,
+ &sysfs_cache_misses,
+ &sysfs_cache_bypass_hits,
+ &sysfs_cache_bypass_misses,
+ &sysfs_cache_hit_ratio,
+ &sysfs_cache_readaheads,
+ &sysfs_cache_miss_collisions,
+ &sysfs_bypassed,
+ NULL
+};
+
+ssize_t cache_stats_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct cache_stats *s =
+ container_of(kobj, struct cache_stats, kobj);
+#define var(stat) (s->stat >> 16)
+ var_print(cache_hits);
+ var_print(cache_misses);
+ var_print(cache_bypass_hits);
+ var_print(cache_bypass_misses);
+
+ sysfs_print(cache_hit_ratio,
+ DIV_SAFE(var(cache_hits) * 100,
+ var(cache_hits) + var(cache_misses)));
+
+ var_print(cache_readaheads);
+ var_print(cache_miss_collisions);
+ sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
+#undef var
+ return 0;
+}
+
+/* kobjects */
+
+static void unregister_fake(struct kobject *k)
+{
+}
+
+static const struct sysfs_ops accounting_ops = {
+ .show = cache_stats_show,
+ .store = NULL
+};
+static struct kobj_type accounting_obj = {
+ .release = unregister_fake,
+ .sysfs_ops = &accounting_ops,
+ .default_attrs = accounting_files
+};
+
+static void scale_accounting(unsigned long data);
+
+void init_cache_accounting(struct cache_accounting *acc, struct closure *parent)
+{
+ kobject_init(&acc->total.kobj, &accounting_obj);
+ kobject_init(&acc->five_minute.kobj, &accounting_obj);
+ kobject_init(&acc->hour.kobj, &accounting_obj);
+ kobject_init(&acc->day.kobj, &accounting_obj);
+
+ closure_init(&acc->cl, parent);
+ init_timer(&acc->timer);
+ acc->timer.expires = jiffies + accounting_delay;
+ acc->timer.data = (unsigned long) acc;
+ acc->timer.function = scale_accounting;
+ add_timer(&acc->timer);
+}
+
+int add_cache_accounting_kobjs(struct cache_accounting *acc,
+ struct kobject *parent)
+{
+ int ret = kobject_add(&acc->total.kobj, parent,
+ "stats_total");
+ ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
+ "stats_five_minute");
+ ret = ret ?: kobject_add(&acc->hour.kobj, parent,
+ "stats_hour");
+ ret = ret ?: kobject_add(&acc->day.kobj, parent,
+ "stats_day");
+ return ret;
+}
+
+void clear_stats(struct cache_accounting *acc)
+{
+ memset(&acc->total.cache_hits,
+ 0,
+ sizeof(unsigned long) * 7);
+}
+
+void destroy_cache_accounting(struct cache_accounting *acc)
+{
+ kobject_put(&acc->total.kobj);
+ kobject_put(&acc->five_minute.kobj);
+ kobject_put(&acc->hour.kobj);
+ kobject_put(&acc->day.kobj);
+
+ atomic_set(&acc->closing, 1);
+ if (del_timer_sync(&acc->timer))
+ closure_return(&acc->cl);
+}
+
+/* EWMA scaling */
+
+static void scale_stat(unsigned long *stat)
+{
+ *stat = ewma_add(*stat, 0, accounting_weight, 0);
+}
+
+static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
+{
+ if (++stats->rescale == rescale_at) {
+ stats->rescale = 0;
+ scale_stat(&stats->cache_hits);
+ scale_stat(&stats->cache_misses);
+ scale_stat(&stats->cache_bypass_hits);
+ scale_stat(&stats->cache_bypass_misses);
+ scale_stat(&stats->cache_readaheads);
+ scale_stat(&stats->cache_miss_collisions);
+ scale_stat(&stats->sectors_bypassed);
+ }
+}
+
+static void scale_accounting(unsigned long data)
+{
+ struct cache_accounting *acc = (struct cache_accounting *) data;
+
+#define move_stat(name) do { \
+ unsigned long t = atomic_xchg(&acc->collector.name, 0); \
+ t <<= 16; \
+ acc->five_minute.name += t; \
+ acc->hour.name += t; \
+ acc->day.name += t; \
+ acc->total.name += t; \
+} while (0)
+
+ move_stat(cache_hits);
+ move_stat(cache_misses);
+ move_stat(cache_bypass_hits);
+ move_stat(cache_bypass_misses);
+ move_stat(cache_readaheads);
+ move_stat(cache_miss_collisions);
+ move_stat(sectors_bypassed);
+
+ scale_stats(&acc->total, 0);
+ scale_stats(&acc->day, DAY_RESCALE);
+ scale_stats(&acc->hour, HOUR_RESCALE);
+ scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
+
+ acc->timer.expires += accounting_delay;
+
+ if (!atomic_read(&acc->closing))
+ add_timer(&acc->timer);
+ else
+ closure_return(&acc->cl);
+}
+
+static void mark_cache_stats(struct cache_stat_collector *stats,
+ bool hit, bool bypass)
+{
+ if (!bypass) {
+ if (hit)
+ atomic_inc(&stats->cache_hits);
+ else
+ atomic_inc(&stats->cache_misses);
+ } else {
+ if (hit)
+ atomic_inc(&stats->cache_bypass_hits);
+ else
+ atomic_inc(&stats->cache_bypass_misses);
+ }
+}
+
+void mark_cache_accounting(struct search *s, bool hit, bool bypass)
+{
+ struct cached_dev *dc = container_of(s->op.d, struct cached_dev, disk);
+ mark_cache_stats(&dc->accounting.collector, hit, bypass);
+ mark_cache_stats(&s->op.d->c->accounting.collector, hit, bypass);
+#ifdef CONFIG_CGROUP_BCACHE
+ mark_cache_stats(&(bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
+#endif
+}
+
+void mark_cache_readahead(struct search *s)
+{
+ struct cached_dev *dc = container_of(s->op.d, struct cached_dev, disk);
+ atomic_inc(&dc->accounting.collector.cache_readaheads);
+ atomic_inc(&s->op.d->c->accounting.collector.cache_readaheads);
+}
+
+void mark_cache_miss_collision(struct btree_op *op)
+{
+ struct cached_dev *dc = container_of(op->d, struct cached_dev, disk);
+ atomic_inc(&dc->accounting.collector.cache_miss_collisions);
+ atomic_inc(&op->d->c->accounting.collector.cache_miss_collisions);
+}
+
+void mark_sectors_bypassed(struct search *s, int sectors)
+{
+ struct cached_dev *dc = container_of(s->op.d, struct cached_dev, disk);
+ atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
+ atomic_add(sectors, &s->op.d->c->accounting.collector.sectors_bypassed);
+}
diff --git a/drivers/block/bcache/stats.h b/drivers/block/bcache/stats.h
new file mode 100644
index 0000000..f867751
--- /dev/null
+++ b/drivers/block/bcache/stats.h
@@ -0,0 +1,58 @@
+#ifndef _BCACHE_STATS_H_
+#define _BCACHE_STATS_H_
+
+struct cache_stat_collector {
+ atomic_t cache_hits;
+ atomic_t cache_misses;
+ atomic_t cache_bypass_hits;
+ atomic_t cache_bypass_misses;
+ atomic_t cache_readaheads;
+ atomic_t cache_miss_collisions;
+ atomic_t sectors_bypassed;
+};
+
+struct cache_stats {
+ struct kobject kobj;
+
+ unsigned long cache_hits;
+ unsigned long cache_misses;
+ unsigned long cache_bypass_hits;
+ unsigned long cache_bypass_misses;
+ unsigned long cache_readaheads;
+ unsigned long cache_miss_collisions;
+ unsigned long sectors_bypassed;
+
+ unsigned rescale;
+};
+
+struct cache_accounting {
+ struct closure cl;
+ struct timer_list timer;
+ atomic_t closing;
+
+ struct cache_stat_collector collector;
+
+ struct cache_stats total;
+ struct cache_stats five_minute;
+ struct cache_stats hour;
+ struct cache_stats day;
+};
+
+void init_cache_accounting(struct cache_accounting *acc,
+ struct closure *parent);
+
+int add_cache_accounting_kobjs(struct cache_accounting *acc,
+ struct kobject *parent);
+
+void clear_stats(struct cache_accounting *acc);
+
+void destroy_cache_accounting(struct cache_accounting *acc);
+
+struct search;
+struct btree_op;
+void mark_cache_accounting(struct search *s, bool hit, bool bypass);
+
+void mark_cache_readahead(struct search *s);
+void mark_cache_miss_collision(struct btree_op *op);
+void mark_sectors_bypassed(struct search *s, int sectors);
+
+#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/block/bcache/super.c b/drivers/block/bcache/super.c
new file mode 100644
index 0000000..70d7bcd
--- /dev/null
+++ b/drivers/block/bcache/super.c
@@ -0,0 +1,2000 @@
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+#include "sysfs.h"
+
+#include <linux/buffer_head.h>
+#include <linux/debugfs.h>
+#include <linux/genhd.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/sort.h>
+#include <linux/sysfs.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+
+static const char bcache_magic[] = {
+ 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
+};
+
+static const char invalid_uuid[] = {
+ 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
+ 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
+};
+
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+const char * const bcache_cache_modes[] = {
+ "default",
+ "writethrough",
+ "writeback",
+ "writearound",
+ "none",
+ NULL
+};
+
+static const char * const cache_replacement_policies[] = {
+ "lru",
+ "fifo",
+ "random",
+ NULL
+};
+
+struct uuid_entry_v0 {
+ uint8_t uuid[16];
+ uint8_t label[32];
+ uint32_t first_reg;
+ uint32_t last_reg;
+ uint32_t invalidated;
+ uint32_t pad;
+};
+
+struct uuid_entry {
+ union {
+ struct {
+ uint8_t uuid[16];
+ uint8_t label[32];
+ uint32_t first_reg;
+ uint32_t last_reg;
+ uint32_t invalidated;
+
+ uint32_t flags;
+ /* Size of flash only volumes */
+ uint64_t sectors;
+ };
+
+ uint8_t pad[128];
+ };
+};
+
+BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
+
+/* We keep absolute totals of various statistics, and additionally a set of three
+ * rolling averages.
+ *
+ * Every so often, a timer goes off and rescales the rolling averages.
+ * accounting_rescale[] is how many times the timer has to go off before we
+ * rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
+ * and one day.
+ *
+ * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
+ * and accounting_weight is what we use to rescale:
+ *
+ * pow(31 / 32, 22) ~= 1/2
+ *
+ * So that we don't have to increment each set of numbers every time we (say)
+ * get a cache hit, we increment a single atomic_t and when the rescale
+ * function runs it resets the atomic counter to 0 and adds its old value to
+ * each of the exported numbers.
+ *
+ * To reduce rounding error, the numbers in struct cache_accounting are all
+ * stored left shifted by 16, and scaled back in the sysfs show() function.
+ */
+
+static const unsigned accounting_rescale[] = { 0, 1, 12, 288 };
+static const unsigned accounting_delay = (HZ * 300) / 22;
+static const unsigned accounting_weight = 32;
+
+static const char * const accounting_types[] = {
+ "total", "five_minute", "hour", "day" };
+
+static struct kobject *bcache_kobj;
+static struct mutex register_lock;
+static LIST_HEAD(uncached_devices);
+static LIST_HEAD(cache_sets);
+static int bcache_major, bcache_minor;
+static wait_queue_head_t unregister_wait;
+
+struct workqueue_struct *bcache_wq;
+
+static int uuid_write(struct cache_set *);
+static void bcache_device_stop(struct bcache_device *);
+
+static void __cached_dev_free(struct kobject *);
+static void cached_dev_run(struct cached_dev *);
+static int cached_dev_attach(struct cached_dev *, struct cache_set *);
+static void cached_dev_detach(struct cached_dev *);
+
+static void __flash_dev_free(struct kobject *);
+static int flash_dev_create(struct cache_set *c, uint64_t size);
+
+static void __cache_set_free(struct kobject *);
+static void cache_set_unregister(struct cache_set *);
+static void cache_set_stop(struct cache_set *);
+static void bcache_write_super(struct cache_set *);
+
+static void cache_free(struct kobject *);
+
+#include "sysfs.c"
+
+#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
+
+/* Superblock */
+
+static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
+ struct page **res)
+{
+ const char *err;
+ struct cache_sb *s;
+ struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
+
+ if (!bh)
+ return "IO error";
+
+ s = (struct cache_sb *) bh->b_data;
+
+ sb->offset = le64_to_cpu(s->offset);
+ sb->version = le64_to_cpu(s->version);
+
+ memcpy(sb->magic, s->magic, 16);
+ memcpy(sb->uuid, s->uuid, 16);
+ memcpy(sb->set_uuid, s->set_uuid, 16);
+ memcpy(sb->label, s->label, SB_LABEL_SIZE);
+
+ sb->flags = le64_to_cpu(s->flags);
+ sb->seq = le64_to_cpu(s->seq);
+
+ sb->nbuckets = le64_to_cpu(s->nbuckets);
+ sb->block_size = le16_to_cpu(s->block_size);
+ sb->bucket_size = le16_to_cpu(s->bucket_size);
+
+ sb->nr_in_set = le16_to_cpu(s->nr_in_set);
+ sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
+ sb->last_mount = le32_to_cpu(s->last_mount);
+
+ sb->first_bucket = le16_to_cpu(s->first_bucket);
+ sb->keys = le16_to_cpu(s->keys);
+
+ for (int i = 0; i < SB_JOURNAL_BUCKETS; i++)
+ sb->d[i] = le64_to_cpu(s->d[i]);
+
+ pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+ sb->version, sb->flags, sb->seq, sb->keys);
+
+ err = "Not a bcache superblock";
+ if (sb->offset != SB_SECTOR)
+ goto err;
+
+ if (memcmp(sb->magic, bcache_magic, 16))
+ goto err;
+
+ err = "Too many journal buckets";
+ if (sb->keys > SB_JOURNAL_BUCKETS)
+ goto err;
+
+ err = "Bad checksum";
+ if (s->csum != csum_set(s))
+ goto err;
+
+ err = "Bad UUID";
+ if (is_zero(sb->uuid, 16))
+ goto err;
+
+ err = "Unsupported superblock version";
+ if (sb->version > BCACHE_SB_VERSION)
+ goto err;
+
+ err = "Bad block/bucket size";
+ if (!is_power_of_2(sb->block_size) || sb->block_size > PAGE_SECTORS ||
+ !is_power_of_2(sb->bucket_size) || sb->bucket_size < PAGE_SECTORS)
+ goto err;
+
+ err = "Too many buckets";
+ if (sb->nbuckets > LONG_MAX)
+ goto err;
+
+ err = "Not enough buckets";
+ if (sb->nbuckets < 1 << 7)
+ goto err;
+
+ err = "Invalid superblock: device too small";
+ if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
+ goto err;
+
+ if (sb->version == CACHE_BACKING_DEV)
+ goto out;
+
+ err = "Bad UUID";
+ if (is_zero(sb->set_uuid, 16))
+ goto err;
+
+ err = "Bad cache device number in set";
+ if (!sb->nr_in_set ||
+ sb->nr_in_set <= sb->nr_this_dev ||
+ sb->nr_in_set > MAX_CACHES_PER_SET)
+ goto err;
+
+ err = "Journal buckets not sequential";
+ for (unsigned i = 0; i < sb->keys; i++)
+ if (sb->d[i] != sb->first_bucket + i)
+ goto err;
+
+ err = "Too many journal buckets";
+ if (sb->first_bucket + sb->keys > sb->nbuckets)
+ goto err;
+
+ err = "Invalid superblock: first bucket comes before end of super";
+ if (sb->first_bucket * sb->bucket_size < 16)
+ goto err;
+out:
+ sb->last_mount = get_seconds();
+ err = NULL;
+
+ get_page(bh->b_page);
+ *res = bh->b_page;
+err:
+ put_bh(bh);
+ return err;
+}
+
+static void write_bdev_super_endio(struct bio *bio, int error)
+{
+ struct cached_dev *d = bio->bi_private;
+ /* XXX: error checking */
+
+ closure_put(&d->sb_write.cl);
+}
+
+static void __write_super(struct cache_sb *sb, struct bio *bio)
+{
+ struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
+
+ bio->bi_sector = SB_SECTOR;
+ bio->bi_rw = REQ_SYNC|REQ_META;
+ bio->bi_size = SB_SIZE;
+ bio_map(bio, NULL);
+
+ out->offset = cpu_to_le64(sb->offset);
+ out->version = cpu_to_le64(sb->version);
+
+ memcpy(out->uuid, sb->uuid, 16);
+ memcpy(out->set_uuid, sb->set_uuid, 16);
+ memcpy(out->label, sb->label, SB_LABEL_SIZE);
+
+ out->flags = cpu_to_le64(sb->flags);
+ out->seq = cpu_to_le64(sb->seq);
+
+ out->last_mount = cpu_to_le32(sb->last_mount);
+ out->first_bucket = cpu_to_le16(sb->first_bucket);
+ out->keys = cpu_to_le16(sb->keys);
+
+ for (int i = 0; i < sb->keys; i++)
+ out->d[i] = cpu_to_le64(sb->d[i]);
+
+ out->csum = csum_set(out);
+
+ pr_debug("ver %llu, flags %llu, seq %llu",
+ sb->version, sb->flags, sb->seq);
+
+ submit_bio(REQ_WRITE, bio);
+}
+
+void write_bdev_super(struct cached_dev *d, struct closure *parent)
+{
+ struct closure *cl = &d->sb_write.cl;
+ struct bio *bio = &d->sb_bio;
+
+ closure_lock(&d->sb_write, parent);
+
+ bio_reset(bio);
+ bio->bi_bdev = d->bdev;
+ bio->bi_end_io = write_bdev_super_endio;
+ bio->bi_private = d;
+
+ closure_get(cl);
+ __write_super(&d->sb, bio);
+
+ closure_return(cl);
+}
+
+static void write_super_endio(struct bio *bio, int error)
+{
+ struct cache *c = bio->bi_private;
+
+ count_io_errors(c, error, "writing superblock");
+ closure_put(&c->set->sb_write.cl);
+}
+
+static void bcache_write_super(struct cache_set *c)
+{
+ struct closure *cl = &c->sb_write.cl;
+ struct cache *ca;
+
+ closure_lock(&c->sb_write, &c->cl);
+
+ c->sb.seq++;
+
+ for_each_cache(ca, c) {
+ struct bio *bio = &ca->sb_bio;
+
+ ca->sb.version = BCACHE_SB_VERSION;
+ ca->sb.seq = c->sb.seq;
+ ca->sb.last_mount = c->sb.last_mount;
+
+ SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
+
+ bio_reset(bio);
+ bio->bi_bdev = ca->bdev;
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+
+ closure_get(cl);
+ __write_super(&ca->sb, bio);
+ }
+
+ closure_return(cl);
+}
+
+/* UUID io */
+
+static void uuid_endio(struct bio *bio, int error)
+{
+ /* XXX: check for io errors */
+ bcache_endio(container_of(bio->bi_private, struct cache_set,
+ uuid_write),
+ bio, error, "accessing uuids");
+}
+
+static void uuid_io(struct cache_set *c, unsigned long rw,
+ struct bkey *k, struct closure *parent)
+{
+ struct closure *cl = &c->uuid_write.cl;
+
+ BUG_ON(!parent);
+ closure_lock(&c->uuid_write, parent);
+
+ for (unsigned i = 0; i < KEY_PTRS(k); i++) {
+ struct bio *bio = PTR_CACHE(c, k, i)->uuid_bio;
+
+ bio_reset(bio);
+ bio->bi_rw = REQ_SYNC|REQ_META|rw;
+ bio->bi_size = KEY_SIZE(k) << 9;
+
+ bio->bi_end_io = uuid_endio;
+ bio->bi_private = cl;
+ bio_map(bio, c->uuids);
+
+ submit_bbio_split(bio, c, k, i);
+
+ if (!(rw & WRITE))
+ break;
+ }
+
+ pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read",
+ pkey(&c->uuid_bucket));
+
+ for (struct uuid_entry *u = c->uuids; u < c->uuids + c->nr_uuids; u++)
+ if (!is_zero(u->uuid, 16))
+ pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
+ u - c->uuids, u->uuid, u->label,
+ u->first_reg, u->last_reg, u->invalidated);
+
+ closure_return(cl);
+}
+
+static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
+{
+ struct bkey *k = &j->uuid_bucket;
+
+ if (__ptr_invalid(c, 1, k))
+ return "bad uuid pointer";
+
+ bkey_copy(&c->uuid_bucket, k);
+ uuid_io(c, READ_SYNC, k, cl);
+
+ if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
+ struct uuid_entry_v0 *u0 = (void *) c->uuids;
+ struct uuid_entry *u1 = (void *) c->uuids;
+
+ closure_sync(cl);
+
+ /*
+ * Since the new uuid entry is bigger than the old, we have to
+ * convert starting at the highest memory address and work down
+ * in order to do it in place
+ */
+
+ for (int i = c->nr_uuids - 1;
+ i >= 0;
+ --i) {
+ memcpy(u1[i].uuid, u0[i].uuid, 16);
+ memcpy(u1[i].label, u0[i].label, 32);
+
+ u1[i].first_reg = u0[i].first_reg;
+ u1[i].last_reg = u0[i].last_reg;
+ u1[i].invalidated = u0[i].invalidated;
+
+ u1[i].flags = 0;
+ u1[i].sectors = 0;
+ }
+ }
+
+ return NULL;
+}
+
+static int __uuid_write(struct cache_set *c)
+{
+ BKEY_PADDED(key) k;
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ lockdep_assert_held(&register_lock);
+
+ if (pop_bucket_set(c, btree_prio, &k.key, 1, &cl))
+ return 1;
+
+ SET_KEY_SIZE(&k.key, c->sb.bucket_size);
+ uuid_io(c, REQ_WRITE, &k.key, &cl);
+ closure_sync(&cl);
+
+ bkey_copy(&c->uuid_bucket, &k.key);
+ __bkey_put(c, &k.key);
+ return 0;
+}
+
+static int uuid_write(struct cache_set *c)
+{
+ int ret = __uuid_write(c);
+
+ if (!ret)
+ bcache_journal_meta(c, NULL);
+
+ return ret;
+}
+
+static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
+{
+ for (struct uuid_entry *u = c->uuids;
+ u < c->uuids + c->nr_uuids; u++)
+ if (!memcmp(u->uuid, uuid, 16))
+ return u;
+
+ return NULL;
+}
+
+static struct uuid_entry *uuid_find_empty(struct cache_set *c)
+{
+ static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+ return uuid_find(c, zero_uuid);
+}
+
+/*
+ * Bucket priorities/gens:
+ *
+ * For each bucket, we store on disk its
+ * 8 bit gen
+ * 16 bit priority
+ *
+ * See alloc.c for an explanation of the gen. The priority is used to implement
+ * lru (and in the future other) cache replacement policies; for most purposes
+ * it's just an opaque integer.
+ *
+ * The gens and the priorities don't have a whole lot to do with each other, and
+ * it's actually the gens that must be written out at specific times - it's no
+ * big deal if the priorities don't get written, if we lose them we just reuse
+ * buckets in suboptimal order.
+ *
+ * On disk they're stored in a packed array, and in as many buckets as are required
+ * to fit them all. The buckets we use to store them form a list; the journal
+ * header points to the first bucket, the first bucket points to the second
+ * bucket, et cetera.
+ *
+ * This code is primarily used by the allocation code; periodically (whenever
+ * it runs out of buckets to allocate from) the allocation code will invalidate
+ * some buckets, but it can't use those buckets until their new gens are safely
+ * on disk.
+ *
+ * So it calls prio_write(), which does a bunch of work and eventually stores
+ * the pointer to the new first prio bucket in the current open journal entry
+ * header; when that journal entry is written, we can mark the buckets that have
+ * been invalidated as being ready for use by toggling c->prio_written.
+ */
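+
+/*
+ * Roughly, the sequence below is: prio_write() -> prio_write_bucket() once
+ * per prio bucket -> prio_write_journal() -> prio_write_done(), which sets
+ * c->prio_written and wakes up anything waiting on c->set->bucket_wait.
+ */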
+
+static void prio_endio(struct bio *bio, int error)
+{
+ struct cache *c = bio->bi_private;
+ BUG_ON(c->prio_bio->bi_flags & (1 << BIO_HAS_POOL));
+ count_io_errors(c, error, "writing priorities");
+
+ bio_put(bio);
+ closure_put(&c->prio);
+}
+
+static void prio_io(struct cache *c, uint64_t bucket, unsigned long rw)
+{
+ struct bio *bio = c->prio_bio;
+
+ bio_reset(bio);
+ bio->bi_sector = bucket * c->sb.bucket_size;
+ bio->bi_bdev = c->bdev;
+ bio->bi_rw = REQ_SYNC|REQ_META|rw;
+ bio->bi_size = bucket_bytes(c);
+
+ bio->bi_end_io = prio_endio;
+ bio->bi_private = c;
+ bio_map(bio, c->disk_buckets);
+
+ closure_bio_submit(bio, &c->prio, c->set ? c->set->bio_split : NULL);
+}
+
+#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \
+ fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
+
+static void prio_write_done(struct closure *cl)
+{
+ struct cache *c = container_of(cl, struct cache, prio);
+
+ pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&c->free),
+ fifo_used(&c->free_inc), fifo_used(&c->unused));
+ blktrace_msg(c, "Finished priorities: " buckets_free(c));
+
+ mutex_lock(&c->set->bucket_lock);
+
+ /*
+ * XXX: Terrible hack
+ *
+ * We really should be using this closure as the lock for writing
+ * priorities, but we don't - we use c->prio_written. So we have to
+ * finish with the closure before we unlock bucket_lock:
+ */
+ set_closure_fn(&c->prio, NULL, NULL);
+ closure_set_stopped(&c->prio);
+ closure_put(&c->prio);
+
+ atomic_set(&c->prio_written, 1);
+ mutex_unlock(&c->set->bucket_lock);
+
+ closure_wake_up(&c->set->bucket_wait);
+}
+
+static void prio_write_journal(struct closure *cl)
+{
+ struct cache *c = container_of(cl, struct cache, prio);
+
+ pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&c->free),
+ fifo_used(&c->free_inc), fifo_used(&c->unused));
+ blktrace_msg(c, "Journalling priorities: " buckets_free(c));
+
+ mutex_lock(&c->set->bucket_lock);
+
+ for (unsigned i = 0; i < prio_buckets(c); i++)
+ c->prio_buckets[i] = c->prio_next[i];
+
+ c->prio_alloc = 0;
+ c->need_save_prio = 0;
+
+ /*
+ * We have to call bcache_journal_meta() with bucket_lock still held,
+ * because after we set prio_buckets = prio_next things are inconsistent
+ * until the next journal entry is updated
+ */
+ bcache_journal_meta(c->set, cl);
+
+ mutex_unlock(&c->set->bucket_lock);
+
+ continue_at(cl, prio_write_done, system_wq);
+}
+
+static void prio_write_bucket(struct closure *cl)
+{
+ struct cache *c = container_of(cl, struct cache, prio);
+ struct prio_set *p = c->disk_buckets;
+ struct bucket_disk *d = p->data, *end = d + prios_per_bucket(c);
+
+ unsigned i = c->prio_write++;
+
+ for (struct bucket *b = c->buckets + i * prios_per_bucket(c);
+ b < c->buckets + c->sb.nbuckets && d < end;
+ b++, d++) {
+ d->prio = cpu_to_le16(b->prio);
+ d->gen = b->disk_gen;
+ }
+
+ if (c->prio_write != prio_buckets(c))
+ p->next_bucket = c->prio_next[c->prio_write];
+
+ p->magic = pset_magic(c);
+ p->csum = crc64(&p->magic, bucket_bytes(c) - 8);
+
+ prio_io(c, c->prio_next[i], REQ_WRITE);
+
+ continue_at(cl, c->prio_write == prio_buckets(c)
+ ? prio_write_journal
+ : prio_write_bucket, system_wq);
+}
+
+void prio_write(struct cache *c)
+{
+ lockdep_assert_held(&c->set->bucket_lock);
+ BUG_ON(atomic_read(&c->prio_written));
+ BUG_ON(c->prio_alloc != prio_buckets(c));
+
+ closure_init(&c->prio, &c->set->cl);
+
+ for (struct bucket *b = c->buckets;
+ b < c->buckets + c->sb.nbuckets; b++)
+ b->disk_gen = b->gen;
+
+ c->prio_write = 0;
+ c->disk_buckets->seq++;
+
+ atomic_long_add(c->sb.bucket_size * prio_buckets(c),
+ &c->meta_sectors_written);
+
+ atomic_set(&c->prio_written, -1);
+
+ pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&c->free),
+ fifo_used(&c->free_inc), fifo_used(&c->unused));
+ blktrace_msg(c, "Starting priorities: " buckets_free(c));
+
+ continue_at(&c->prio, prio_write_bucket, system_wq);
+}
+
+static int prio_read(struct cache *c, uint64_t bucket)
+{
+ struct prio_set *p = c->disk_buckets;
+ struct bucket_disk *d = p->data + prios_per_bucket(c), *end = d;
+
+ closure_init(&c->prio, NULL);
+
+ for (struct bucket *b = c->buckets;
+ b < c->buckets + c->sb.nbuckets;
+ b++, d++) {
+ if (d == end) {
+ c->prio_buckets[c->prio_write++] = bucket;
+
+ prio_io(c, bucket, READ_SYNC);
+ closure_sync(&c->prio);
+
+ /* XXX: doesn't get error handling right with splits */
+ if (!test_bit(BIO_UPTODATE, &c->prio_bio->bi_flags))
+ continue_at(&c->prio, NULL, NULL, -1);
+
+ if (p->csum != crc64(&p->magic, bucket_bytes(c) - 8))
+ printk(KERN_WARNING "bcache: "
+ "bad csum reading priorities\n");
+
+ if (p->magic != pset_magic(c))
+ printk(KERN_WARNING "bcache: "
+ "bad magic reading priorities\n");
+
+ bucket = p->next_bucket;
+ d = p->data;
+ }
+
+ b->prio = le16_to_cpu(d->prio);
+ b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
+ }
+
+ continue_at(&c->prio, NULL, NULL, 0);
+}
+
+/* Bcache device */
+
+static int open_dev(struct block_device *b, fmode_t mode)
+{
+ struct bcache_device *d = b->bd_disk->private_data;
+ if (atomic_read(&d->closing))
+ return -ENXIO;
+
+ closure_get(&d->cl);
+ return 0;
+}
+
+static int release_dev(struct gendisk *b, fmode_t mode)
+{
+ struct bcache_device *d = b->private_data;
+ closure_put(&d->cl);
+ return 0;
+}
+
+static int ioctl_dev(struct block_device *b, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct bcache_device *d = b->bd_disk->private_data;
+ return d->ioctl(d, mode, cmd, arg);
+}
+
+static const struct block_device_operations bcache_ops = {
+ .open = open_dev,
+ .release = release_dev,
+ .ioctl = ioctl_dev,
+ .owner = THIS_MODULE,
+};
+
+static void bcache_device_stop(struct bcache_device *d)
+{
+ if (!atomic_xchg(&d->closing, 1))
+ closure_queue(&d->cl);
+}
+
+static void bcache_device_detach(struct bcache_device *d)
+{
+ lockdep_assert_held(&register_lock);
+
+ if (atomic_read(&d->detaching)) {
+ struct uuid_entry *u = d->c->uuids + d->id;
+
+ SET_UUID_FLASH_ONLY(u, 0);
+ memcpy(u->uuid, invalid_uuid, 16);
+ u->invalidated = cpu_to_le32(get_seconds());
+ uuid_write(d->c);
+
+ atomic_set(&d->detaching, 0);
+ }
+
+ sysfs_remove_link(&d->c->kobj, d->name);
+ sysfs_remove_link(&d->kobj, "cache");
+
+ d->c->devices[d->id] = NULL;
+ closure_put(&d->c->caching);
+ d->c = NULL;
+}
+
+static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
+ unsigned id)
+{
+ BUG_ON(atomic_read(&c->closing));
+
+ d->id = id;
+ d->c = c;
+ c->devices[id] = d;
+
+ closure_get(&c->caching);
+}
+
+static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
+ const char *name)
+{
+ snprintf(d->name, BCACHEDEVNAME_SIZE,
+ "%s%u", name, d->id);
+
+ WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
+ sysfs_create_link(&c->kobj, &d->kobj, d->name),
+ "Couldn't create device <-> cache set symlinks");
+}
+
+static void bcache_device_free(struct bcache_device *d)
+{
+	lockdep_assert_held(&register_lock);
+
+ printk(KERN_INFO "bcache: %s stopped\n", d->disk->disk_name);
+
+ if (d->c)
+ bcache_device_detach(d);
+
+ if (d->disk)
+ del_gendisk(d->disk);
+ if (d->disk && d->disk->queue)
+ blk_cleanup_queue(d->disk->queue);
+ if (d->disk)
+ put_disk(d->disk);
+
+ if (d->unaligned_bvec)
+ mempool_destroy(d->unaligned_bvec);
+ if (d->bio_split)
+ bioset_free(d->bio_split);
+
+ closure_debug_destroy(&d->cl);
+}
+
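+/* Generic setup shared by cached devices and flash only volumes */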
+static int bcache_device_init(struct bcache_device *d, unsigned block_size)
+{
+ struct request_queue *q;
+
+ if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+ !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
+ sizeof(struct bio_vec) * BIO_MAX_PAGES)))
+ return -ENOMEM;
+
+ d->disk = alloc_disk(1);
+ if (!d->disk)
+ return -ENOMEM;
+
+ snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
+
+ d->disk->major = bcache_major;
+ d->disk->first_minor = bcache_minor++;
+ d->disk->fops = &bcache_ops;
+ d->disk->private_data = d;
+
+ q = blk_alloc_queue(GFP_KERNEL);
+ if (!q)
+ return -ENOMEM;
+
+ blk_queue_make_request(q, NULL);
+ d->disk->queue = q;
+ q->queuedata = d;
+ q->backing_dev_info.congested_data = d;
+ q->limits.max_hw_sectors = UINT_MAX;
+ q->limits.max_sectors = UINT_MAX;
+ q->limits.max_segment_size = UINT_MAX;
+ q->limits.max_segments = BIO_MAX_PAGES;
+ q->limits.max_discard_sectors = UINT_MAX;
+ q->limits.io_min = block_size;
+ q->limits.logical_block_size = block_size;
+ q->limits.physical_block_size = block_size;
+ set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
+ set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
+
+ return 0;
+}
+
+/* Cached device */
+
+static void calc_cached_dev_sectors(struct cache_set *c)
+{
+ uint64_t sectors = 0;
+ struct cached_dev *dc;
+
+ list_for_each_entry(dc, &c->cached_devs, list)
+ sectors += bdev_sectors(dc->bdev);
+
+ c->cached_dev_sectors = sectors;
+}
+
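+/*
+ * Bring up the backing device: if it isn't attached to a cache set and its
+ * superblock state isn't BDEV_STATE_NONE, mark it stale first; then register
+ * the gendisk and sysfs links.
+ */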
+static void cached_dev_run(struct cached_dev *dc)
+{
+ struct bcache_device *d = &dc->disk;
+
+ if (atomic_xchg(&dc->running, 1))
+ return;
+
+ if (!d->c &&
+ BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
+ write_bdev_super(dc, &cl);
+ closure_sync(&cl);
+ }
+
+ add_disk(d->disk);
+#if 0
+ char *env[] = { "SYMLINK=label" , NULL };
+ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
+#endif
+ if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
+ sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
+ pr_debug("error creating sysfs link");
+}
+
+static void cached_dev_detach_finish(struct work_struct *w)
+{
+ struct cached_dev *d = container_of(w, struct cached_dev, detach);
+ char buf[BDEVNAME_SIZE];
+ struct closure cl;
+ closure_init_stack(&cl);
+
+	mutex_lock(&register_lock);
+
+ BUG_ON(!atomic_read(&d->disk.detaching));
+ BUG_ON(atomic_read(&d->count));
+
+ memset(&d->sb.set_uuid, 0, 16);
+ SET_BDEV_STATE(&d->sb, BDEV_STATE_NONE);
+
+ write_bdev_super(d, &cl);
+ closure_sync(&cl);
+
+ bcache_device_detach(&d->disk);
+ list_move(&d->list, &uncached_devices);
+
+	mutex_unlock(&register_lock);
+
+ printk(KERN_DEBUG "bcache: Caching disabled for %s\n",
+ bdevname(d->bdev, buf));
+}
+
+static void cached_dev_detach(struct cached_dev *d)
+{
+	lockdep_assert_held(&register_lock);
+
+ if (atomic_xchg(&d->disk.detaching, 1))
+ return;
+
+ bcache_writeback_queue(d);
+ cached_dev_put(d);
+}
+
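+/*
+ * Attach a backing device to a cache set: find (or allocate) its uuid entry,
+ * update both superblocks, and start writeback if the device has dirty data.
+ */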
+static int cached_dev_attach(struct cached_dev *d, struct cache_set *c)
+{
+ uint32_t rtime = cpu_to_le32(get_seconds());
+ struct uuid_entry *u;
+ char buf[BDEVNAME_SIZE];
+
+ bdevname(d->bdev, buf);
+
+ if (d->disk.c ||
+ atomic_read(&c->closing) ||
+ memcmp(d->sb.set_uuid, c->sb.set_uuid, 16))
+ return -ENOENT;
+
+ if (d->sb.block_size < c->sb.block_size) {
+ /* Will die */
+ err_printk("Couldn't attach %s: block size "
+ "less than set's block size\n", buf);
+ return -EINVAL;
+ }
+
+ u = uuid_find(c, d->sb.uuid);
+
+ if (u &&
+ (BDEV_STATE(&d->sb) == BDEV_STATE_STALE ||
+ BDEV_STATE(&d->sb) == BDEV_STATE_NONE)) {
+ memcpy(u->uuid, invalid_uuid, 16);
+ u->invalidated = cpu_to_le32(get_seconds());
+ u = NULL;
+ }
+
+ if (!u) {
+ if (BDEV_STATE(&d->sb) == BDEV_STATE_DIRTY) {
+ err_printk("Couldn't find uuid for %s in set\n", buf);
+ return -ENOENT;
+ }
+
+ u = uuid_find_empty(c);
+ if (!u) {
+ err_printk("Not caching %s, no room for UUID\n", buf);
+ return -EINVAL;
+ }
+ }
+
+ /* Deadlocks since we're called via sysfs...
+ sysfs_remove_file(&d->kobj, &sysfs_attach);
+ */
+
+ if (is_zero(u->uuid, 16)) {
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ memcpy(u->uuid, d->sb.uuid, 16);
+ memcpy(u->label, d->sb.label, SB_LABEL_SIZE);
+ u->first_reg = u->last_reg = rtime;
+ uuid_write(c);
+
+ memcpy(d->sb.set_uuid, c->sb.set_uuid, 16);
+ SET_BDEV_STATE(&d->sb, BDEV_STATE_CLEAN);
+
+ write_bdev_super(d, &cl);
+ closure_sync(&cl);
+ } else {
+ u->last_reg = rtime;
+ uuid_write(c);
+ }
+
+ bcache_device_attach(&d->disk, c, u - c->uuids);
+ bcache_device_link(&d->disk, c, "bdev");
+ list_move(&d->list, &c->cached_devs);
+ calc_cached_dev_sectors(c);
+
+ smp_wmb();
+ /* d->c must be set before d->count != 0 */
+ atomic_set(&d->count, 1);
+
+ if (BDEV_STATE(&d->sb) == BDEV_STATE_DIRTY) {
+ atomic_set(&d->has_dirty, 1);
+ atomic_inc(&d->count);
+ bcache_writeback_queue(d);
+ }
+
+ cached_dev_run(d);
+
+ printk(KERN_INFO "bcache: Caching %s as %s on set %pU\n",
+ bdevname(d->bdev, buf), d->disk.disk->disk_name,
+ d->disk.c->sb.set_uuid);
+ return 0;
+}
+
+static void __cached_dev_free(struct kobject *kobj)
+{
+ struct cached_dev *d = container_of(kobj, struct cached_dev, disk.kobj);
+ kfree(d);
+ module_put(THIS_MODULE);
+}
+
+static void cached_dev_free(struct closure *cl)
+{
+ struct cached_dev *d = container_of(cl, struct cached_dev, disk.cl);
+
+ /* XXX: background writeback could be in progress... */
+ cancel_delayed_work_sync(&d->refill_dirty);
+ cancel_delayed_work_sync(&d->read_dirty);
+ cancel_delayed_work_sync(&d->writeback_rate_update);
+
+	mutex_lock(&register_lock);
+
+ bcache_device_free(&d->disk);
+ list_del(&d->list);
+
+	mutex_unlock(&register_lock);
+
+ if (d->bio_passthrough)
+ mempool_destroy(d->bio_passthrough);
+
+ if (!IS_ERR_OR_NULL(d->bdev)) {
+ blk_sync_queue(bdev_get_queue(d->bdev));
+ blkdev_put(d->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ }
+
+ wake_up(&unregister_wait);
+
+ kobject_put(&d->disk.kobj);
+}
+
+static void cached_dev_flush(struct closure *cl)
+{
+ struct cached_dev *d = container_of(cl, struct cached_dev, disk.cl);
+
+ destroy_cache_accounting(&d->accounting);
+ kobject_del(&d->disk.kobj);
+
+ continue_at(cl, cached_dev_free, system_wq);
+}
+
+static struct cached_dev *cached_dev_alloc(unsigned block_size)
+{
+ struct cached_dev *d = kzalloc(sizeof(struct cached_dev), GFP_KERNEL);
+ if (!d)
+ return NULL;
+
+ closure_init(&d->disk.cl, NULL);
+ set_closure_fn(&d->disk.cl, cached_dev_flush, system_wq);
+
+ __module_get(THIS_MODULE);
+ INIT_LIST_HEAD(&d->list);
+ cached_dev_kobject_init(d);
+ init_cache_accounting(&d->accounting, &d->disk.cl);
+
+ if (bcache_device_init(&d->disk, block_size))
+ goto err;
+
+ spin_lock_init(&d->dirty_lock);
+ spin_lock_init(&d->io_lock);
+ closure_init_unlocked(&d->sb_write);
+ INIT_WORK(&d->detach, cached_dev_detach_finish);
+
+ d->sequential_merge = true;
+ d->sequential_cutoff = 4 << 20;
+
+ INIT_LIST_HEAD(&d->io_lru);
+ d->sb_bio.bi_max_vecs = 1;
+ d->sb_bio.bi_io_vec = d->sb_bio.bi_inline_vecs;
+
+ for (struct io *j = d->io; j < d->io + RECENT_IO; j++) {
+ list_add(&j->lru, &d->io_lru);
+ hlist_add_head(&j->hash, d->io_hash + RECENT_IO);
+ }
+
+ bcache_writeback_init_cached_dev(d);
+
+ d->bio_passthrough = mempool_create_slab_pool(32, passthrough_cache);
+ if (!d->bio_passthrough)
+ goto err;
+
+ return d;
+err:
+ bcache_device_stop(&d->disk);
+ return NULL;
+}
+
+/* Cached device - bcache superblock */
+
+static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
+ struct block_device *bdev)
+{
+ char name[BDEVNAME_SIZE];
+ const char *err = "cannot allocate memory";
+ struct gendisk *g;
+ struct cache_set *c;
+
+ struct cached_dev *d = cached_dev_alloc(sb->block_size << 9);
+
+ if (!d)
+ return err;
+
+ memcpy(&d->sb, sb, sizeof(struct cache_sb));
+ d->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ d->bdev = bdev;
+ d->bdev->bd_holder = d;
+
+ g = d->disk.disk;
+
+ set_capacity(g, d->bdev->bd_part->nr_sects - 16);
+
+ cached_dev_request_init(d);
+
+ err = "error creating kobject";
+ if (kobject_add(&d->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
+ "bcache"))
+ goto err;
+ if (add_cache_accounting_kobjs(&d->accounting, &d->disk.kobj))
+ goto err;
+
+ list_add(&d->list, &uncached_devices);
+ list_for_each_entry(c, &cache_sets, list)
+ cached_dev_attach(d, c);
+
+ if (BDEV_STATE(&d->sb) == BDEV_STATE_NONE ||
+ BDEV_STATE(&d->sb) == BDEV_STATE_STALE)
+ cached_dev_run(d);
+
+ return NULL;
+err:
+ kobject_put(&d->disk.kobj);
+ printk(KERN_DEBUG "bcache: error opening %s: %s\n",
+ bdevname(bdev, name), err);
+ /*
+ * Return NULL instead of an error because kobject_put() cleans
+ * everything up
+ */
+ return NULL;
+}
+
+/* Flash only volumes */
+
+static void __flash_dev_free(struct kobject *kobj)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+ kfree(d);
+}
+
+static void flash_dev_free(struct closure *cl)
+{
+ struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+ bcache_device_free(d);
+ kobject_put(&d->kobj);
+}
+
+static void flash_dev_flush(struct closure *cl)
+{
+ struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+ kobject_del(&d->kobj);
+ continue_at(cl, flash_dev_free, system_wq);
+}
+
+static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
+{
+ struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
+ GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ closure_init(&d->cl, NULL);
+ set_closure_fn(&d->cl, flash_dev_flush, system_wq);
+
+ flash_dev_kobject_init(d);
+
+ if (bcache_device_init(d, block_bytes(c)))
+ goto err;
+
+ bcache_device_attach(d, c, u - c->uuids);
+ set_capacity(d->disk, u->sectors);
+ flash_dev_request_init(d);
+ add_disk(d->disk);
+
+ if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
+ goto err;
+
+ bcache_device_link(d, c, "volume");
+
+ return 0;
+err:
+ kobject_put(&d->kobj);
+ return -ENOMEM;
+}
+
+static int flash_devs_run(struct cache_set *c)
+{
+ int ret = 0;
+
+ for (struct uuid_entry *u = c->uuids;
+ u < c->uuids + c->nr_uuids && !ret;
+ u++)
+ if (UUID_FLASH_ONLY(u))
+ ret = flash_dev_run(c, u);
+
+ return ret;
+}
+
+static int flash_dev_create(struct cache_set *c, uint64_t size)
+{
+ struct uuid_entry *u;
+
+ if (atomic_read(&c->closing))
+ return -EINTR;
+
+ u = uuid_find_empty(c);
+ if (!u) {
+ err_printk("Can't create volume, no room for UUID\n");
+ return -EINVAL;
+ }
+
+ get_random_bytes(u->uuid, 16);
+ memset(u->label, 0, 32);
+ u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+
+ SET_UUID_FLASH_ONLY(u, 1);
+ u->sectors = size >> 9;
+
+ uuid_write(c);
+
+ return flash_dev_run(c, u);
+}
+
+/* Cache set */
+
+bool cache_set_error(struct cache_set *c, const char *m, ...)
+{
+ va_list args;
+
+ if (atomic_read(&c->closing))
+ return false;
+
+ /* XXX: we can be called from atomic context
+ acquire_console_sem();
+ */
+
+ printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
+
+ va_start(args, m);
+ vprintk(m, args);
+ va_end(args);
+
+ printk(", disabling caching\n");
+
+ cache_set_unregister(c);
+ return true;
+}
+
+static void __cache_set_free(struct kobject *kobj)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+ kfree(c);
+ module_put(THIS_MODULE);
+}
+
+static void cache_set_free(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, cl);
+ struct cache *ca;
+
+ bcache_open_buckets_free(c);
+ bcache_btree_cache_free(c);
+ bcache_journal_free(c);
+
+ for_each_cache(ca, c)
+ if (ca)
+ kobject_put(&ca->kobj);
+
+ free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
+ free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
+
+ kfree(c->fill_iter);
+ if (c->bio_split)
+ bioset_free(c->bio_split);
+ if (c->bio_meta)
+ mempool_destroy(c->bio_meta);
+ if (c->search)
+ mempool_destroy(c->search);
+ kfree(c->devices);
+
+	mutex_lock(&register_lock);
+ list_del(&c->list);
+	mutex_unlock(&register_lock);
+
+ printk(KERN_INFO "bcache: Cache set %pU unregistered\n",
+ c->sb.set_uuid);
+ wake_up(&unregister_wait);
+
+ closure_debug_destroy(&c->cl);
+ kobject_put(&c->kobj);
+}
+
+static void cache_set_flush(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, caching);
+ struct btree *b;
+
+ destroy_cache_accounting(&c->accounting);
+
+ kobject_put(&c->internal);
+ kobject_del(&c->kobj);
+
+ if (!IS_ERR_OR_NULL(c->root))
+ list_add(&c->root->list, &c->btree_cache);
+
+ /* Should skip this if we're unregistering because of an error */
+ list_for_each_entry(b, &c->btree_cache, list)
+ if (btree_node_dirty(b))
+ btree_write(b, true, NULL);
+
+ closure_return(cl);
+}
+
+static void __cache_set_unregister(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, caching);
+ struct cached_dev *d, *t;
+
+	mutex_lock(&register_lock);
+
+ if (atomic_read(&c->unregistering))
+ list_for_each_entry_safe(d, t, &c->cached_devs, list)
+ cached_dev_detach(d);
+
+ for (size_t i = 0; i < c->nr_uuids; i++)
+ if (c->devices[i])
+ bcache_device_stop(c->devices[i]);
+
+	mutex_unlock(&register_lock);
+
+ continue_at(cl, cache_set_flush, system_wq);
+}
+
+static void cache_set_stop(struct cache_set *c)
+{
+ if (!atomic_xchg(&c->closing, 1))
+ closure_queue(&c->caching);
+}
+
+static void cache_set_unregister(struct cache_set *c)
+{
+ atomic_set(&c->unregistering, 1);
+ cache_set_stop(c);
+}
+
+#define alloc_bucket_pages(gfp, c) \
+ ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
+
+struct cache_set *cache_set_alloc(struct cache_sb *sb)
+{
+ int iter_size;
+ struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
+ if (!c)
+ return NULL;
+
+ __module_get(THIS_MODULE);
+ closure_init(&c->cl, NULL);
+ set_closure_fn(&c->cl, cache_set_free, system_wq);
+
+ closure_init(&c->caching, &c->cl);
+ set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
+
+ /* Maybe create continue_at_noreturn() and use it here? */
+ closure_set_stopped(&c->cl);
+ closure_put(&c->cl);
+
+ cache_set_kobject_init(c);
+ init_cache_accounting(&c->accounting, &c->cl);
+
+ memcpy(c->sb.set_uuid, sb->set_uuid, 16);
+ c->sb.block_size = sb->block_size;
+ c->sb.bucket_size = sb->bucket_size;
+ c->sb.nr_in_set = sb->nr_in_set;
+ c->sb.last_mount = sb->last_mount;
+ c->bucket_bits = ilog2(sb->bucket_size);
+ c->block_bits = ilog2(sb->block_size);
+ c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
+
+ c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
+ if (c->btree_pages > BTREE_MAX_PAGES)
+ c->btree_pages = max_t(int, c->btree_pages / 4,
+ BTREE_MAX_PAGES);
+
+ mutex_init(&c->bucket_lock);
+ mutex_init(&c->fill_lock);
+ mutex_init(&c->sort_lock);
+ spin_lock_init(&c->sort_time_lock);
+ closure_init_unlocked(&c->sb_write);
+ closure_init_unlocked(&c->uuid_write);
+ spin_lock_init(&c->btree_read_time_lock);
+
+ INIT_LIST_HEAD(&c->list);
+ INIT_LIST_HEAD(&c->cached_devs);
+ INIT_LIST_HEAD(&c->btree_cache);
+ INIT_LIST_HEAD(&c->btree_cache_freeable);
+ INIT_LIST_HEAD(&c->btree_cache_freed);
+ INIT_LIST_HEAD(&c->data_buckets);
+
+ c->search = mempool_create_slab_pool(32, search_cache);
+ if (!c->search)
+ goto err;
+
+ iter_size = (sb->bucket_size / sb->block_size + 1) *
+ sizeof(struct btree_iter_set);
+
+ if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
+ !(c->bio_meta = mempool_create_kmalloc_pool(2,
+ sizeof(struct bbio) + sizeof(struct bio_vec) *
+ bucket_pages(c))) ||
+ !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+ !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
+ !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
+ !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
+ bcache_journal_alloc(c) ||
+ bcache_btree_cache_alloc(c) ||
+ bcache_open_buckets_alloc(c))
+ goto err;
+
+ c->fill_iter->size = sb->bucket_size / sb->block_size;
+
+ c->congested_read_threshold_us = 2000;
+ c->congested_write_threshold_us = 20000;
+ c->error_limit = 8 << IO_ERROR_SHIFT;
+
+ return c;
+err:
+ cache_set_unregister(c);
+ return NULL;
+}
+
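+/*
+ * Called once every cache in the set has been registered: if the set has
+ * synced metadata (CACHE_SYNC), read the priorities and replay the journal;
+ * otherwise invalidate any existing data and write out fresh metadata.
+ */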
+static void run_cache_set(struct cache_set *c)
+{
+ const char *err = "cannot allocate memory";
+ struct cached_dev *d, *t;
+ struct cache *ca;
+
+ struct btree_op op;
+ btree_op_init_stack(&op);
+ op.lock = SHRT_MAX;
+
+ for_each_cache(ca, c)
+ c->nbuckets += ca->sb.nbuckets;
+
+ if (CACHE_SYNC(&c->sb)) {
+ LIST_HEAD(journal);
+ struct bkey *k;
+ struct jset *j;
+
+ err = "cannot allocate memory for journal";
+ if (bcache_journal_read(c, &journal, &op))
+ goto err;
+
+ printk(KERN_DEBUG "bcache: btree_journal_read() done\n");
+
+ err = "no journal entries found";
+ if (list_empty(&journal))
+ goto err;
+
+ j = &list_entry(journal.prev, struct journal_replay, list)->j;
+
+ err = "IO error reading priorities";
+ for_each_cache(ca, c) {
+ if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
+ goto err;
+ }
+
+ k = &j->btree_root;
+
+ err = "bad btree root";
+ if (__ptr_invalid(c, j->btree_level + 1, k))
+ goto err;
+
+ err = "error reading btree root";
+ c->root = get_bucket(c, k, j->btree_level, &op);
+ if (IS_ERR_OR_NULL(c->root))
+ goto err;
+
+ list_del_init(&c->root->list);
+ rw_unlock(true, c->root);
+
+ err = uuid_read(c, j, &op.cl);
+ if (err)
+ goto err;
+
+ err = "error in recovery";
+ if (btree_check(c, &op))
+ goto err;
+
+ bcache_journal_mark(c, &journal);
+ btree_gc_finish(c);
+ printk(KERN_DEBUG "bcache: btree_check() done\n");
+
+ /*
+ * bcache_journal_next() can't happen sooner, or
+ * btree_gc_finish() will give spurious errors about last_gc >
+ * gc_gen - this is a hack but oh well.
+ */
+ bcache_journal_next(&c->journal);
+
+ /*
+ * First place it's safe to allocate: btree_check() and
+ * btree_gc_finish() have to run before we have buckets to
+ * allocate, and pop_bucket() might cause a journal entry to be
+ * written so bcache_journal_next() has to be called first
+ *
+ * If the uuids were in the old format we have to rewrite them
+ * before the next journal entry is written:
+ */
+ if (j->version < BCACHE_JSET_VERSION_UUID)
+ __uuid_write(c);
+
+ bcache_journal_replay(c, &journal, &op);
+ } else {
+ printk(KERN_NOTICE "bcache: invalidating existing data\n");
+ /* Don't want invalidate_buckets() to queue a gc yet */
+ closure_lock(&c->gc, NULL);
+
+ for_each_cache(ca, c) {
+ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+ 2, SB_JOURNAL_BUCKETS);
+
+ for (int i = 0; i < ca->sb.keys; i++)
+ ca->sb.d[i] = ca->sb.first_bucket + i;
+ }
+
+ btree_gc_finish(c);
+
+ err = "cannot allocate new UUID bucket";
+ if (uuid_write(c))
+ goto err_unlock_gc;
+
+ err = "cannot allocate new btree root";
+ c->root = bcache_btree_alloc(c, 0, &op.cl);
+ if (IS_ERR_OR_NULL(c->root))
+ goto err_unlock_gc;
+
+ bkey_copy_key(&c->root->key, &MAX_KEY);
+ btree_write(c->root, true, &op);
+
+ mutex_lock(&c->bucket_lock);
+ for_each_cache(ca, c) {
+ free_some_buckets(ca);
+ prio_write(ca);
+ }
+ mutex_unlock(&c->bucket_lock);
+
+ /*
+ * Wait for prio_write() to finish, so the SET_CACHE_SYNC()
+ * doesn't race
+ */
+ for_each_cache(ca, c)
+ closure_wait_event(&c->bucket_wait, &op.cl,
+ atomic_read(&ca->prio_written) == -1);
+
+ bcache_btree_set_root(c->root);
+ rw_unlock(true, c->root);
+
+ /*
+ * We don't want to write the first journal entry until
+ * everything is set up - fortunately journal entries won't be
+ * written until the SET_CACHE_SYNC() here:
+ */
+ SET_CACHE_SYNC(&c->sb, true);
+
+ bcache_journal_next(&c->journal);
+ bcache_journal_meta(c, &op.cl);
+
+ /* Unlock */
+ closure_set_stopped(&c->gc.cl);
+ closure_put(&c->gc.cl);
+ }
+
+ closure_sync(&op.cl);
+ c->sb.last_mount = get_seconds();
+ bcache_write_super(c);
+
+ list_for_each_entry_safe(d, t, &uncached_devices, list)
+ cached_dev_attach(d, c);
+
+ flash_devs_run(c);
+
+ return;
+err_unlock_gc:
+ closure_set_stopped(&c->gc.cl);
+ closure_put(&c->gc.cl);
+err:
+ closure_sync(&op.cl);
+ /* XXX: test this, it's broken */
+ cache_set_error(c, err);
+}
+
+static bool can_attach_cache(struct cache *ca, struct cache_set *c)
+{
+ return ca->sb.block_size == c->sb.block_size &&
+		ca->sb.bucket_size == c->sb.bucket_size &&
+ ca->sb.nr_in_set == c->sb.nr_in_set;
+}
+
+static const char *register_cache_set(struct cache *ca)
+{
+ char buf[12];
+ const char *err = "cannot allocate memory";
+ struct cache_set *c;
+
+ list_for_each_entry(c, &cache_sets, list)
+ if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
+ if (c->cache[ca->sb.nr_this_dev])
+ return "duplicate cache set member";
+
+ if (!can_attach_cache(ca, c))
+ return "cache sb does not match set";
+
+ if (!CACHE_SYNC(&ca->sb))
+ SET_CACHE_SYNC(&c->sb, false);
+
+ goto found;
+ }
+
+ c = cache_set_alloc(&ca->sb);
+ if (!c)
+ return err;
+
+ err = "error creating kobject";
+ if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
+ kobject_add(&c->internal, &c->kobj, "internal"))
+ goto err;
+
+ if (add_cache_accounting_kobjs(&c->accounting, &c->kobj))
+ goto err;
+
+ list_add(&c->list, &cache_sets);
+found:
+ sprintf(buf, "cache%i", ca->sb.nr_this_dev);
+ if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
+ sysfs_create_link(&c->kobj, &ca->kobj, buf))
+ goto err;
+
+ if (ca->sb.seq > c->sb.seq) {
+ c->sb.version = ca->sb.version;
+ memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
+ c->sb.flags = ca->sb.flags;
+ c->sb.seq = ca->sb.seq;
+ pr_debug("set version = %llu", c->sb.version);
+ }
+
+ ca->set = c;
+ ca->set->cache[ca->sb.nr_this_dev] = ca;
+ c->cache_by_alloc[c->caches_loaded++] = ca;
+
+ if (c->caches_loaded == c->sb.nr_in_set)
+ run_cache_set(c);
+
+ return NULL;
+err:
+ cache_set_unregister(c);
+ return err;
+}
+
+/* Cache device */
+
+static void cache_free(struct kobject *kobj)
+{
+ struct cache *c = container_of(kobj, struct cache, kobj);
+
+ if (c->set)
+ c->set->cache[c->sb.nr_this_dev] = NULL;
+
+ if (!IS_ERR_OR_NULL(c->debug))
+ debugfs_remove(c->debug);
+
+ free_discards(c);
+
+ if (c->prio_bio)
+ bio_put(c->prio_bio);
+ if (c->uuid_bio)
+ bio_put(c->uuid_bio);
+
+ free_pages((unsigned long) c->disk_buckets, ilog2(bucket_pages(c)));
+ kfree(c->prio_buckets);
+ vfree(c->buckets);
+
+ if (c->discard_page)
+ put_page(c->discard_page);
+
+ free_heap(&c->heap);
+ free_fifo(&c->unused);
+ free_fifo(&c->free_inc);
+ free_fifo(&c->free);
+
+ if (c->sb_bio.bi_inline_vecs[0].bv_page)
+ put_page(c->sb_bio.bi_io_vec[0].bv_page);
+
+ if (!IS_ERR_OR_NULL(c->bdev)) {
+ blk_sync_queue(bdev_get_queue(c->bdev));
+ blkdev_put(c->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ }
+
+ kfree(c);
+ module_put(THIS_MODULE);
+}
+
+static struct cache *cache_alloc(struct cache_sb *sb)
+{
+ size_t free;
+ struct bucket *b;
+ struct cache *c = kzalloc(sizeof(struct cache), GFP_KERNEL);
+ if (!c)
+ return NULL;
+
+ __module_get(THIS_MODULE);
+ cache_kobject_init(c);
+
+ memcpy(&c->sb, sb, sizeof(struct cache_sb));
+
+ INIT_LIST_HEAD(&c->discards);
+
+ bio_init(&c->sb_bio);
+ c->sb_bio.bi_max_vecs = 1;
+ c->sb_bio.bi_io_vec = c->sb_bio.bi_inline_vecs;
+
+ bio_init(&c->journal.bio);
+ c->journal.bio.bi_max_vecs = 8;
+ c->journal.bio.bi_io_vec = c->journal.bio.bi_inline_vecs;
+
+ free = roundup_pow_of_two(c->sb.nbuckets) >> 9;
+ free = max_t(size_t, free, 16);
+ free = max_t(size_t, free, prio_buckets(c) + 4);
+
+ if (!init_fifo(&c->free, free, GFP_KERNEL) ||
+ !init_fifo(&c->free_inc, free << 2, GFP_KERNEL) ||
+ !init_fifo(&c->unused, free << 2, GFP_KERNEL) ||
+ !init_heap(&c->heap, free << 3, GFP_KERNEL) ||
+ !(c->discard_page = alloc_page(__GFP_ZERO|GFP_KERNEL)) ||
+ !(c->buckets = vmalloc(sizeof(struct bucket) *
+ c->sb.nbuckets)) ||
+ !(c->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(c) *
+ 2, GFP_KERNEL)) ||
+ !(c->disk_buckets = alloc_bucket_pages(GFP_KERNEL, c)) ||
+ !(c->uuid_bio = bbio_kmalloc(GFP_KERNEL, bucket_pages(c))) ||
+ !(c->prio_bio = bio_kmalloc(GFP_KERNEL, bucket_pages(c))))
+ goto err;
+
+ c->prio_next = c->prio_buckets + prio_buckets(c);
+
+ memset(c->buckets, 0, c->sb.nbuckets * sizeof(struct bucket));
+ for_each_bucket(b, c)
+ atomic_set(&b->pin, 0);
+
+ if (alloc_discards(c))
+ goto err;
+
+ return c;
+err:
+ kobject_put(&c->kobj);
+ return NULL;
+}
+
+static const char *register_cache(struct cache_sb *sb, struct page *sb_page,
+ struct block_device *bdev)
+{
+ char name[BDEVNAME_SIZE];
+ const char *err = "cannot allocate memory";
+ struct cache *c = cache_alloc(sb);
+ if (!c)
+ return err;
+
+ c->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ c->bdev = bdev;
+ c->bdev->bd_holder = c;
+
+ if (blk_queue_discard(bdev_get_queue(c->bdev)))
+ c->discard = CACHE_DISCARD(&c->sb);
+
+ err = "error creating kobject";
+ if (kobject_add(&c->kobj, &disk_to_dev(bdev->bd_disk)->kobj, "bcache"))
+ goto err;
+
+ err = register_cache_set(c);
+ if (err)
+ goto err;
+
+ bcache_debug_init_cache(c);
+
+ printk(KERN_DEBUG "bcache: registered cache device %s\n",
+ bdevname(bdev, name));
+
+ return NULL;
+err:
+ kobject_put(&c->kobj);
+ printk(KERN_DEBUG "bcache: error opening %s: %s\n",
+ bdevname(bdev, name), err);
+ /* Return NULL instead of an error because kobject_put() cleans
+ * everything up
+ */
+ return NULL;
+}
+
+/* Global interfaces/init */
+
+static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
+ const char *, size_t);
+
+kobj_attribute_write(register, register_bcache);
+kobj_attribute_write(register_quiet, register_bcache);
+
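+/*
+ * Handles writes to /sys/fs/bcache/register and register_quiet - the buffer
+ * is the path of the device to register.
+ */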
+static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size)
+{
+ ssize_t ret = size;
+ const char *err = "cannot allocate memory";
+ char *path = NULL;
+ struct cache_sb *sb = NULL;
+ struct block_device *bdev = NULL;
+ struct page *sb_page = NULL;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+
+	mutex_lock(&register_lock);
+
+ if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
+ !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
+ goto err;
+
+ err = "failed to open device";
+ bdev = blkdev_get_by_path(strim(path),
+ FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ sb);
+ if (bdev == ERR_PTR(-EBUSY))
+ err = "device busy";
+
+ if (IS_ERR(bdev) ||
+ set_blocksize(bdev, 4096))
+ goto err;
+
+ err = read_super(sb, bdev, &sb_page);
+ if (err)
+ goto err_close;
+
+ if (sb->version == CACHE_BACKING_DEV)
+ err = register_bdev(sb, sb_page, bdev);
+ else
+ err = register_cache(sb, sb_page, bdev);
+
+ if (err) {
+ /* register_(bdev|cache) will only return an error if they
+ * didn't get far enough to create the kobject - if they did,
+ * the kobject destructor will do this cleanup.
+ */
+ put_page(sb_page);
+err_close:
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+err:
+ if (attr != &ksysfs_register_quiet)
+ printk(KERN_DEBUG "bcache: error opening %s: %s\n",
+ path, err);
+ ret = -EINVAL;
+ }
+
+ kfree(sb);
+ kfree(path);
+	mutex_unlock(&register_lock);
+ module_put(THIS_MODULE);
+ return ret;
+}
+
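+/*
+ * On halt/reboot/power off, stop every cache set and backing device and wait
+ * up to two seconds for them to finish closing.
+ */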
+static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
+{
+ if (code == SYS_DOWN ||
+ code == SYS_HALT ||
+ code == SYS_POWER_OFF) {
+ DEFINE_WAIT(wait);
+ unsigned long start = jiffies;
+ bool stopped = false;
+
+ struct cache_set *c, *tc;
+ struct cached_dev *dc, *tdc;
+
+		mutex_lock(&register_lock);
+
+ if (list_empty(&cache_sets) && list_empty(&uncached_devices))
+ goto out;
+
+ printk(KERN_INFO "bcache: Stopping all devices:\n");
+
+ list_for_each_entry_safe(c, tc, &cache_sets, list)
+ cache_set_stop(c);
+
+ list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
+ bcache_device_stop(&dc->disk);
+
+ /* What's a condition variable? */
+ while (1) {
+ long timeout = start + 2 * HZ - jiffies;
+
+ stopped = list_empty(&cache_sets) &&
+ list_empty(&uncached_devices);
+
+ if (timeout < 0 || stopped)
+ break;
+
+ prepare_to_wait(&unregister_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+			mutex_unlock(&register_lock);
+ schedule_timeout(timeout);
+			mutex_lock(&register_lock);
+ }
+
+ finish_wait(&unregister_wait, &wait);
+
+ printk(KERN_INFO "bcache: %s\n", stopped
+ ? "All devices stopped"
+ : "Timeout waiting for devices to be closed");
+out:
+		mutex_unlock(&register_lock);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block reboot = {
+ .notifier_call = bcache_reboot,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static void bcache_exit(void)
+{
+ bcache_debug_exit();
+ bcache_writeback_exit();
+ bcache_request_exit();
+ bcache_btree_exit();
+ if (bcache_kobj)
+ kobject_put(bcache_kobj);
+ if (bcache_wq)
+ destroy_workqueue(bcache_wq);
+ unregister_blkdev(bcache_major, "bcache");
+ unregister_reboot_notifier(&reboot);
+}
+
+static int __init bcache_init(void)
+{
+ static const struct attribute *files[] = {
+ &ksysfs_register.attr,
+ &ksysfs_register_quiet.attr,
+ NULL
+ };
+
+	mutex_init(&register_lock);
+ init_waitqueue_head(&unregister_wait);
+ register_reboot_notifier(&reboot);
+
+ bcache_major = register_blkdev(0, "bcache");
+ if (bcache_major < 0)
+ return bcache_major;
+
+ if (!(bcache_wq = create_workqueue("bcache")) ||
+ !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
+ sysfs_create_files(bcache_kobj, files) ||
+ bcache_btree_init() ||
+ bcache_request_init() ||
+ bcache_writeback_init() ||
+ bcache_debug_init(bcache_kobj))
+ goto err;
+
+ return 0;
+err:
+ bcache_exit();
+ return -ENOMEM;
+}
+
+module_exit(bcache_exit);
+module_init(bcache_init);
diff --git a/drivers/block/bcache/sysfs.c b/drivers/block/bcache/sysfs.c
new file mode 100644
index 0000000..3ead3ba
--- /dev/null
+++ b/drivers/block/bcache/sysfs.c
@@ -0,0 +1,802 @@
+write_attribute(attach);
+write_attribute(detach);
+write_attribute(unregister);
+write_attribute(stop);
+write_attribute(clear_stats);
+write_attribute(trigger_gc);
+write_attribute(prune_cache);
+write_attribute(flash_vol_create);
+
+read_attribute(bucket_size);
+read_attribute(block_size);
+read_attribute(nbuckets);
+read_attribute(tree_depth);
+read_attribute(root_usage_percent);
+read_attribute(priority_stats);
+read_attribute(btree_cache_size);
+read_attribute(btree_cache_max_chain);
+read_attribute(cache_available_percent);
+read_attribute(written);
+read_attribute(btree_written);
+read_attribute(metadata_written);
+read_attribute(active_journal_entries);
+
+sysfs_time_stats_attribute(btree_gc, sec, ms);
+sysfs_time_stats_attribute(btree_split, sec, us);
+sysfs_time_stats_attribute(btree_sort, ms, us);
+sysfs_time_stats_attribute(btree_read, ms, us);
+sysfs_time_stats_attribute(try_harder, ms, us);
+
+read_attribute(btree_nodes);
+read_attribute(btree_used_percent);
+read_attribute(average_key_size);
+read_attribute(dirty_data);
+read_attribute(bset_tree_stats);
+
+read_attribute(state);
+read_attribute(cache_read_races);
+read_attribute(writeback_keys_done);
+read_attribute(writeback_keys_failed);
+read_attribute(io_errors);
+read_attribute(congested);
+rw_attribute(congested_read_threshold_us);
+rw_attribute(congested_write_threshold_us);
+
+rw_attribute(sequential_cutoff);
+rw_attribute(sequential_merge);
+rw_attribute(data_csum);
+rw_attribute(cache_mode);
+rw_attribute(writeback_metadata);
+rw_attribute(writeback_running);
+rw_attribute(writeback_percent);
+rw_attribute(writeback_delay);
+rw_attribute(writeback_rate);
+
+rw_attribute(writeback_rate_update_seconds);
+rw_attribute(writeback_rate_d_term);
+rw_attribute(writeback_rate_p_term_inverse);
+rw_attribute(writeback_rate_d_smooth);
+read_attribute(writeback_rate_debug);
+
+rw_attribute(synchronous);
+rw_attribute(journal_delay_ms);
+rw_attribute(discard);
+rw_attribute(running);
+rw_attribute(label);
+rw_attribute(readahead);
+rw_attribute(io_error_limit);
+rw_attribute(io_error_halflife);
+rw_attribute(verify);
+rw_attribute(key_merging_disabled);
+rw_attribute(gc_always_rewrite);
+rw_attribute(freelist_percent);
+rw_attribute(cache_replacement_policy);
+rw_attribute(btree_shrinker_disabled);
+rw_attribute(size);
+
+static void unregister_fake(struct kobject *k)
+{
+}
+
+SHOW(__cached_dev)
+{
+ struct cached_dev *d = container_of(kobj, struct cached_dev, disk.kobj);
+ const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
+
+#define var(stat) (d->stat)
+
+ if (attr == &sysfs_cache_mode)
+ return sprint_string_list(buf, bcache_cache_modes + 1,
+ BDEV_CACHE_MODE(&d->sb));
+
+ sysfs_printf(data_csum, "%i", d->disk.data_csum);
+ var_printf(verify, "%i");
+ var_printf(writeback_metadata, "%i");
+ var_printf(writeback_running, "%i");
+ var_print(writeback_delay);
+ var_print(writeback_percent);
+ var_print(writeback_rate);
+
+ var_print(writeback_rate_update_seconds);
+ var_print(writeback_rate_d_term);
+ var_print(writeback_rate_p_term_inverse);
+ var_print(writeback_rate_d_smooth);
+
+ if (attr == &sysfs_writeback_rate_debug) {
+ char dirty[20];
+ char derivative[20];
+ char target[20];
+ hprint(dirty,
+ atomic_long_read(&d->disk.sectors_dirty) << 9);
+ hprint(derivative, d->writeback_rate_derivative << 9);
+ hprint(target, d->writeback_rate_target << 9);
+
+ return sprintf(buf,
+ "rate:\t\t%u\n"
+ "change:\t\t%i\n"
+ "dirty:\t\t%s\n"
+ "derivative:\t%s\n"
+ "target:\t\t%s\n",
+ d->writeback_rate,
+ d->writeback_rate_change,
+ dirty, derivative, target);
+ }
+
+ sysfs_hprint(dirty_data,
+ atomic_long_read(&d->disk.sectors_dirty) << 9);
+
+ var_printf(sequential_merge, "%i");
+ var_hprint(sequential_cutoff);
+ var_hprint(readahead);
+
+ sysfs_print(running, atomic_read(&d->running));
+ sysfs_print(state, states[BDEV_STATE(&d->sb)]);
+
+ if (attr == &sysfs_label) {
+ memcpy(buf, d->sb.label, SB_LABEL_SIZE);
+		buf[SB_LABEL_SIZE] = '\0';
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+#undef var
+ return 0;
+}
+SHOW_LOCKED(cached_dev)
+
+STORE(__cached_dev)
+{
+ struct cached_dev *d = container_of(kobj, struct cached_dev, disk.kobj);
+	ssize_t v = size;
+ struct cache_set *c;
+
+#define d_strtoul(var) sysfs_strtoul(var, d->var)
+#define d_strtoi_h(var) sysfs_hatoi(var, d->var)
+
+ sysfs_strtoul(data_csum, d->disk.data_csum);
+ d_strtoul(verify);
+ d_strtoul(writeback_metadata);
+ d_strtoul(writeback_running);
+ d_strtoul(writeback_delay);
+ sysfs_strtoul_clamp(writeback_rate, d->writeback_rate, 1, 1000000);
+ sysfs_strtoul_clamp(writeback_percent, d->writeback_percent, 0, 40);
+
+ d_strtoul(writeback_rate_update_seconds);
+ d_strtoul(writeback_rate_d_term);
+ d_strtoul(writeback_rate_p_term_inverse);
+ sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
+ d->writeback_rate_p_term_inverse, 1, INT_MAX);
+ d_strtoul(writeback_rate_d_smooth);
+
+ d_strtoul(sequential_merge);
+ d_strtoi_h(sequential_cutoff);
+ d_strtoi_h(readahead);
+
+ if (attr == &sysfs_clear_stats)
+ clear_stats(&d->accounting);
+
+ if (attr == &sysfs_running &&
+ strtoul_or_return(buf))
+ cached_dev_run(d);
+
+ if (attr == &sysfs_cache_mode) {
+ ssize_t v = read_string_list(buf, bcache_cache_modes + 1);
+
+ if (v < 0)
+ return v;
+
+ if ((unsigned) v != BDEV_CACHE_MODE(&d->sb)) {
+ SET_BDEV_CACHE_MODE(&d->sb, v);
+ write_bdev_super(d, NULL);
+ }
+ }
+
+ if (attr == &sysfs_label) {
+ memcpy(d->sb.label, buf, SB_LABEL_SIZE);
+ write_bdev_super(d, NULL);
+ if (d->disk.c) {
+ memcpy(d->disk.c->uuids[d->disk.id].label,
+ buf, SB_LABEL_SIZE);
+ uuid_write(d->disk.c);
+ }
+ }
+
+ if (attr == &sysfs_attach) {
+ if (parse_uuid(buf, d->sb.set_uuid) < 16)
+ return -EINVAL;
+
+ list_for_each_entry(c, &cache_sets, list) {
+ v = cached_dev_attach(d, c);
+ if (!v)
+ return size;
+ }
+ size = v;
+ }
+
+ if (attr == &sysfs_detach && d->disk.c)
+ cached_dev_detach(d);
+
+ if (attr == &sysfs_stop)
+ bcache_device_stop(&d->disk);
+
+ return size;
+}
+
+STORE(cached_dev)
+{
+ struct cached_dev *dc = container_of(kobj, struct cached_dev,
+ disk.kobj);
+
+	mutex_lock(&register_lock);
+ size = __cached_dev_store(kobj, attr, buf, size);
+
+ if (attr == &sysfs_writeback_running)
+ bcache_writeback_queue(dc);
+
+ if (attr == &sysfs_writeback_percent)
+ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
+
+	mutex_unlock(&register_lock);
+ return size;
+}
+
+static void cached_dev_kobject_init(struct cached_dev *dc)
+{
+ static struct attribute *cached_dev_files[] = {
+ &sysfs_attach,
+ &sysfs_detach,
+ &sysfs_stop,
+#if 0
+ &sysfs_data_csum,
+#endif
+ &sysfs_cache_mode,
+ &sysfs_writeback_metadata,
+ &sysfs_writeback_running,
+ &sysfs_writeback_delay,
+ &sysfs_writeback_percent,
+ &sysfs_writeback_rate,
+ &sysfs_writeback_rate_update_seconds,
+ &sysfs_writeback_rate_d_term,
+ &sysfs_writeback_rate_p_term_inverse,
+ &sysfs_writeback_rate_d_smooth,
+ &sysfs_writeback_rate_debug,
+ &sysfs_dirty_data,
+ &sysfs_sequential_cutoff,
+ &sysfs_sequential_merge,
+ &sysfs_clear_stats,
+ &sysfs_running,
+ &sysfs_state,
+ &sysfs_label,
+ &sysfs_readahead,
+#ifdef CONFIG_BCACHE_DEBUG
+ &sysfs_verify,
+#endif
+ NULL
+ };
+ KTYPE(cached_dev, __cached_dev_free);
+
+ kobject_init(&dc->disk.kobj, &cached_dev_obj);
+}
+
+SHOW(flash_dev)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+ struct uuid_entry *u = &d->c->uuids[d->id];
+
+ sysfs_printf(data_csum, "%i", d->data_csum);
+ sysfs_hprint(size, u->sectors << 9);
+
+ if (attr == &sysfs_label) {
+ memcpy(buf, u->label, SB_LABEL_SIZE);
+		buf[SB_LABEL_SIZE] = '\0';
+ strcat(buf, "\n");
+ return strlen(buf);
+ }
+
+ return 0;
+}
+
+STORE(__flash_dev)
+{
+ struct bcache_device *d = container_of(kobj, struct bcache_device,
+ kobj);
+ struct uuid_entry *u = &d->c->uuids[d->id];
+
+ sysfs_strtoul(data_csum, d->data_csum);
+
+ if (attr == &sysfs_size) {
+ uint64_t v;
+ strtoi_h_or_return(buf, v);
+
+ u->sectors = v >> 9;
+ uuid_write(d->c);
+ set_capacity(d->disk, u->sectors);
+ }
+
+ if (attr == &sysfs_label) {
+ memcpy(u->label, buf, SB_LABEL_SIZE);
+ uuid_write(d->c);
+ }
+
+ if (attr == &sysfs_unregister) {
+ atomic_set(&d->detaching, 1);
+ bcache_device_stop(d);
+ }
+
+ return size;
+}
+STORE_LOCKED(flash_dev)
+
+static void flash_dev_kobject_init(struct bcache_device *d)
+{
+ static struct attribute *flash_dev_files[] = {
+ &sysfs_unregister,
+#if 0
+ &sysfs_data_csum,
+#endif
+ &sysfs_label,
+ &sysfs_size,
+ NULL
+ };
+ KTYPE(flash_dev, __flash_dev_free);
+
+ kobject_init(&d->kobj, &flash_dev_obj);
+}
+
+SHOW(__cache_set)
+{
+ unsigned root_usage(struct cache_set *c)
+ {
+ unsigned bytes = 0;
+ struct bkey *k;
+ struct btree *b;
+ goto lock_root;
+
+ do {
+ rw_unlock(false, b);
+lock_root:
+ b = c->root;
+ rw_lock(false, b, b->level);
+ } while (b != c->root);
+
+ for_each_key_filter(b, k, ptr_bad)
+ bytes += bkey_bytes(k);
+
+ rw_unlock(false, b);
+
+ return (bytes * 100) / btree_bytes(c);
+ }
+
+ size_t cache_size(struct cache_set *c)
+ {
+ size_t ret = 0;
+ struct btree *b;
+
+ mutex_lock(&c->bucket_lock);
+ list_for_each_entry(b, &c->btree_cache, list)
+ ret += 1 << (b->page_order + PAGE_SHIFT);
+
+ mutex_unlock(&c->bucket_lock);
+ return ret;
+ }
+
+ unsigned cache_max_chain(struct cache_set *c)
+ {
+ unsigned ret = 0;
+ mutex_lock(&c->bucket_lock);
+
+ for (struct hlist_head *h = c->bucket_hash;
+ h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
+ h++) {
+ unsigned i = 0;
+ struct hlist_node *p;
+
+ hlist_for_each(p, h)
+ i++;
+
+ ret = max(ret, i);
+ }
+
+ mutex_unlock(&c->bucket_lock);
+ return ret;
+ }
+
+ unsigned btree_used(struct cache_set *c)
+ {
+ return div64_u64(c->gc_stats.key_bytes * 100,
+ (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+ }
+
+ unsigned average_key_size(struct cache_set *c)
+ {
+ return c->gc_stats.nkeys
+ ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+ : 0;
+ }
+
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ sysfs_print(synchronous, CACHE_SYNC(&c->sb));
+ sysfs_print(journal_delay_ms, c->journal_delay_ms);
+ sysfs_hprint(bucket_size, bucket_bytes(c));
+ sysfs_hprint(block_size, block_bytes(c));
+ sysfs_print(tree_depth, c->root->level);
+ sysfs_print(root_usage_percent, root_usage(c));
+
+ sysfs_hprint(btree_cache_size, cache_size(c));
+ sysfs_print(btree_cache_max_chain, cache_max_chain(c));
+ sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
+
+ sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
+ sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
+ sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us);
+ sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
+ sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
+
+ sysfs_print(btree_used_percent, btree_used(c));
+ sysfs_print(btree_nodes, c->gc_stats.nodes);
+ sysfs_hprint(dirty_data, c->gc_stats.dirty);
+ sysfs_hprint(average_key_size, average_key_size(c));
+
+ sysfs_print(cache_read_races,
+ atomic_long_read(&c->cache_read_races));
+
+ sysfs_print(writeback_keys_done,
+ atomic_long_read(&c->writeback_keys_done));
+ sysfs_print(writeback_keys_failed,
+ atomic_long_read(&c->writeback_keys_failed));
+
+ /* See count_io_errors for why 88 */
+ sysfs_print(io_error_halflife, c->error_decay * 88);
+ sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
+
+ sysfs_hprint(congested,
+ ((uint64_t) bcache_get_congested(c)) << 9);
+ sysfs_print(congested_read_threshold_us,
+ c->congested_read_threshold_us);
+ sysfs_print(congested_write_threshold_us,
+ c->congested_write_threshold_us);
+
+ sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
+ sysfs_printf(verify, "%i", c->verify);
+ sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
+ sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
+ sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
+
+ if (attr == &sysfs_bset_tree_stats)
+ return bset_print_stats(c, buf);
+
+ return 0;
+}
+SHOW_LOCKED(cache_set)
+
+STORE(__cache_set)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ if (attr == &sysfs_unregister)
+ cache_set_unregister(c);
+
+ if (attr == &sysfs_stop)
+ cache_set_stop(c);
+
+ if (attr == &sysfs_synchronous) {
+ bool sync = strtoul_or_return(buf);
+
+ if (sync != CACHE_SYNC(&c->sb)) {
+ SET_CACHE_SYNC(&c->sb, sync);
+ bcache_write_super(c);
+ }
+ }
+
+ if (attr == &sysfs_flash_vol_create) {
+ int r;
+ uint64_t v;
+ strtoi_h_or_return(buf, v);
+
+ r = flash_dev_create(c, v);
+ if (r)
+ return r;
+ }
+
+ if (attr == &sysfs_clear_stats) {
+ atomic_long_set(&c->writeback_keys_done, 0);
+ atomic_long_set(&c->writeback_keys_failed, 0);
+
+ memset(&c->gc_stats, 0, sizeof(struct gc_stat));
+ clear_stats(&c->accounting);
+ }
+
+ if (attr == &sysfs_trigger_gc)
+ bcache_queue_gc(c);
+
+ if (attr == &sysfs_prune_cache) {
+ struct shrink_control sc;
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->shrink.shrink(&c->shrink, &sc);
+ }
+
+ sysfs_strtoul(congested_read_threshold_us,
+ c->congested_read_threshold_us);
+ sysfs_strtoul(congested_write_threshold_us,
+ c->congested_write_threshold_us);
+
+ if (attr == &sysfs_io_error_limit)
+ c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+
+ /* See count_io_errors() for why 88 */
+ if (attr == &sysfs_io_error_halflife)
+ c->error_decay = strtoul_or_return(buf) / 88;
+
+ sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
+ sysfs_strtoul(verify, c->verify);
+ sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
+ sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
+ sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
+
+ return size;
+}
+STORE_LOCKED(cache_set)
+
+SHOW(cache_set_internal)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ return cache_set_show(&c->kobj, attr, buf);
+}
+
+STORE(cache_set_internal)
+{
+ struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ return cache_set_store(&c->kobj, attr, buf, size);
+}
+
+static void cache_set_kobject_init(struct cache_set *c)
+{
+ static struct attribute *cache_set_files[] = {
+ &sysfs_unregister,
+ &sysfs_stop,
+ &sysfs_synchronous,
+ &sysfs_journal_delay_ms,
+ &sysfs_flash_vol_create,
+
+ &sysfs_bucket_size,
+ &sysfs_block_size,
+ &sysfs_tree_depth,
+ &sysfs_root_usage_percent,
+ &sysfs_btree_cache_size,
+ &sysfs_cache_available_percent,
+
+ &sysfs_average_key_size,
+ &sysfs_dirty_data,
+
+ &sysfs_io_error_limit,
+ &sysfs_io_error_halflife,
+ &sysfs_congested,
+ &sysfs_congested_read_threshold_us,
+ &sysfs_congested_write_threshold_us,
+ &sysfs_clear_stats,
+ NULL
+ };
+ KTYPE(cache_set, __cache_set_free);
+
+ static struct attribute *cache_set_internal_files[] = {
+ &sysfs_active_journal_entries,
+
+ sysfs_time_stats_attribute_list(btree_gc, sec, ms)
+ sysfs_time_stats_attribute_list(btree_split, sec, us)
+ sysfs_time_stats_attribute_list(btree_sort, ms, us)
+ sysfs_time_stats_attribute_list(btree_read, ms, us)
+ sysfs_time_stats_attribute_list(try_harder, ms, us)
+
+ &sysfs_btree_nodes,
+ &sysfs_btree_used_percent,
+ &sysfs_btree_cache_max_chain,
+
+ &sysfs_bset_tree_stats,
+ &sysfs_cache_read_races,
+ &sysfs_writeback_keys_done,
+ &sysfs_writeback_keys_failed,
+
+ &sysfs_trigger_gc,
+ &sysfs_prune_cache,
+#ifdef CONFIG_BCACHE_DEBUG
+ &sysfs_verify,
+ &sysfs_key_merging_disabled,
+#endif
+ &sysfs_gc_always_rewrite,
+ &sysfs_btree_shrinker_disabled,
+ NULL
+ };
+ KTYPE(cache_set_internal, unregister_fake);
+
+ kobject_init(&c->kobj, &cache_set_obj);
+ kobject_init(&c->internal, &cache_set_internal_obj);
+}
+
+SHOW(__cache)
+{
+ struct cache *c = container_of(kobj, struct cache, kobj);
+
+ sysfs_hprint(bucket_size, bucket_bytes(c));
+ sysfs_hprint(block_size, block_bytes(c));
+ sysfs_print(nbuckets, c->sb.nbuckets);
+ sysfs_print(discard, c->discard);
+ sysfs_hprint(written, atomic_long_read(&c->sectors_written) << 9);
+ sysfs_hprint(btree_written,
+ atomic_long_read(&c->btree_sectors_written) << 9);
+ sysfs_hprint(metadata_written,
+ (atomic_long_read(&c->meta_sectors_written) +
+ atomic_long_read(&c->btree_sectors_written)) << 9);
+
+ sysfs_print(io_errors,
+ atomic_read(&c->io_errors) >> IO_ERROR_SHIFT);
+
+ sysfs_print(freelist_percent, c->free.size * 100 /
+ ((size_t) c->sb.nbuckets));
+
+ if (attr == &sysfs_cache_replacement_policy)
+ return sprint_string_list(buf, cache_replacement_policies,
+ CACHE_REPLACEMENT(&c->sb));
+
+ if (attr == &sysfs_priority_stats) {
+ int cmp(const void *l, const void *r)
+ { return *((uint16_t *) r) - *((uint16_t *) l); }
+
+ /* Number of quantiles we compute */
+ const unsigned nq = 31;
+
+ size_t n = c->sb.nbuckets, i, unused, btree;
+ uint64_t sum = 0;
+ uint16_t q[nq], *p, *cached;
+ ssize_t ret;
+
+ cached = p = vmalloc(c->sb.nbuckets * sizeof(uint16_t));
+ if (!p)
+ return -ENOMEM;
+
+ mutex_lock(&c->set->bucket_lock);
+ for (i = c->sb.first_bucket; i < n; i++)
+ p[i] = c->buckets[i].prio;
+ mutex_unlock(&c->set->bucket_lock);
+
+ sort(p, n, sizeof(uint16_t), cmp, NULL);
+
+ while (n &&
+ !cached[n - 1])
+ --n;
+
+ unused = c->sb.nbuckets - n;
+
+ while (cached < p + n &&
+ *cached == btree_prio)
+ cached++;
+
+ btree = cached - p;
+ n -= btree;
+
+ for (i = 0; i < n; i++)
+ sum += initial_prio - cached[i];
+
+ if (n)
+ do_div(sum, n);
+
+ for (i = 0; i < nq; i++)
+ q[i] = initial_prio - cached[n * (i + 1) / (nq + 1)];
+
+ vfree(p);
+
+ ret = snprintf(buf, PAGE_SIZE,
+ "Unused: %zu%%\n"
+ "Metadata: %zu%%\n"
+ "Average: %llu\n"
+ "Sectors per Q: %zu\n"
+ "Quantiles: [",
+ unused * 100 / (size_t) c->sb.nbuckets,
+ btree * 100 / (size_t) c->sb.nbuckets, sum,
+ n * c->sb.bucket_size / (nq + 1));
+
+ for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++)
+ ret += snprintf(buf + ret, PAGE_SIZE - ret,
+ i < nq - 1 ? "%u " : "%u]\n", q[i]);
+
+ buf[PAGE_SIZE - 1] = '\0';
+ return ret;
+ }
+
+ return 0;
+}
+SHOW_LOCKED(cache)
+
+STORE(__cache)
+{
+ struct cache *c = container_of(kobj, struct cache, kobj);
+
+ if (attr == &sysfs_discard) {
+ bool v = strtoul_or_return(buf);
+
+ if (blk_queue_discard(bdev_get_queue(c->bdev)))
+ c->discard = v;
+
+ if (v != CACHE_DISCARD(&c->sb)) {
+ SET_CACHE_DISCARD(&c->sb, v);
+ bcache_write_super(c->set);
+ }
+ }
+
+ if (attr == &sysfs_cache_replacement_policy) {
+ ssize_t v = read_string_list(buf, cache_replacement_policies);
+
+ if (v < 0)
+ return v;
+
+ if ((unsigned) v != CACHE_REPLACEMENT(&c->sb)) {
+ mutex_lock(&c->set->bucket_lock);
+ SET_CACHE_REPLACEMENT(&c->sb, v);
+ mutex_unlock(&c->set->bucket_lock);
+
+ bcache_write_super(c->set);
+ }
+ }
+
+ if (attr == &sysfs_freelist_percent) {
+ DECLARE_FIFO(long, free);
+ long i;
+ size_t p = strtoul_or_return(buf);
+
+ p = clamp_t(size_t,
+ ((size_t) c->sb.nbuckets * p) / 100,
+ roundup_pow_of_two(c->sb.nbuckets) >> 9,
+ c->sb.nbuckets / 2);
+
+ if (!init_fifo_exact(&free, p, GFP_KERNEL))
+ return -ENOMEM;
+
+ mutex_lock(&c->set->bucket_lock);
+
+ fifo_move(&free, &c->free);
+ fifo_swap(&free, &c->free);
+
+ mutex_unlock(&c->set->bucket_lock);
+
+ while (fifo_pop(&free, i))
+ atomic_dec(&c->buckets[i].pin);
+
+ free_fifo(&free);
+ }
+
+ if (attr == &sysfs_clear_stats) {
+ atomic_long_set(&c->sectors_written, 0);
+ atomic_long_set(&c->btree_sectors_written, 0);
+ atomic_long_set(&c->meta_sectors_written, 0);
+ atomic_set(&c->io_count, 0);
+ atomic_set(&c->io_errors, 0);
+ }
+
+ return size;
+}
+STORE_LOCKED(cache)
+
+static void cache_kobject_init(struct cache *ca)
+{
+ static struct attribute *cache_files[] = {
+ &sysfs_bucket_size,
+ &sysfs_block_size,
+ &sysfs_nbuckets,
+ &sysfs_priority_stats,
+ &sysfs_discard,
+ &sysfs_written,
+ &sysfs_btree_written,
+ &sysfs_metadata_written,
+ &sysfs_io_errors,
+ &sysfs_clear_stats,
+ &sysfs_freelist_percent,
+ &sysfs_cache_replacement_policy,
+ NULL
+ };
+ KTYPE(cache, cache_free);
+
+ kobject_init(&ca->kobj, &cache_obj);
+}
diff --git a/drivers/block/bcache/sysfs.h b/drivers/block/bcache/sysfs.h
new file mode 100644
index 0000000..214699f
--- /dev/null
+++ b/drivers/block/bcache/sysfs.h
@@ -0,0 +1,99 @@
+#ifndef _BCACHE_SYSFS_H_
+#define _BCACHE_SYSFS_H_
+
+#define KTYPE(type, _release) \
+static const struct sysfs_ops type ## _ops = { \
+ .show = type ## _show, \
+ .store = type ## _store \
+}; \
+static struct kobj_type type ## _obj = { \
+ .release = _release, \
+ .sysfs_ops = &type ## _ops, \
+ .default_attrs = type ## _files \
+}
+
+#define SHOW(fn) \
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+ char *buf) \
+
+#define STORE(fn) \
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size) \
+
+#define SHOW_LOCKED(fn) \
+SHOW(fn) \
+{ \
+ ssize_t ret; \
+	mutex_lock(&register_lock);					\
+ ret = __ ## fn ## _show(kobj, attr, buf); \
+	mutex_unlock(&register_lock);					\
+ return ret; \
+}
+
+#define STORE_LOCKED(fn) \
+STORE(fn) \
+{ \
+ ssize_t ret; \
+	mutex_lock(&register_lock);					\
+ ret = __ ## fn ## _store(kobj, attr, buf, size); \
+	mutex_unlock(&register_lock);					\
+ return ret; \
+}
+
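+/*
+ * The macros above are used like this (see sysfs.c): define __foo_show() and
+ * __foo_store() with SHOW(__foo)/STORE(__foo), wrap them with
+ * SHOW_LOCKED(foo)/STORE_LOCKED(foo) to take register_lock, declare a NULL
+ * terminated foo_files[] attribute array, then instantiate the kobj_type
+ * with KTYPE(foo, <release fn>).
+ */
+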
+#define __sysfs_attribute(_name, _mode) \
+ static struct attribute sysfs_##_name = \
+ { .name = #_name, .mode = _mode }
+
+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
+
+#define sysfs_printf(file, fmt, ...) \
+ if (attr == &sysfs_ ## file) \
+ return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__)
+
+#define sysfs_print(file, var) \
+ if (attr == &sysfs_ ## file) \
+ return snprint(buf, PAGE_SIZE, var)
+
+#define sysfs_hprint(file, val) \
+ if (attr == &sysfs_ ## file) { \
+ ssize_t ret = hprint(buf, val); \
+ strcat(buf, "\n"); \
+ return ret + 1; \
+ }
+
+#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
+#define var_print(_var) sysfs_print(_var, var(_var))
+#define var_hprint(_var) sysfs_hprint(_var, var(_var))
+
+#define sysfs_strtoul(file, var) \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe(buf, var) ?: (ssize_t) size;
+
+#define sysfs_strtoul_clamp(file, var, min, max) \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe_clamp(buf, var, min, max) \
+ ?: (ssize_t) size;
+
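+/*
+ * Parse a decimal number from a sysfs write; on failure return the error
+ * from the enclosing store function.
+ */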
+#define strtoul_or_return(cp) \
+({ \
+ unsigned long _v; \
+ int _r = strict_strtoul(cp, 10, &_v); \
+ if (_r) \
+ return _r; \
+ _v; \
+})
+
+#define strtoi_h_or_return(cp, v) \
+do { \
+ int _r = strtoi_h(cp, &v); \
+ if (_r) \
+ return _r; \
+} while (0)
+
+#define sysfs_hatoi(file, var) \
+ if (attr == &sysfs_ ## file) \
+ return strtoi_h(buf, &var) ?: (ssize_t) size;
+
+#endif /* _BCACHE_SYSFS_H_ */
--
1.7.9.rc2
^ permalink raw reply related [flat|nested] 87+ messages in thread
* [Bcache v13 11/16] bcache: Core btree code
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (5 preceding siblings ...)
2012-05-10 3:10 ` [Bcache v13 10/16] bcache: Superblock/initialization/sysfs code Kent Overstreet
@ 2012-05-10 3:10 ` Kent Overstreet
[not found] ` <7f1de39b6d7040b3fe271500776f4b985b21ea82.1336619038.git.koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-05-10 3:11 ` [Bcache v13 12/16] bcache: Bset code (lookups within a btree node) Kent Overstreet
` (6 subsequent siblings)
13 siblings, 1 reply; 87+ messages in thread
From: Kent Overstreet @ 2012-05-10 3:10 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/block/bcache/bcache.h | 839 +++++++++++++++
drivers/block/bcache/btree.c | 2249 +++++++++++++++++++++++++++++++++++++++++
drivers/block/bcache/btree.h | 272 +++++
3 files changed, 3360 insertions(+), 0 deletions(-)
create mode 100644 drivers/block/bcache/bcache.h
create mode 100644 drivers/block/bcache/btree.c
create mode 100644 drivers/block/bcache/btree.h
diff --git a/drivers/block/bcache/bcache.h b/drivers/block/bcache/bcache.h
new file mode 100644
index 0000000..aad9c48
--- /dev/null
+++ b/drivers/block/bcache/bcache.h
@@ -0,0 +1,839 @@
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include <linux/bio.h>
+#include <linux/blktrace_api.h>
+#include <linux/closure.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "util.h"
+
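+/*
+ * Per bucket state kept in memory; prio and gen are also written out to disk
+ * (see struct prio_set below and prio_write() in super.c).
+ */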
+struct bucket {
+ atomic_t pin;
+ uint16_t prio;
+ uint8_t gen;
+ uint8_t disk_gen;
+ uint8_t last_gc; /* Most out of date gen in the btree */
+ uint8_t gc_gen;
+
+#define GC_MARK_DIRTY -1
+#define GC_MARK_BTREE -2
+ short mark;
+};
+
+struct bkey {
+ uint64_t header;
+ uint64_t key;
+ uint64_t ptr[];
+};
+
+#define BKEY_PADDED(key) \
+ union { struct bkey key; uint64_t key ## _pad[8]; }
+
+/* Version 1: Backing device
+ * Version 2: Seed pointer into btree node checksum
+ * Version 3: New UUID format
+ */
+#define BCACHE_SB_VERSION 3
+
+#define SB_SECTOR 8
+#define SB_SIZE 4096
+#define SB_LABEL_SIZE 32
+#define SB_JOURNAL_BUCKETS 256
+/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
+#define MAX_CACHES_PER_SET 8
+
+struct cache_sb {
+ uint64_t csum;
+ uint64_t offset; /* sector where this sb was written */
+ uint64_t version;
+#define CACHE_BACKING_DEV 1
+
+ uint8_t magic[16];
+
+ uint8_t uuid[16];
+ union {
+ uint8_t set_uuid[16];
+ uint64_t set_magic;
+ };
+ uint8_t label[SB_LABEL_SIZE];
+
+ uint64_t flags;
+ uint64_t seq;
+ uint64_t pad[8];
+
+ uint64_t nbuckets; /* device size */
+ uint16_t block_size; /* sectors */
+ uint16_t bucket_size; /* sectors */
+
+ uint16_t nr_in_set;
+ uint16_t nr_this_dev;
+
+ uint32_t last_mount; /* time_t */
+
+ uint16_t first_bucket;
+ union {
+ uint16_t njournal_buckets;
+ uint16_t keys;
+ };
+ uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */
+};
+
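+/*
+ * BITMASK() generates CACHE_SYNC()/SET_CACHE_SYNC() style get/set accessors
+ * for a range of bits within the given field.
+ */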
+BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
+BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
+BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
+#define CACHE_REPLACEMENT_LRU 0U
+#define CACHE_REPLACEMENT_FIFO 1U
+#define CACHE_REPLACEMENT_RANDOM 2U
+
+BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
+#define CACHE_MODE_WRITETHROUGH 0U
+#define CACHE_MODE_WRITEBACK 1U
+#define CACHE_MODE_WRITEAROUND 2U
+#define CACHE_MODE_NONE 3U
+BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
+#define BDEV_STATE_NONE 0U
+#define BDEV_STATE_CLEAN 1U
+#define BDEV_STATE_DIRTY 2U
+#define BDEV_STATE_STALE 3U
+
+/* Version 1: Seed pointer into btree node checksum
+ */
+#define BCACHE_BSET_VERSION 1
+
+/*
+ * This is the on disk format for btree nodes - a btree node on disk is a list
+ * of these; within each set the keys are sorted
+ */
+struct bset {
+ uint64_t csum;
+ uint64_t magic;
+ uint64_t seq;
+ uint32_t version;
+ uint32_t keys;
+
+ union {
+ struct bkey start[0];
+ uint64_t d[0];
+ };
+};
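+
+/*
+ * A btree node on disk is read back as a sequence of these, all sharing
+ * the same seq - see btree_read_done() in btree.c, which advances by
+ * set_blocks() for each one.
+ */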
+
+/*
+ * On disk format for priorities and gens - see super.c near prio_write() for
+ * more.
+ */
+struct prio_set {
+ uint64_t csum;
+ uint64_t magic;
+ uint64_t seq;
+ uint32_t version;
+ uint32_t pad;
+
+ uint64_t next_bucket;
+
+ struct bucket_disk {
+ uint16_t prio;
+ uint8_t gen;
+ } __attribute((packed)) data[];
+};
+
+#include "journal.h"
+#include "stats.h"
+struct search;
+struct btree;
+
+struct bcache_device {
+ struct closure cl;
+
+ struct kobject kobj;
+
+ struct cache_set *c;
+ unsigned id;
+#define BCACHEDEVNAME_SIZE 12
+ char name[BCACHEDEVNAME_SIZE];
+
+ struct gendisk *disk;
+
+ /* If nonzero, we're closing */
+ atomic_t closing;
+
+ /* If nonzero, we're detaching/unregistering from cache set */
+ atomic_t detaching;
+
+ atomic_long_t sectors_dirty;
+ unsigned long sectors_dirty_gc;
+ unsigned long sectors_dirty_last;
+ long sectors_dirty_derivative;
+
+ mempool_t *unaligned_bvec;
+ struct bio_set *bio_split;
+
+ unsigned data_csum:1;
+
+ int (*cache_miss)(struct btree *, struct search *, struct bio *, unsigned);
+ int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
+};
+
+struct io {
+ /* Used to track sequential IO so it can be skipped */
+ struct hlist_node hash;
+ struct list_head lru;
+
+ unsigned long jiffies;
+ unsigned sequential;
+ sector_t last;
+};
+
+struct dirty_io {
+ struct closure cl;
+ struct cached_dev *d;
+ struct bio bio;
+};
+
+struct dirty {
+ struct rb_node node;
+ BKEY_PADDED(key);
+ struct dirty_io *io;
+};
+
+struct cached_dev {
+ struct list_head list;
+ struct bcache_device disk;
+ struct block_device *bdev;
+
+ struct cache_sb sb;
+ struct bio sb_bio;
+ struct bio_vec sb_bv[1];
+ struct closure_with_waitlist sb_write;
+
+ /* Refcount on the cache set. Always nonzero when we're caching. */
+ atomic_t count;
+ struct work_struct detach;
+
+ /*
+ * Device might not be running if it's dirty and the cache set hasn't
+ * shown up yet.
+ */
+ atomic_t running;
+
+ mempool_t *bio_passthrough;
+
+ /*
+ * Writes take a shared lock from start to finish; scanning for dirty
+ * data to refill the rb tree requires an exclusive lock.
+ */
+ struct rw_semaphore writeback_lock;
+
+ /*
+ * Beginning and end of range in dirty rb tree - so that we can skip
+ * taking dirty_lock and checking the rb tree. Protected by
+ * writeback_lock.
+ */
+ sector_t writeback_start;
+ sector_t writeback_end;
+
+ struct rb_root dirty;
+ spinlock_t dirty_lock;
+
+ /*
+ * Nonzero, and writeback has a refcount (d->count), iff there is dirty
+ * data in the cache. Protected by writeback_lock; must hold a shared
+ * lock to set it and an exclusive lock to clear it.
+ */
+ atomic_t has_dirty;
+
+ uint64_t next_writeback_io;
+ struct delayed_work writeback_rate_update;
+
+ /*
+ * Internal to the writeback code, so refill_dirty() and read_dirty()
+ * can keep track of where they're at.
+ */
+ sector_t last_found;
+ sector_t last_read;
+
+ /* Number of writeback bios in flight */
+ atomic_t in_flight;
+ struct delayed_work refill_dirty;
+ struct delayed_work read_dirty;
+
+#define WRITEBACK_SLURP 100
+ DECLARE_ARRAY_ALLOCATOR(struct dirty, dirty_freelist, WRITEBACK_SLURP);
+
+ /* For tracking sequential IO */
+#define RECENT_IO_BITS 7
+#define RECENT_IO (1 << RECENT_IO_BITS)
+ struct io io[RECENT_IO];
+ struct hlist_head io_hash[RECENT_IO + 1];
+ struct list_head io_lru;
+ spinlock_t io_lock;
+
+ struct cache_accounting accounting;
+
+ /* The rest of this all shows up in sysfs */
+ unsigned sequential_cutoff;
+ unsigned readahead;
+
+ unsigned sequential_merge:1;
+ unsigned verify:1;
+
+ unsigned writeback_metadata:1;
+ unsigned writeback_running:1;
+ unsigned char writeback_percent;
+ unsigned writeback_delay;
+
+ unsigned writeback_rate;
+ int writeback_rate_change;
+ int64_t writeback_rate_derivative;
+ uint64_t writeback_rate_target;
+
+ unsigned writeback_rate_update_seconds;
+ unsigned writeback_rate_d_term;
+ unsigned writeback_rate_p_term_inverse;
+ unsigned writeback_rate_d_smooth;
+};
+
+struct cache {
+ struct cache_set *set;
+ struct cache_sb sb;
+ struct bio sb_bio;
+ struct bio_vec sb_bv[1];
+
+ struct kobject kobj;
+ struct block_device *bdev;
+
+ /* XXX: move to cache_set */
+ struct dentry *debug;
+
+ /* XXX: replace with bios allocated from bio_meta mempool */
+ struct bio *uuid_bio;
+
+ struct closure prio;
+ /* XXX: replace with bios allocated from bio_meta mempool */
+ struct bio *prio_bio;
+ struct prio_set *disk_buckets;
+
+ /*
+ * When allocating new buckets, prio_write() gets first dibs - since we
+ * may not be able to allocate at all without writing priorities and gens.
+ * prio_buckets[] contains the last buckets we wrote priorities to (so
+ * gc can mark them as metadata), prio_next[] contains the buckets
+ * allocated for the next prio write.
+ */
+ uint64_t *prio_buckets;
+ uint64_t *prio_next;
+ unsigned prio_write;
+ unsigned prio_alloc;
+
+ /* > 0: buckets in free_inc have been marked as free
+ * = 0: buckets in free_inc can't be used until priorities are written
+ * < 0: priority write in progress
+ */
+ atomic_t prio_written;
+
+ /* Allocation stuff: */
+ struct bucket *buckets;
+
+ DECLARE_HEAP(struct bucket *, heap);
+
+ /*
+ * max(gen - disk_gen) for all buckets. When it gets too big we have to
+ * call prio_write() to keep gens from wrapping.
+ */
+ uint8_t need_save_prio;
+
+ /*
+ * If nonzero, we know we aren't going to find any buckets to invalidate
+ * until a gc finishes - otherwise we could pointlessly burn a ton of
+ * cpu
+ */
+ unsigned invalidate_needs_gc:1;
+
+ size_t fifo_last_bucket;
+
+ DECLARE_FIFO(long, free);
+ DECLARE_FIFO(long, free_inc);
+ DECLARE_FIFO(long, unused);
+
+ bool discard; /* Get rid of? */
+ struct list_head discards;
+ struct page *discard_page;
+
+ struct journal_device journal;
+
+ /* The rest of this all shows up in sysfs */
+#define IO_ERROR_SHIFT 20
+ atomic_t io_errors;
+ atomic_t io_count;
+
+ atomic_long_t meta_sectors_written;
+ atomic_long_t btree_sectors_written;
+ atomic_long_t sectors_written;
+};
+
+struct gc_stat {
+ size_t nodes;
+ size_t key_bytes;
+
+ size_t nkeys;
+ uint64_t data; /* sectors */
+ uint64_t dirty; /* sectors */
+ unsigned in_use; /* percent */
+};
+
+struct cache_set {
+ struct closure cl;
+
+ struct list_head list;
+ struct kobject kobj;
+ struct kobject internal;
+ struct cache_accounting accounting;
+
+ /*
+ * If nonzero, we're trying to detach from all the devices we're
+ * caching; otherwise we're merely closing
+ */
+ atomic_t unregistering;
+ atomic_t closing;
+
+ struct cache_sb sb;
+
+ struct cache *cache[MAX_CACHES_PER_SET];
+ struct cache *cache_by_alloc[MAX_CACHES_PER_SET];
+ int caches_loaded;
+
+ struct bcache_device **devices;
+ struct list_head cached_devs;
+ uint64_t cached_dev_sectors;
+ struct closure caching;
+
+ struct closure_with_waitlist sb_write;
+
+ mempool_t *search;
+ mempool_t *bio_meta;
+ struct bio_set *bio_split;
+
+ /* For the btree cache */
+ struct shrinker shrink;
+
+ /* For the btree cache and anything allocation related */
+ struct mutex bucket_lock;
+
+ /* log2(bucket_size), in sectors */
+ unsigned short bucket_bits;
+
+ /* log2(block_size), in sectors */
+ unsigned short block_bits;
+
+ /*
+ * Default number of pages for a new btree node - may be less than a
+ * full bucket
+ */
+ unsigned btree_pages;
+
+ /*
+ * Lists of struct btree: btree_cache is the list of structs that have
+ * memory allocated for an actual btree node; btree_cache_freed is for
+ * structs that do not.
+ */
+ struct list_head btree_cache;
+ struct list_head btree_cache_freeable;
+ struct list_head btree_cache_freed;
+
+ /* Number of elements in btree_cache + btree_cache_freeable lists */
+ unsigned bucket_cache_used;
+
+ /*
+ * If we need to allocate memory for a new btree node and that
+ * allocation fails, we can cannibalize another node in the btree cache
+ * to satisfy the allocation. However, only one thread can be doing this
+ * at a time, for obvious reasons - try_harder and try_wait are
+ * basically a lock for this that we can wait on asynchronously. The
+ * btree_root() macro releases the lock when it returns.
+ */
+ struct closure *try_harder;
+ closure_list_t try_wait;
+ uint64_t try_harder_start;
+
+ /*
+ * When we free a btree node, we increment the gen of the bucket the
+ * node is in - but we can't rewrite the prios and gens until we've
+ * finished whatever it is we were doing; otherwise, after a crash, the
+ * btree node would have been freed, but (for, say, a split) the
+ * pointers to the new nodes might not have been inserted into the
+ * btree yet.
+ *
+ * This is a refcount that blocks prio_write() until the new keys are
+ * written.
+ */
+ atomic_t prio_blocked;
+ closure_list_t bucket_wait;
+
+ /*
+ * For any bio we don't skip we subtract the number of sectors from
+ * rescale; when it hits 0 we rescale all the bucket priorities.
+ */
+ atomic_t rescale;
+ /*
+ * When we invalidate buckets, we use both the priority and the amount
+ * of good data to determine which buckets to reuse first - to weight
+ * those together consistently we keep track of the smallest nonzero
+ * priority of any bucket.
+ */
+ uint16_t min_prio;
+
+ /*
+ * max(gen - gc_gen) for all buckets. When it gets too big we have to gc
+ * to keep gens from wrapping around.
+ */
+ uint8_t need_gc;
+ struct gc_stat gc_stats;
+ size_t nbuckets;
+
+ struct closure_with_waitlist gc;
+ /* Where in the btree gc currently is */
+ struct bkey gc_done;
+
+ /*
+ * The allocation code needs gc_mark in struct bucket to be correct, but
+ * it's not while a gc is in progress. Protected by bucket_lock.
+ */
+ int gc_mark_valid;
+
+ /* Counts how many sectors bio_insert has added to the cache */
+ atomic_t sectors_to_gc;
+
+ struct btree *root;
+
+#ifdef CONFIG_BCACHE_DEBUG
+ struct btree *verify_data;
+ struct mutex verify_lock;
+#endif
+
+ unsigned nr_uuids;
+ struct uuid_entry *uuids;
+ BKEY_PADDED(uuid_bucket);
+ struct closure_with_waitlist uuid_write;
+
+ /*
+ * A btree node on disk could have too many bsets for an iterator to fit
+ * on the stack - this is a single element mempool for btree_read_done()
+ */
+ struct mutex fill_lock;
+ struct btree_iter *fill_iter;
+
+ /*
+ * btree_sort() is a merge sort and requires temporary space - single
+ * element mempool
+ */
+ struct mutex sort_lock;
+ struct bset *sort;
+
+ /* List of buckets we're currently writing data to */
+ struct list_head data_buckets;
+ spinlock_t data_bucket_lock;
+
+ struct journal journal;
+
+#define CONGESTED_MAX 1024
+ unsigned congested_last_us;
+ atomic_t congested;
+
+ /* The rest of this all shows up in sysfs */
+ unsigned congested_read_threshold_us;
+ unsigned congested_write_threshold_us;
+
+ spinlock_t sort_time_lock;
+ struct time_stats sort_time;
+ struct time_stats btree_gc_time;
+ struct time_stats btree_split_time;
+ spinlock_t btree_read_time_lock;
+ struct time_stats btree_read_time;
+ struct time_stats try_harder_time;
+
+ atomic_long_t cache_read_races;
+ atomic_long_t writeback_keys_done;
+ atomic_long_t writeback_keys_failed;
+ unsigned error_limit;
+ unsigned error_decay;
+ unsigned short journal_delay_ms;
+ unsigned verify:1;
+ unsigned key_merging_disabled:1;
+ unsigned gc_always_rewrite:1;
+ unsigned shrinker_disabled:1;
+
+#define BUCKET_HASH_BITS 12
+ struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
+};
+
+static inline bool key_merging_disabled(struct cache_set *c)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+ return c->key_merging_disabled;
+#else
+ return 0;
+#endif
+}
+
+struct bbio {
+ unsigned submit_time_us;
+ union {
+ struct bkey key;
+ uint64_t _pad[3];
+ };
+ struct bio bio;
+};
+
+static inline unsigned local_clock_us(void)
+{
+ return local_clock() >> 10;
+}
+
+#define MAX_BSETS 4
+
+#define btree_prio USHRT_MAX
+#define initial_prio 32768
+
+#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
+#define btree_blocks(b) \
+ ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits))
+
+#define btree_default_blocks(c) \
+ ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
+
+#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS)
+#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
+#define block_bytes(c) ((c)->sb.block_size << 9)
+
+#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
+#define set_bytes(i) __set_bytes(i, i->keys)
+
+#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
+#define set_blocks(i, c) __set_blocks(i, (i)->keys, c)
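+
+/*
+ * Example: sizeof(struct bset) is 32 bytes, so with 4k blocks (block_size
+ * of 8 sectors) a set holding 500 u64s worth of keys takes
+ * __set_blocks(i, 500, c) == DIV_ROUND_UP(32 + 500 * 8, 4096) == 1 block.
+ */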
+
+#define node(i, j) ((struct bkey *) ((i)->d + (j)))
+#define end(i) node(i, (i)->keys)
+
+#define index(i, b) \
+ ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \
+ block_bytes(b->c)))
+
+#define btree_data_space(b) (PAGE_SIZE << (b)->page_order)
+
+#define prios_per_bucket(c) \
+ ((bucket_bytes(c) - sizeof(struct prio_set)) / \
+ sizeof(struct bucket_disk))
+#define prio_buckets(c) \
+ DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
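+
+/*
+ * Worked example: with 1024 sector (512k) buckets, prios_per_bucket() is
+ * about (512k - sizeof(struct prio_set)) / 3 bytes ~= 174000, so even a
+ * large cache only needs a handful of prio buckets.
+ */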
+
+#define JSET_MAGIC 0x245235c1a3625032
+#define PSET_MAGIC 0x6750e15f87337f91
+#define BSET_MAGIC 0x90135c78b99e07f5
+
+#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC)
+#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC)
+#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC)
+
+/* Bkey fields: all units are in sectors */
+
+#define KEY_FIELD(name, field, offset, size) \
+ BITMASK(name, struct bkey, field, offset, size)
+
+#define PTR_FIELD(name, offset, size) \
+ static inline uint64_t name(const struct bkey *k, unsigned i) \
+ { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \
+ \
+ static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
+ { \
+ k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \
+ k->ptr[i] |= v << offset; \
+ }
+
+KEY_FIELD(KEY_PTRS, header, 60, 3)
+KEY_FIELD(HEADER_SIZE, header, 58, 2)
+KEY_FIELD(KEY_CSUM, header, 56, 2)
+KEY_FIELD(KEY_PINNED, header, 55, 1)
+KEY_FIELD(KEY_DIRTY, header, 36, 1)
+
+KEY_FIELD(KEY_SIZE, header, 20, 16)
+KEY_FIELD(KEY_DEV, header, 0, 20)
+
+KEY_FIELD(KEY_SECTOR, key, 16, 47)
+KEY_FIELD(KEY_SNAPSHOT, key, 0, 16)
+
+PTR_FIELD(PTR_DEV, 51, 12)
+PTR_FIELD(PTR_OFFSET, 8, 43)
+PTR_FIELD(PTR_GEN, 0, 8)
+
+#define PTR_CHECK_DEV ((1 << 12) - 1)
+
+#define PTR(gen, offset, dev) \
+ ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen)
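+
+/*
+ * Layout PTR() produces (and PTR_DEV()/PTR_OFFSET()/PTR_GEN() decode):
+ *
+ * bits 51..62 device index
+ * bits 8..50 bucket offset, in sectors
+ * bits 0..7 generation
+ *
+ * e.g. PTR(3, 1024, 0) points at sector 1024 of cache device 0, gen 3.
+ */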
+
+#define sector_to_bucket(c, s) ((long) ((s) >> (c)->bucket_bits))
+#define bucket_to_sector(c, b) (((sector_t) (b)) << (c)->bucket_bits)
+#define bucket_remainder(c, b) ((b) & ((c)->sb.bucket_size - 1))
+
+#define PTR_CACHE(c, k, n) ((c)->cache[PTR_DEV(k, n)])
+#define PTR_BUCKET_NR(c, k, n) sector_to_bucket(c, PTR_OFFSET(k, n))
+
+#define PTR_BUCKET(c, k, n) \
+ (PTR_CACHE(c, k, n)->buckets + PTR_BUCKET_NR(c, k, n))
+
+/* Btree key macros */
+
+#define KEY_HEADER(len, dev) \
+ (((uint64_t) 1 << 63) | ((uint64_t) (len) << 20) | (dev))
+
+#define KEY(dev, sector, len) (struct bkey) \
+ { .header = KEY_HEADER(len, dev), .key = (sector) }
+
+#define KEY_START(k) ((k)->key - KEY_SIZE(k))
+#define START_KEY(k) KEY(KEY_DEV(k), KEY_START(k), 0)
+#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0)
+#define ZERO_KEY KEY(0, 0, 0)
+
+#define csum_set(i) \
+ crc64(((void *) (i)) + 8, ((void *) end(i)) - (((void *) (i)) + 8))
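+
+/*
+ * csum_set() checksums everything in the set after the 64 bit csum field
+ * itself, up to the end of the last key.
+ */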
+
+/* Error handling macros */
+
+#define btree_bug(b, ...) \
+ ({ if (cache_set_error((b)->c, __VA_ARGS__)) dump_stack(); })
+
+#define cache_bug(c, ...) \
+ ({ if (cache_set_error(c, __VA_ARGS__)) dump_stack(); })
+
+#define btree_bug_on(cond, b, ...) \
+ ({ if (cond) btree_bug(b, __VA_ARGS__); })
+
+#define cache_bug_on(cond, c, ...) \
+ ({ if (cond) cache_bug(c, __VA_ARGS__); })
+
+#define cache_set_err_on(cond, c, ...) \
+ ({ if (cond) cache_set_error(c, __VA_ARGS__); })
+
+/* Looping macros */
+
+#define for_each_cache(ca, cs) \
+ for (int _i = 0; ca = cs->cache[_i], _i < (cs)->sb.nr_in_set; _i++)
+
+#define for_each_bucket(b, ca) \
+ for (b = (ca)->buckets + (ca)->sb.first_bucket; \
+ b < (ca)->buckets + (ca)->sb.nbuckets; b++)
+
+static inline void __bkey_put(struct cache_set *c, struct bkey *k)
+{
+ for (unsigned i = 0; i < KEY_PTRS(k); i++)
+ atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
+}
+
+/* Blktrace macros */
+
+#define blktrace_msg(c, fmt, ...) \
+do { \
+ struct request_queue *q = bdev_get_queue(c->bdev); \
+ if (q) \
+ blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define blktrace_msg_all(s, fmt, ...) \
+do { \
+ struct cache *_c; \
+ for_each_cache(_c, (s)) \
+ blktrace_msg(_c, fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define err_printk(...) printk(KERN_ERR "bcache: " __VA_ARGS__)
+
+static inline void cached_dev_put(struct cached_dev *d)
+{
+ if (atomic_dec_and_test(&d->count))
+ schedule_work(&d->detach);
+}
+
+static inline bool cached_dev_get(struct cached_dev *d)
+{
+ if (!atomic_inc_not_zero(&d->count))
+ return false;
+
+ smp_mb__after_atomic_inc();
+ return true;
+}
+
+#define bucket_gc_gen(b) ((uint8_t) ((b)->gen - (b)->last_gc))
+#define bucket_disk_gen(b) ((uint8_t) ((b)->gen - (b)->disk_gen))
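+
+/*
+ * bucket_gc_gen() is how far the bucket's gen has advanced since the last
+ * gc; bucket_disk_gen() is how far it has advanced since the gens were
+ * last written out (see need_gc and need_save_prio above). When either
+ * gets close to wrapping the 8 bit gen, we force a gc or a prio_write().
+ */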
+
+#define kobj_attribute_write(n, fn) \
+ static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
+
+#define kobj_attribute_rw(n, show, store) \
+ static struct kobj_attribute ksysfs_##n = \
+ __ATTR(n, S_IWUSR|S_IRUSR, show, store)
+
+#define bio_split_get(bio, len, c) \
+ __bio_split_get(bio, len, (c)->bio_split)
+
+/* Forward declarations */
+
+bool bcache_in_writeback(struct cached_dev *, sector_t, unsigned);
+void bcache_writeback_queue(struct cached_dev *);
+void bcache_writeback_add(struct cached_dev *, unsigned);
+
+void count_io_errors(struct cache *, int, const char *);
+void bcache_endio(struct cache_set *, struct bio *, int, const char *);
+void bbio_free(struct bio *, struct cache_set *);
+struct bio *bbio_alloc(struct cache_set *);
+struct bio *bbio_kmalloc(gfp_t, int);
+struct bio *__bio_split_get(struct bio *, int, struct bio_set *);
+
+void __submit_bbio(struct bio *, struct cache_set *);
+void submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
+int submit_bbio_split(struct bio *, struct cache_set *,
+ struct bkey *, unsigned);
+
+void cache_read_endio(struct bio *, int);
+
+struct bcache_cgroup;
+struct cgroup;
+struct bcache_cgroup *cgroup_to_bcache(struct cgroup *cgroup);
+struct bcache_cgroup *bio_to_cgroup(struct bio *bio);
+
+uint8_t inc_gen(struct cache *, struct bucket *);
+void rescale_priorities(struct cache_set *, int);
+bool bucket_add_unused(struct cache *, struct bucket *);
+bool can_save_prios(struct cache *);
+void free_some_buckets(struct cache *);
+void unpop_bucket(struct cache_set *, struct bkey *);
+int __pop_bucket_set(struct cache_set *, uint16_t,
+ struct bkey *, int, struct closure *);
+int pop_bucket_set(struct cache_set *, uint16_t,
+ struct bkey *, int, struct closure *);
+
+bool cache_set_error(struct cache_set *, const char *, ...);
+
+void prio_write(struct cache *);
+void write_bdev_super(struct cached_dev *, struct closure *);
+
+extern struct workqueue_struct *bcache_wq;
+extern const char * const bcache_cache_modes[];
+
+struct cache_set *cache_set_alloc(struct cache_sb *);
+void free_discards(struct cache *);
+int alloc_discards(struct cache *);
+void bcache_btree_cache_free(struct cache_set *);
+int bcache_btree_cache_alloc(struct cache_set *);
+void bcache_writeback_init_cached_dev(struct cached_dev *);
+
+void bcache_debug_exit(void);
+int bcache_debug_init(struct kobject *);
+void bcache_writeback_exit(void);
+int bcache_writeback_init(void);
+void bcache_request_exit(void);
+int bcache_request_init(void);
+void bcache_btree_exit(void);
+int bcache_btree_init(void);
diff --git a/drivers/block/bcache/btree.c b/drivers/block/bcache/btree.c
new file mode 100644
index 0000000..7e9975f
--- /dev/null
+++ b/drivers/block/bcache/btree.c
@@ -0,0 +1,2249 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/hash.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <trace/events/bcache.h>
+
+/*
+ * Todo:
+ * register_bcache: Return errors out to userspace correctly
+ *
+ * Writeback: don't undirty key until after a cache flush
+ *
+ * Create an iterator for key pointers
+ *
+ * On btree write error, mark bucket such that it won't be freed from the cache
+ *
+ * Journalling:
+ * Check for bad keys in replay
+ * Propagate barriers
+ * Refcount journal entries in journal_replay
+ *
+ * Garbage collection:
+ * Finish incremental gc
+ * Gc should free old UUIDs, data for invalid UUIDs
+ *
+ * Provide a way to list backing device UUIDs we have data cached for, and
+ * probably how long it's been since we've seen them, and a way to invalidate
+ * dirty data for devices that will never be attached again
+ *
+ * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
+ * that based on that and how much dirty data we have we can keep writeback
+ * from being starved
+ *
+ * Add a tracepoint or somesuch to watch for writeback starvation
+ *
+ * When btree depth > 1 and splitting an interior node, we have to make sure
+ * alloc_bucket() cannot fail. This should be true but is not completely
+ * obvious.
+ *
+ * Make sure all allocations get charged to the root cgroup
+ *
+ * Plugging?
+ *
+ * If data write is less than hard sector size of ssd, round up offset in open
+ * bucket to the next whole sector
+ *
+ * Also lookup by cgroup in get_open_bucket()
+ *
+ * Superblock needs to be fleshed out for multiple cache devices
+ *
+ * Add a sysfs tunable for the number of writeback IOs in flight
+ *
+ * Add a sysfs tunable for the number of open data buckets
+ *
+ * IO tracking: Can we track when one process is doing io on behalf of another?
+ * IO tracking: Don't use just an average, weigh more recent stuff higher
+ *
+ * Test module load/unload
+ */
+
+static const char * const op_types[] = {
+ "insert", "replace"
+};
+
+static const char *op_type(struct btree_op *op)
+{
+ return op_types[op->type];
+}
+
+#define MAX_NEED_GC 64
+#define MAX_SAVE_PRIO 72
+
+#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
+
+#define PTR_HASH(c, k) \
+ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
+
+static struct workqueue_struct *btree_wq;
+
+void btree_op_init_stack(struct btree_op *op)
+{
+ memset(op, 0, sizeof(struct btree_op));
+ closure_init_stack(&op->cl);
+ op->lock = -1;
+ keylist_init(&op->keys);
+}
+
+/* Btree key manipulation */
+
+static void bkey_put(struct cache_set *c, struct bkey *k, int level)
+{
+ if ((level && k->key) || !level)
+ __bkey_put(c, k);
+}
+
+/* Btree IO */
+
+static uint64_t btree_csum_set(struct btree *b, struct bset *i)
+{
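+ /*
+ * The checksum is seeded with the node's first pointer - this is what
+ * "Version 1: Seed pointer into btree node checksum" means in bcache.h
+ * - so stale sets left in a reused bucket won't checksum correctly.
+ */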
+ uint64_t crc = b->key.ptr[0];
+ void *data = (void *) i + 8, *end = end(i);
+
+ crc = crc64_update(crc, data, end - data);
+ return crc ^ 0xffffffffffffffff;
+}
+
+static void btree_bio_endio(struct bio *bio, int error)
+{
+ struct btree *b = container_of(bio->bi_private, struct btree, io.cl);
+
+ if (error)
+ set_btree_node_io_error(b);
+
+ bcache_endio(b->c, bio, error, (bio->bi_rw & WRITE)
+ ? "writing btree" : "reading btree");
+}
+
+static void btree_bio_init(struct btree *b)
+{
+ BUG_ON(b->bio);
+ b->bio = bbio_alloc(b->c);
+
+ bio_get(b->bio);
+ b->bio->bi_end_io = btree_bio_endio;
+ b->bio->bi_private = &b->io.cl;
+}
+
+void btree_read_done(struct closure *cl)
+{
+ struct btree *b = container_of(cl, struct btree, io.cl);
+ struct bset *i = b->sets[0].data;
+ struct btree_iter *iter = b->c->fill_iter;
+ const char *err = "bad btree header";
+ BUG_ON(b->nsets || b->written);
+
+ bbio_free(b->bio, b->c);
+ b->bio = NULL;
+
+ mutex_lock(&b->c->fill_lock);
+ iter->used = 0;
+
+ if (btree_node_io_error(b) ||
+ !i->seq)
+ goto err;
+
+ for (;
+ b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
+ i = write_block(b)) {
+ err = "unsupported bset version";
+ if (i->version > BCACHE_BSET_VERSION)
+ goto err;
+
+ err = "bad btree header";
+ if (b->written + set_blocks(i, b->c) > btree_blocks(b))
+ goto err;
+
+ err = "bad magic";
+ if (i->magic != bset_magic(b->c))
+ goto err;
+
+ err = "bad checksum";
+ switch (i->version) {
+ case 0:
+ if (i->csum != csum_set(i))
+ goto err;
+ break;
+ case BCACHE_BSET_VERSION:
+ if (i->csum != btree_csum_set(b, i))
+ goto err;
+ break;
+ }
+
+ err = "empty set";
+ if (i != b->sets[0].data && !i->keys)
+ goto err;
+
+ btree_iter_push(iter, i->start, end(i));
+
+ b->written += set_blocks(i, b->c);
+ }
+
+ err = "corrupted btree";
+ for (i = write_block(b);
+ index(i, b) < btree_blocks(b);
+ i = ((void *) i) + block_bytes(b->c))
+ if (i->seq == b->sets[0].data->seq)
+ goto err;
+
+ btree_sort_and_fix_extents(b, iter);
+
+ i = b->sets[0].data;
+ err = "short btree key";
+ if (b->sets[0].size &&
+ bkey_cmp(&b->key, &b->sets[0].end) < 0)
+ goto err;
+
+ if (b->written < btree_blocks(b))
+ bset_init_next(b);
+
+ if (0) {
+err: set_btree_node_io_error(b);
+ cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys",
+ err, PTR_BUCKET_NR(b->c, &b->key, 0),
+ index(i, b), i->keys);
+ }
+
+ mutex_unlock(&b->c->fill_lock);
+
+ spin_lock(&b->c->btree_read_time_lock);
+ time_stats_update(&b->c->btree_read_time, b->io_start_time);
+ spin_unlock(&b->c->btree_read_time_lock);
+
+ smp_wmb(); /* read_done is our write lock */
+ set_btree_node_read_done(b);
+
+ closure_return(cl);
+}
+
+static void btree_read_resubmit(struct closure *cl)
+{
+ struct btree *b = container_of(cl, struct btree, io.cl);
+
+ submit_bbio_split(b->bio, b->c, &b->key, 0);
+ continue_at(&b->io.cl, btree_read_done, system_wq);
+}
+
+void btree_read(struct btree *b)
+{
+ BUG_ON(b->nsets || b->written);
+
+ if (!closure_trylock(&b->io.cl, &b->c->cl))
+ BUG();
+
+ b->io_start_time = local_clock();
+
+ btree_bio_init(b);
+ b->bio->bi_rw = REQ_META|READ_SYNC;
+ b->bio->bi_size = KEY_SIZE(&b->key) << 9;
+
+ bio_map(b->bio, b->sets[0].data);
+
+ pr_debug("%s", pbtree(b));
+ trace_bcache_btree_read(b->bio);
+
+ if (submit_bbio_split(b->bio, b->c, &b->key, 0))
+ continue_at(&b->io.cl, btree_read_resubmit, system_wq);
+
+ continue_at(&b->io.cl, btree_read_done, system_wq);
+}
+
+static void btree_complete_write(struct btree *b, struct btree_write *w)
+{
+ if (w->prio_blocked &&
+ !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
+ closure_wake_up(&b->c->bucket_wait);
+
+ if (w->journal) {
+ atomic_dec_bug(w->journal);
+ __closure_wake_up(&b->c->journal.wait);
+ }
+
+ if (w->owner)
+ closure_put(w->owner);
+
+ w->prio_blocked = 0;
+ w->journal = NULL;
+ w->owner = NULL;
+}
+
+static void __btree_write_done(struct closure *cl)
+{
+ struct btree *b = container_of(cl, struct btree, io.cl);
+ struct btree_write *w = btree_prev_write(b);
+
+ bbio_free(b->bio, b->c);
+ b->bio = NULL;
+ btree_complete_write(b, w);
+
+ if (btree_node_dirty(b))
+ queue_delayed_work(btree_wq, &b->work,
+ msecs_to_jiffies(30000));
+
+ closure_return(cl);
+}
+
+static void btree_write_done(struct closure *cl)
+{
+ struct btree *b = container_of(cl, struct btree, io.cl);
+ struct bio_vec *bv;
+ int n;
+
+ __bio_for_each_segment(bv, b->bio, n, 0)
+ __free_page(bv->bv_page);
+
+ __btree_write_done(cl);
+}
+
+static void do_btree_write(struct btree *b)
+{
+ struct closure *cl = &b->io.cl;
+ struct bset *i = b->sets[b->nsets].data;
+ BKEY_PADDED(key) k;
+
+ i->version = BCACHE_BSET_VERSION;
+ i->csum = btree_csum_set(b, i);
+
+ btree_bio_init(b);
+ b->bio->bi_rw = REQ_META|WRITE_SYNC;
+ b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
+ bio_map(b->bio, i);
+
+ bkey_copy(&k.key, &b->key);
+ SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
+
+ if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
+ int j;
+ struct bio_vec *bv;
+ void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+
+ bio_for_each_segment(bv, b->bio, j)
+ memcpy(page_address(bv->bv_page),
+ base + j * PAGE_SIZE, PAGE_SIZE);
+
+ trace_bcache_btree_write(b->bio);
+ submit_bbio_split(b->bio, b->c, &k.key, 0);
+
+ continue_at(cl, btree_write_done, NULL);
+ } else {
+ bio_map(b->bio, i);
+
+ trace_bcache_btree_write(b->bio);
+ submit_bbio_split(b->bio, b->c, &k.key, 0);
+
+ closure_sync(cl);
+ __btree_write_done(cl);
+ }
+}
+
+static void __btree_write(struct btree *b)
+{
+ struct bset *i = b->sets[b->nsets].data;
+
+ BUG_ON(current->bio_list);
+
+ closure_lock(&b->io, &b->c->cl);
+ __cancel_delayed_work(&b->work);
+
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+ change_bit(BTREE_NODE_write_idx, &b->flags);
+
+ check_key_order(b, i);
+ BUG_ON(b->written && !i->keys);
+
+ do_btree_write(b);
+
+ pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
+
+ b->written += set_blocks(i, b->c);
+ atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
+ &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
+
+ btree_sort_lazy(b);
+
+ if (b->written < btree_blocks(b))
+ bset_init_next(b);
+}
+
+static void btree_write_work(struct work_struct *w)
+{
+ struct btree *b = container_of(to_delayed_work(w), struct btree, work);
+
+ down_write(&b->lock);
+
+ if (btree_node_dirty(b))
+ __btree_write(b);
+ up_write(&b->lock);
+}
+
+void btree_write(struct btree *b, bool now, struct btree_op *op)
+{
+ struct bset *i = b->sets[b->nsets].data;
+ struct btree_write *w = btree_current_write(b);
+
+ BUG_ON(b->written &&
+ (b->written >= btree_blocks(b) ||
+ i->seq != b->sets[0].data->seq ||
+ !i->keys));
+
+ if (!btree_node_dirty(b)) {
+ set_btree_node_dirty(b);
+ queue_delayed_work(btree_wq, &b->work,
+ msecs_to_jiffies(30000));
+ }
+
+ w->prio_blocked += b->prio_blocked;
+ b->prio_blocked = 0;
+
+ if (op && op->journal && !b->level) {
+ if (w->journal &&
+ journal_pin_cmp(b->c, w, op)) {
+ atomic_dec_bug(w->journal);
+ w->journal = NULL;
+ }
+
+ if (!w->journal) {
+ w->journal = op->journal;
+ atomic_inc(w->journal);
+ }
+ }
+
+ if (current->bio_list)
+ return;
+
+ /* Force write if set is too big */
+ if (now ||
+ b->level ||
+ set_bytes(i) > PAGE_SIZE - 48) {
+ if (op && now) {
+ /* Must wait on multiple writes */
+ BUG_ON(w->owner);
+ w->owner = &op->cl;
+ closure_get(&op->cl);
+ }
+
+ __btree_write(b);
+ }
+ BUG_ON(!b->written);
+}
+
+/*
+ * Btree in memory cache - allocation/freeing
+ * mca -> memory cache
+ */
+
+#define mca_reserve(c) ((c->root ? c->root->level : 1) * 8 + 16)
+#define mca_can_free(c) \
+ max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
+
+static void mca_data_free(struct btree *b)
+{
+ struct bset_tree *t = b->sets;
+ BUG_ON(!closure_is_unlocked(&b->io.cl));
+
+ if (bset_prev_bytes(b) < PAGE_SIZE)
+ kfree(t->prev);
+ else
+ free_pages((unsigned long) t->prev,
+ get_order(bset_prev_bytes(b)));
+
+ if (bset_tree_bytes(b) < PAGE_SIZE)
+ kfree(t->tree);
+ else
+ free_pages((unsigned long) t->tree,
+ get_order(bset_tree_bytes(b)));
+
+ free_pages((unsigned long) t->data, b->page_order);
+
+ t->prev = NULL;
+ t->tree = NULL;
+ t->data = NULL;
+ list_move(&b->list, &b->c->btree_cache_freed);
+ b->c->bucket_cache_used--;
+}
+
+static void mca_bucket_free(struct btree *b)
+{
+ BUG_ON(btree_node_dirty(b));
+
+ b->key.ptr[0] = 0;
+ hlist_del_init_rcu(&b->hash);
+ list_move(&b->list, &b->c->btree_cache_freeable);
+}
+
+static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
+{
+ struct bset_tree *t = b->sets;
+ BUG_ON(t->data);
+
+ b->page_order = ilog2(max_t(unsigned, b->c->btree_pages,
+ KEY_SIZE(k) / PAGE_SECTORS ?: 1));
+
+ t->data = (void *) __get_free_pages(gfp, b->page_order);
+ if (!t->data)
+ goto err;
+
+ t->tree = bset_tree_bytes(b) < PAGE_SIZE
+ ? kmalloc(bset_tree_bytes(b), gfp)
+ : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
+ if (!t->tree)
+ goto err;
+
+ t->prev = bset_prev_bytes(b) < PAGE_SIZE
+ ? kmalloc(bset_prev_bytes(b), gfp)
+ : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
+ if (!t->prev)
+ goto err;
+
+ list_move(&b->list, &b->c->btree_cache);
+ b->c->bucket_cache_used++;
+ return;
+err:
+ mca_data_free(b);
+}
+
+static struct btree *mca_bucket_alloc(struct cache_set *c,
+ struct bkey *k, gfp_t gfp)
+{
+ struct btree *b = kzalloc(sizeof(struct btree), gfp);
+ if (!b)
+ return NULL;
+
+ init_rwsem(&b->lock);
+ INIT_LIST_HEAD(&b->list);
+ INIT_DELAYED_WORK(&b->work, btree_write_work);
+ b->c = c;
+ closure_init_unlocked(&b->io);
+
+ mca_data_alloc(b, k, gfp);
+ return b->sets[0].data ? b : NULL;
+}
+
+static int mca_reap(struct btree *b, struct closure *cl)
+{
+ lockdep_assert_held(&b->c->bucket_lock);
+
+ if (!down_write_trylock(&b->lock))
+ return -1;
+
+ BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
+
+ if (cl && btree_node_dirty(b))
+ btree_write(b, true, NULL);
+
+ if (cl)
+ closure_wait_event_async(&b->io.wait, cl,
+ atomic_read(&b->io.cl.remaining) == -1);
+
+ if (btree_node_dirty(b) ||
+ atomic_read(&b->io.cl.remaining) != -1 ||
+ work_pending(&b->work.work)) {
+ rw_unlock(true, b);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static int bcache_shrink_buckets(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+ struct btree *b, *t;
+ unsigned i;
+ int nr, orig_nr = sc->nr_to_scan;
+
+ if (c->shrinker_disabled)
+ return 0;
+
+ /*
+ * If nr == 0, we're supposed to return the number of items we have
+ * cached. Not allowed to return -1.
+ */
+ if (!orig_nr)
+ goto out;
+
+ /* Return -1 if we can't do anything right now */
+ if (!mutex_trylock(&c->bucket_lock))
+ return -1;
+
+ if (c->try_harder) {
+ mutex_unlock(&c->bucket_lock);
+ return -1;
+ }
+
+ if (list_empty(&c->btree_cache)) {
+ /*
+ * Can happen right when we first start up, before we've read in
+ * any btree nodes
+ */
+ mutex_unlock(&c->bucket_lock);
+ return 0;
+ }
+
+ orig_nr /= c->btree_pages;
+ nr = orig_nr = min_t(int, orig_nr, mca_can_free(c));
+
+ i = 0;
+ list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+ if (!nr)
+ break;
+
+ if (++i > 3 &&
+ !mca_reap(b, NULL)) {
+ mca_data_free(b);
+ rw_unlock(true, b);
+ --nr;
+ }
+ }
+
+ for (i = c->bucket_cache_used;
+ i && nr;
+ --i) {
+ b = list_first_entry(&c->btree_cache, struct btree, list);
+ list_rotate_left(&c->btree_cache);
+
+ if (!b->accessed &&
+ !mca_reap(b, NULL)) {
+ mca_bucket_free(b);
+ mca_data_free(b);
+ rw_unlock(true, b);
+ --nr;
+ } else
+ b->accessed = 0;
+ }
+
+ mutex_unlock(&c->bucket_lock);
+out:
+ return mca_can_free(c) * c->btree_pages;
+}
+
+void bcache_btree_cache_free(struct cache_set *c)
+{
+ struct btree *b;
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ if (c->shrink.list.next)
+ unregister_shrinker(&c->shrink);
+
+ mutex_lock(&c->bucket_lock);
+
+#ifdef CONFIG_BCACHE_DEBUG
+ if (c->verify_data)
+ list_move(&c->verify_data->list, &c->btree_cache);
+#endif
+
+ list_splice(&c->btree_cache_freeable,
+ &c->btree_cache);
+
+ while (!list_empty(&c->btree_cache)) {
+ b = list_first_entry(&c->btree_cache, struct btree, list);
+
+ if (btree_node_dirty(b))
+ btree_complete_write(b, btree_current_write(b));
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+
+ mca_data_free(b);
+ }
+
+ while (!list_empty(&c->btree_cache_freed)) {
+ b = list_first_entry(&c->btree_cache_freed,
+ struct btree, list);
+ list_del(&b->list);
+ cancel_delayed_work_sync(&b->work);
+ kfree(b);
+ }
+
+ mutex_unlock(&c->bucket_lock);
+}
+
+int bcache_btree_cache_alloc(struct cache_set *c)
+{
+ /* XXX: doesn't check for errors */
+
+ closure_init_unlocked(&c->gc);
+
+ for (int i = 0; i < mca_reserve(c); i++)
+ mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
+
+ list_splice_init(&c->btree_cache,
+ &c->btree_cache_freeable);
+
+#ifdef CONFIG_BCACHE_DEBUG
+ mutex_init(&c->verify_lock);
+
+ c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
+
+ if (c->verify_data &&
+ c->verify_data->sets[0].data)
+ list_del_init(&c->verify_data->list);
+ else
+ c->verify_data = NULL;
+#endif
+
+ c->shrink.shrink = bcache_shrink_buckets;
+ c->shrink.seeks = 3;
+ register_shrinker(&c->shrink);
+
+ return 0;
+}
+
+/* Btree in memory cache - hash table */
+
+static struct hlist_head *hash_bucket(struct cache_set *c, struct bkey *k)
+{
+ return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
+}
+
+static struct btree *find_bucket(struct cache_set *c, struct bkey *k)
+{
+ struct hlist_node *cursor;
+ struct btree *b;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(b, cursor, hash_bucket(c, k), hash)
+ if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
+ goto out;
+ b = NULL;
+out:
+ rcu_read_unlock();
+ return b;
+}
+
+static struct btree *alloc_bucket(struct cache_set *c, struct bkey *k,
+ int level, struct closure *cl)
+{
+ struct btree *b, *i;
+ unsigned page_order = ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
+
+ lockdep_assert_held(&c->bucket_lock);
+retry:
+ if (find_bucket(c, k))
+ return NULL;
+
+ /* btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there's any freed nodes there:
+ */
+ list_for_each_entry(b, &c->btree_cache_freeable, list)
+ if (page_order <= b->page_order &&
+ !b->key.ptr[0] &&
+ !mca_reap(b, NULL))
+ goto out;
+
+ /* We never free struct btree itself, just the memory that holds the on
+ * disk node. Check the freed list before allocating a new one:
+ */
+ list_for_each_entry(b, &c->btree_cache_freed, list)
+ if (!mca_reap(b, NULL)) {
+ mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
+ if (!b->sets[0].data) {
+ rw_unlock(true, b);
+ goto err;
+ } else
+ goto out;
+ }
+
+ b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
+ if (!b)
+ goto err;
+
+ BUG_ON(!down_write_trylock(&b->lock));
+out:
+ BUG_ON(!closure_is_unlocked(&b->io.cl));
+
+ bkey_copy(&b->key, k);
+ list_move(&b->list, &c->btree_cache);
+ hlist_del_init_rcu(&b->hash);
+ hlist_add_head_rcu(&b->hash, hash_bucket(c, k));
+ lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
+
+ b->flags = 0;
+ b->level = level;
+ b->written = 0;
+ b->nsets = 0;
+ for (int i = 0; i < MAX_BSETS; i++)
+ b->sets[i].size = 0;
+ for (int i = 1; i < MAX_BSETS; i++)
+ b->sets[i].data = NULL;
+
+ return b;
+err:
+ if (current->bio_list)
+ return ERR_PTR(-EAGAIN);
+
+ if (!cl)
+ return ERR_PTR(-ENOMEM);
+
+ if (c->try_harder && c->try_harder != cl) {
+ closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* XXX: tracepoint */
+ c->try_harder = cl;
+ c->try_harder_start = local_clock();
+ b = ERR_PTR(-ENOMEM);
+
+ list_for_each_entry_reverse(i, &c->btree_cache, list)
+ if (page_order <= i->page_order) {
+ int e = mca_reap(i, cl);
+ if (e == -EAGAIN)
+ b = ERR_PTR(-EAGAIN);
+ if (!e) {
+ b = i;
+ goto out;
+ }
+ }
+
+ if (b == ERR_PTR(-EAGAIN) &&
+ closure_blocking(cl)) {
+ mutex_unlock(&c->bucket_lock);
+ closure_sync(cl);
+ mutex_lock(&c->bucket_lock);
+ goto retry;
+ }
+
+ return b;
+}
+
+struct btree *get_bucket(struct cache_set *c, struct bkey *k,
+ int level, struct btree_op *op)
+{
+ int i = 0;
+ bool write = level <= op->lock;
+ struct btree *b;
+
+ BUG_ON(level < 0);
+retry:
+ b = find_bucket(c, k);
+
+ if (!b) {
+ mutex_lock(&c->bucket_lock);
+ b = alloc_bucket(c, k, level, &op->cl);
+ mutex_unlock(&c->bucket_lock);
+
+ if (!b)
+ goto retry;
+ if (IS_ERR(b))
+ return b;
+
+ btree_read(b);
+
+ if (!write)
+ downgrade_write(&b->lock);
+ } else {
+ rw_lock(write, b, level);
+ if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
+ rw_unlock(write, b);
+ goto retry;
+ }
+ BUG_ON(b->level != level);
+ }
+
+ b->accessed = 1;
+
+ for (; i <= b->nsets && b->sets[i].size; i++) {
+ prefetch(b->sets[i].tree);
+ prefetch(b->sets[i].data);
+ }
+
+ for (; i <= b->nsets; i++)
+ prefetch(b->sets[i].data);
+
+ if (!closure_wait_event(&b->io.wait, &op->cl,
+ btree_node_read_done(b))) {
+ rw_unlock(write, b);
+ b = ERR_PTR(-EAGAIN);
+ } else if (btree_node_io_error(b)) {
+ rw_unlock(write, b);
+ b = ERR_PTR(-EIO);
+ } else
+ BUG_ON(!b->written);
+
+ return b;
+}
+
+static void prefetch_bucket(struct cache_set *c, struct bkey *k, int level)
+{
+ struct btree *b;
+
+ mutex_lock(&c->bucket_lock);
+ b = alloc_bucket(c, k, level, NULL);
+ mutex_unlock(&c->bucket_lock);
+
+ if (!IS_ERR_OR_NULL(b)) {
+ btree_read(b);
+ rw_unlock(true, b);
+ }
+}
+
+/* Btree alloc */
+
+static void btree_free(struct btree *b, struct btree_op *op)
+{
+ /* The BUG_ON() in get_bucket() implies that we must have a write lock
+ * on the parent to free or even invalidate a node
+ */
+ BUG_ON(op->lock <= b->level);
+ BUG_ON(b == b->c->root);
+ pr_debug("bucket %s", pbtree(b));
+
+ if (btree_node_dirty(b))
+ btree_complete_write(b, btree_current_write(b));
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+
+ if (b->prio_blocked &&
+ !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
+ closure_wake_up(&b->c->bucket_wait);
+
+ b->prio_blocked = 0;
+
+ __cancel_delayed_work(&b->work);
+
+ mutex_lock(&b->c->bucket_lock);
+
+ for (unsigned i = 0; i < KEY_PTRS(&b->key); i++) {
+ BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
+
+ inc_gen(PTR_CACHE(b->c, &b->key, i),
+ PTR_BUCKET(b->c, &b->key, i));
+ }
+
+ unpop_bucket(b->c, &b->key);
+ mca_bucket_free(b);
+ mutex_unlock(&b->c->bucket_lock);
+}
+
+struct btree *bcache_btree_alloc(struct cache_set *c, int level,
+ struct closure *cl)
+{
+ BKEY_PADDED(key) k;
+ struct btree *b = ERR_PTR(-EAGAIN);
+
+ mutex_lock(&c->bucket_lock);
+retry:
+ if (__pop_bucket_set(c, btree_prio, &k.key, 1, cl))
+ goto err;
+
+ SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
+
+ b = alloc_bucket(c, &k.key, level, cl);
+ if (IS_ERR(b))
+ goto err_free;
+
+ if (!b) {
+ cache_bug(c, "Tried to allocate bucket"
+ " that was in btree cache");
+ __bkey_put(c, &k.key);
+ goto retry;
+ }
+
+ set_btree_node_read_done(b);
+ b->accessed = 1;
+ bset_init_next(b);
+
+ mutex_unlock(&c->bucket_lock);
+ return b;
+err_free:
+ unpop_bucket(c, &k.key);
+ __bkey_put(c, &k.key);
+err:
+ mutex_unlock(&c->bucket_lock);
+ return b;
+}
+
+static struct btree *btree_alloc_replacement(struct btree *b,
+ struct closure *cl)
+{
+ struct btree *n = bcache_btree_alloc(b->c, b->level, cl);
+ if (!IS_ERR_OR_NULL(n))
+ btree_sort_into(b, n);
+
+ return n;
+}
+
+/* Garbage collection */
+
+void __btree_mark_key(struct cache_set *c, int level, struct bkey *k)
+{
+ struct bucket *g;
+
+ if (!k->key || !KEY_SIZE(k))
+ return;
+
+ for (unsigned i = 0; i < KEY_PTRS(k); i++) {
+ if (!ptr_available(c, k, i))
+ continue;
+
+ g = PTR_BUCKET(c, k, i);
+
+ if (gen_after(g->gc_gen, PTR_GEN(k, i)))
+ g->gc_gen = PTR_GEN(k, i);
+
+ if (ptr_stale(c, k, i))
+ continue;
+
+ cache_bug_on(level
+ ? g->mark && g->mark != GC_MARK_BTREE
+ : g->mark < GC_MARK_DIRTY, c,
+ "inconsistent pointers: mark = %i, "
+ "level = %i", g->mark, level);
+
+ if (level)
+ g->mark = GC_MARK_BTREE;
+ else if (KEY_DIRTY(k))
+ g->mark = GC_MARK_DIRTY;
+ else if (g->mark >= 0 &&
+ ((int) g->mark) + KEY_SIZE(k) < SHRT_MAX)
+ g->mark += KEY_SIZE(k);
+ }
+}
+
+#define btree_mark_key(b, k) __btree_mark_key(b->c, b->level, k)
+
+static int btree_gc_mark(struct btree *b, unsigned *keys, struct gc_stat *gc)
+{
+ uint8_t stale = 0;
+ unsigned last_dev = -1;
+ struct bcache_device *d = NULL;
+
+ struct btree_iter iter;
+ btree_iter_init(b, &iter, NULL);
+
+ gc->nodes++;
+
+ while (1) {
+ struct bkey *k = btree_iter_next(&iter);
+ if (!k)
+ break;
+
+ if (last_dev != KEY_DEV(k)) {
+ last_dev = KEY_DEV(k);
+
+ d = b->c->devices[last_dev];
+ }
+
+ if (ptr_invalid(b, k))
+ continue;
+
+ for (unsigned i = 0; i < KEY_PTRS(k); i++) {
+ if (!ptr_available(b->c, k, i))
+ continue;
+
+ stale = max(stale, ptr_stale(b->c, k, i));
+
+ btree_bug_on(gen_after(PTR_BUCKET(b->c, k, i)->last_gc,
+ PTR_GEN(k, i)),
+ b, "found old gen %u > %u in gc: %s",
+ PTR_BUCKET(b->c, k, i)->last_gc,
+ PTR_GEN(k, i), pkey(k));
+ }
+
+ btree_mark_key(b, k);
+
+ if (ptr_bad(b, k))
+ continue;
+
+ *keys += bkey_u64s(k);
+
+ gc->key_bytes += bkey_u64s(k);
+ gc->nkeys++;
+
+ gc->data += KEY_SIZE(k);
+ if (KEY_DIRTY(k)) {
+ gc->dirty += KEY_SIZE(k);
+ if (d)
+ d->sectors_dirty_gc += KEY_SIZE(k);
+ }
+ }
+
+ for (struct bset_tree *t = b->sets; t <= &b->sets[b->nsets]; t++)
+ btree_bug_on(t->size &&
+ bset_written(b, t) &&
+ bkey_cmp(&b->key, &t->end) < 0,
+ b, "found short btree key in gc");
+
+ return stale;
+}
+
+static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
+ struct btree_op *op)
+{
+ /*
+ * We block priorities from being written for the duration of garbage
+ * collection, so we can't sleep in btree_alloc() -> pop_bucket(), or
+ * we'd risk deadlock - so we don't pass it our closure.
+ */
+ struct btree *n = btree_alloc_replacement(b, NULL);
+
+ if (!IS_ERR_OR_NULL(n)) {
+ swap(b, n);
+
+ memcpy(k->ptr, b->key.ptr,
+ sizeof(uint64_t) * KEY_PTRS(&b->key));
+
+ __bkey_put(b->c, &b->key);
+ atomic_inc(&b->c->prio_blocked);
+ b->prio_blocked++;
+
+ btree_free(n, op);
+ __up_write(&n->lock);
+
+ rwsem_release(&b->lock.dep_map, 1, _THIS_IP_);
+ }
+
+ return b;
+}
+
+/*
+ * Leaving this at 2 until we've got incremental garbage collection done; it
+ * could be higher (and has been tested with 4) except that garbage collection
+ * could take much longer, adversely affecting latency.
+ */
+#define GC_MERGE_NODES 2
+
+struct gc_merge_info {
+ struct btree *b;
+ struct bkey *k;
+ unsigned keys;
+};
+
+static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
+ struct gc_stat *gc, struct gc_merge_info *r)
+{
+ unsigned nodes = 0, keys = 0, blocks;
+
+ while (nodes < GC_MERGE_NODES && r[nodes].b)
+ keys += r[nodes++].keys;
+
+ blocks = btree_default_blocks(b->c) * 2 / 3;
+
+ if (nodes < 2 ||
+ __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
+ return;
+
+ for (int i = nodes - 1; i >= 0; --i) {
+ if (r[i].b->written)
+ r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
+
+ if (r[i].b->written)
+ return;
+ }
+
+ for (int i = nodes - 1; i > 0; --i) {
+ struct bset *n1 = r[i].b->sets->data;
+ struct bset *n2 = r[i - 1].b->sets->data;
+ struct bkey *last = NULL;
+
+ keys = 0;
+
+ if (i == 1) {
+ /*
+ * This is the last node we're keeping - we're getting
+ * rid of the node at r[0], so we have to try to fit all
+ * of its remaining keys into this node; we can't ensure
+ * they will always fit due to rounding and variable
+ * length keys (though that shouldn't happen in
+ * practice).
+ */
+ if (__set_blocks(n1, n1->keys + r->keys,
+ b->c) > btree_blocks(r[i].b))
+ return;
+
+ keys = n2->keys;
+ last = &r->b->key;
+ } else
+ for (struct bkey *k = n2->start;
+ k < end(n2);
+ k = next(k)) {
+ if (__set_blocks(n1, n1->keys + keys +
+ bkey_u64s(k), b->c) > blocks)
+ break;
+
+ last = k;
+ keys += bkey_u64s(k);
+ }
+
+ BUG_ON(__set_blocks(n1, n1->keys + keys,
+ b->c) > btree_blocks(r[i].b));
+
+ if (last) {
+ bkey_copy_key(&r[i].b->key, last);
+ bkey_copy_key(r[i].k, last);
+ }
+
+ memcpy(end(n1),
+ n2->start,
+ (void *) node(n2, keys) - (void *) n2->start);
+
+ n1->keys += keys;
+
+ memmove(n2->start,
+ node(n2, keys),
+ (void *) end(n2) - (void *) node(n2, keys));
+
+ n2->keys -= keys;
+
+ r[i].keys = n1->keys;
+ r[i - 1].keys = n2->keys;
+ }
+
+ btree_free(r->b, op);
+ __up_write(&r->b->lock);
+
+ pr_debug("coalesced %u nodes", nodes);
+
+ gc->nodes--;
+ nodes--;
+
+ memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
+ memset(&r[nodes], 0, sizeof(struct gc_merge_info));
+}
+
+static int btree_gc_recurse(struct btree *b, struct btree_op *op,
+ struct closure *writes, struct gc_stat *gc)
+{
+ void write(struct btree *r)
+ {
+ if (!r->written)
+ btree_write(r, true, op);
+ else if (btree_node_dirty(r)) {
+ BUG_ON(btree_current_write(r)->owner);
+ btree_current_write(r)->owner = writes;
+ closure_get(writes);
+
+ btree_write(r, true, NULL);
+ }
+
+ __up_write(&r->lock);
+ }
+
+ int ret = 0, stale;
+ struct gc_merge_info r[GC_MERGE_NODES];
+
+ memset(r, 0, sizeof(r));
+
+ while ((r->k = next_recurse_key(b, &b->c->gc_done))) {
+ r->b = get_bucket(b->c, r->k, b->level - 1, op);
+
+ if (IS_ERR(r->b)) {
+ ret = PTR_ERR(r->b);
+ break;
+ }
+
+ /*
+ * Fake out lockdep, because I'm a terrible person: it's just
+ * not possible to express our lock ordering to lockdep, because
+ * lockdep works at most in terms of a small fixed number of
+ * subclasses, and we're just iterating through all of them in a
+ * fixed order.
+ */
+ rwsem_release(&r->b->lock.dep_map, 1, _THIS_IP_);
+
+ r->keys = 0;
+ stale = btree_gc_mark(r->b, &r->keys, gc);
+
+ if (!b->written &&
+ (r->b->level || stale > 10 ||
+ b->c->gc_always_rewrite))
+ r->b = btree_gc_alloc(r->b, r->k, op);
+
+ if (r->b->level)
+ ret = btree_gc_recurse(r->b, op, writes, gc);
+
+ if (ret) {
+ write(r->b);
+ break;
+ }
+
+ bkey_copy_key(&b->c->gc_done, r->k);
+
+ if (!b->written)
+ btree_gc_coalesce(b, op, gc, r);
+
+ if (r[GC_MERGE_NODES - 1].b)
+ write(r[GC_MERGE_NODES - 1].b);
+
+ memmove(&r[1], &r[0],
+ sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
+
+ /* When we've got incremental GC working, we'll want to do
+ * if (should_resched())
+ * return -EAGAIN;
+ */
+ cond_resched();
+#if 0
+ if (need_resched()) {
+ ret = -EAGAIN;
+ break;
+ }
+#endif
+ }
+
+ for (unsigned i = 1; i < GC_MERGE_NODES && r[i].b; i++)
+ write(r[i].b);
+
+ /* Might have freed some children, must remove their keys */
+ if (!b->written)
+ btree_sort(b);
+
+ return ret;
+}
+
+static int btree_gc_root(struct btree *b, struct btree_op *op,
+ struct closure *writes, struct gc_stat *gc)
+{
+ struct btree *n = NULL;
+ unsigned keys = 0;
+ int ret = 0, stale = btree_gc_mark(b, &keys, gc);
+
+ if (b->level || stale > 10)
+ n = btree_alloc_replacement(b, NULL);
+
+ if (!IS_ERR_OR_NULL(n))
+ swap(b, n);
+
+ if (b->level)
+ ret = btree_gc_recurse(b, op, writes, gc);
+
+ if (!b->written || btree_node_dirty(b)) {
+ atomic_inc(&b->c->prio_blocked);
+ b->prio_blocked++;
+ btree_write(b, true, n ? op : NULL);
+ }
+
+ if (!IS_ERR_OR_NULL(n)) {
+ closure_sync(&op->cl);
+ bcache_btree_set_root(b);
+ btree_free(n, op);
+ rw_unlock(true, b);
+ }
+
+ return ret;
+}
+
+size_t btree_gc_finish(struct cache_set *c)
+{
+ void mark_key(struct bkey *k)
+ {
+ for (unsigned i = 0; i < KEY_PTRS(k); i++)
+ PTR_BUCKET(c, k, i)->mark = GC_MARK_BTREE;
+ }
+
+ size_t available = 0;
+ struct bucket *b;
+ struct cache *ca;
+ uint64_t *i;
+
+ mutex_lock(&c->bucket_lock);
+
+ set_gc_sectors(c);
+ c->gc_mark_valid = 1;
+ c->need_gc = 0;
+ c->min_prio = initial_prio;
+
+ if (c->root)
+ mark_key(&c->root->key);
+
+ mark_key(&c->uuid_bucket);
+
+ for_each_cache(ca, c) {
+ ca->invalidate_needs_gc = 0;
+
+ for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
+ ca->buckets[*i].mark = GC_MARK_BTREE;
+
+ for (i = ca->prio_buckets;
+ i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
+ ca->buckets[*i].mark = GC_MARK_BTREE;
+
+ for_each_bucket(b, ca) {
+ /*
+ * the c->journal.cur check is a hack because when we're
+ * called from run_cache_set() gc_gen isn't going to be
+ * correct
+ */
+ cache_bug_on(c->journal.cur &&
+ gen_after(b->last_gc, b->gc_gen), c,
+ "found old gen in gc");
+
+ b->last_gc = b->gc_gen;
+ b->gc_gen = b->gen;
+ c->need_gc = max(c->need_gc, bucket_gc_gen(b));
+
+ if (!atomic_read(&b->pin) &&
+ b->mark >= 0) {
+ available++;
+ if (!b->mark)
+ bucket_add_unused(ca, b);
+ }
+
+ if (b->prio)
+ c->min_prio = min(c->min_prio, b->prio);
+ }
+ }
+
+ for (struct bcache_device **d = c->devices;
+ d < c->devices + c->nr_uuids;
+ d++)
+ if (*d) {
+ unsigned long last =
+ atomic_long_read(&((*d)->sectors_dirty));
+ long difference = (*d)->sectors_dirty_gc - last;
+
+ pr_debug("sectors dirty off by %li", difference);
+
+ (*d)->sectors_dirty_last += difference;
+
+ atomic_long_set(&((*d)->sectors_dirty),
+ (*d)->sectors_dirty_gc);
+ }
+
+ mutex_unlock(&c->bucket_lock);
+ return available;
+}
+
+static void btree_gc(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
+ int ret;
+ unsigned long available;
+ struct bucket *b;
+ struct cache *ca;
+
+ struct gc_stat stats;
+ struct closure writes;
+ struct btree_op op;
+
+ uint64_t start_time = local_clock();
+ trace_bcache_gc_start(c->sb.set_uuid);
+
+ memset(&stats, 0, sizeof(struct gc_stat));
+ closure_init_stack(&writes);
+ btree_op_init_stack(&op);
+ op.lock = SHRT_MAX;
+
+ blktrace_msg_all(c, "Starting gc");
+
+ mutex_lock(&c->bucket_lock);
+ for_each_cache(ca, c)
+ free_some_buckets(ca);
+
+ if (c->gc_mark_valid) {
+ c->gc_mark_valid = 0;
+ c->gc_done = ZERO_KEY;
+
+ for_each_cache(ca, c)
+ for_each_bucket(b, ca)
+ if (!atomic_read(&b->pin))
+ b->mark = 0;
+
+ for (struct bcache_device **d = c->devices;
+ d < c->devices + c->nr_uuids;
+ d++)
+ if (*d)
+ (*d)->sectors_dirty_gc = 0;
+ }
+ mutex_unlock(&c->bucket_lock);
+
+ ret = btree_root(gc_root, c, &op, &writes, &stats);
+ closure_sync(&op.cl);
+ closure_sync(&writes);
+
+ if (ret) {
+ blktrace_msg_all(c, "Stopped gc");
+ printk(KERN_WARNING "bcache: gc failed!\n");
+
+ continue_at(cl, btree_gc, bcache_wq);
+ }
+
+ /* Possibly wait for new UUIDs or whatever to hit disk */
+ bcache_journal_meta(c, &op.cl);
+ closure_sync(&op.cl);
+
+ available = btree_gc_finish(c);
+
+ time_stats_update(&c->btree_gc_time, start_time);
+
+ stats.key_bytes *= sizeof(uint64_t);
+ stats.dirty <<= 9;
+ stats.data <<= 9;
+ stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
+ memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
+ blktrace_msg_all(c, "Finished gc");
+
+ trace_bcache_gc_end(c->sb.set_uuid);
+ closure_wake_up(&c->bucket_wait);
+
+ closure_return(cl);
+}
+
+void bcache_queue_gc(struct cache_set *c)
+{
+ if (closure_trylock(&c->gc.cl, &c->cl))
+ continue_at(&c->gc.cl, btree_gc, bcache_wq);
+}
+
+/* Initial partial gc */
+
+static int btree_check_recurse(struct btree *b, struct btree_op *op,
+ unsigned long **seen)
+{
+ int ret;
+ struct bkey *k;
+ struct bucket *g;
+
+ for_each_key_filter(b, k, ptr_invalid) {
+ for (unsigned i = 0; i < KEY_PTRS(k); i++) {
+ if (!ptr_available(b->c, k, i))
+ continue;
+
+ g = PTR_BUCKET(b->c, k, i);
+
+ if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
+ seen[PTR_DEV(k, i)]) ||
+ !ptr_stale(b->c, k, i)) {
+ g->gen = PTR_GEN(k, i);
+
+ if (b->level)
+ g->prio = btree_prio;
+ else if (g->prio == btree_prio)
+ g->prio = initial_prio;
+ }
+ }
+
+ btree_mark_key(b, k);
+ }
+
+ if (b->level) {
+ k = next_recurse_key(b, &ZERO_KEY);
+
+ while (k) {
+ struct bkey *p = next_recurse_key(b, k);
+ if (p)
+ prefetch_bucket(b->c, p, b->level - 1);
+
+ ret = btree(check_recurse, k, b, op, seen);
+ if (ret)
+ return ret;
+
+ k = p;
+ }
+ }
+
+ return 0;
+}
+
+int btree_check(struct cache_set *c, struct btree_op *op)
+{
+ int ret = -ENOMEM;
+ unsigned long *seen[MAX_CACHES_PER_SET];
+
+ memset(seen, 0, sizeof(seen));
+
+ for (int i = 0; c->cache[i]; i++) {
+ size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
+ seen[i] = kmalloc(n, GFP_KERNEL);
+ if (!seen[i])
+ goto err;
+
+ /* Disables the seen array until prio_read() uses it too */
+ memset(seen[i], 0xFF, n);
+ }
+
+ ret = btree_root(check_recurse, c, op, seen);
+err:
+ for (int i = 0; i < MAX_CACHES_PER_SET; i++)
+ kfree(seen[i]);
+ return ret;
+}
+
+/* Btree insertion */
+
+static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
+{
+ struct bset *i = b->sets[b->nsets].data;
+
+ memmove((uint64_t *) where + bkey_u64s(insert),
+ where,
+ (void *) end(i) - (void *) where);
+
+ i->keys += bkey_u64s(insert);
+ bkey_copy(where, insert);
+ bset_fix_lookup_table(b, where);
+}
+
+static bool fix_overlapping_extents(struct btree *b,
+ struct bkey *insert,
+ struct btree_iter *iter,
+ struct btree_op *op)
+{
+ void subtract_dirty(struct bkey *k, int sectors)
+ {
+ struct bcache_device *d = b->c->devices[KEY_DEV(k)];
+
+ if (KEY_DIRTY(k) && d)
+ atomic_long_sub(sectors, &d->sectors_dirty);
+ }
+
+ unsigned sectors_found = 0;
+
+ while (1) {
+ struct bkey *k = btree_iter_next(iter);
+ if (!k ||
+ bkey_cmp(insert, &START_KEY(k)) <= 0)
+ break;
+
+ if (bkey_cmp(k, &START_KEY(insert)) <= 0)
+ continue;
+
+ if (op->type == BTREE_REPLACE) {
+ uint64_t offset = k->key - op->replace.key;
+ offset <<= 8;
+
+ BUG_ON(!KEY_PTRS(&op->replace));
+
+ if (KEY_START(k) > KEY_START(insert) + sectors_found)
+ goto check_failed;
+
+ if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
+ goto check_failed;
+
+ for (unsigned i = 0; i < KEY_PTRS(&op->replace); i++)
+ if (k->ptr[i] + offset != op->replace.ptr[i])
+ goto check_failed;
+
+ sectors_found = k->key - KEY_START(insert);
+ }
+
+ if (bkey_cmp(insert, k) < 0 &&
+ bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
+ /*
+ * We overlapped in the middle of an existing key: that
+ * means we have to split the old key. But we have to do
+ * slightly different things depending on whether the
+ * old key has been written out yet.
+ */
+
+ struct bkey *top;
+
+ subtract_dirty(k, KEY_SIZE(insert));
+
+ if (bkey_written(b, k)) {
+ /*
+ * We insert a new key to cover the top of the
+ * old key, and the old key is modified in place
+ * to represent the bottom split.
+ *
+ * It's completely arbitrary whether the new key
+ * is the top or the bottom, but it has to match
+ * up with what btree_sort_fixup() does - it
+ * doesn't check for this kind of overlap, it
+ * depends on us inserting a new key for the top
+ * here.
+ */
+ top = bset_search(b, &b->sets[b->nsets],
+ insert);
+ shift_keys(b, top, k);
+ } else {
+ BKEY_PADDED(key) temp;
+ bkey_copy(&temp.key, k);
+ shift_keys(b, k, &temp.key);
+ top = next(k);
+ }
+
+ cut_front(insert, top);
+ cut_back(&START_KEY(insert), k);
+ bset_fix_invalidated_key(b, k);
+ return false;
+ }
+
+ if (bkey_cmp(insert, k) < 0) {
+ if (bkey_cmp(insert, &START_KEY(k)) > 0)
+ subtract_dirty(k, insert->key - KEY_START(k));
+
+ cut_front(insert, k);
+ } else {
+ if (bkey_cmp(k, &START_KEY(insert)) > 0)
+ subtract_dirty(k, k->key - KEY_START(insert));
+
+ if (bkey_written(b, k) &&
+ bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0)
+ /*
+ * Completely overwrote, so we don't have to
+ * invalidate the binary search tree
+ */
+ cut_front(k, k);
+ else {
+ __cut_back(&START_KEY(insert), k);
+ bset_fix_invalidated_key(b, k);
+ }
+ }
+ }
+
+check_failed:
+ if (op->type == BTREE_REPLACE &&
+ sectors_found < KEY_SIZE(insert)) {
+ insert->key -= KEY_SIZE(insert) - sectors_found;
+ SET_KEY_SIZE(insert, sectors_found);
+
+ if (!sectors_found) {
+ op->insert_collision = true;
+ return true;
+ }
+ }
+
+ return false;
+}
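To make the middle-overlap case handled above concrete, here is a minimal
userspace sketch of the same split using plain integer sector ranges in place
of bkeys; struct extent and split_around() are invented for the example and
are not bcache code.

#include <stdio.h>

struct extent { unsigned start, end; };	/* covers sectors [start, end) */

/* Split 'old' around 'ins': whatever 'ins' does not cover survives. */
static int split_around(struct extent old, struct extent ins,
			struct extent out[2])
{
	int n = 0;

	if (old.start < ins.start)		/* bottom piece survives */
		out[n++] = (struct extent) { old.start, ins.start };
	if (ins.end < old.end)			/* top piece survives */
		out[n++] = (struct extent) { ins.end, old.end };
	return n;
}

int main(void)
{
	struct extent out[2];
	int n = split_around((struct extent) { 0, 100 },
			     (struct extent) { 40, 60 }, out);

	for (int i = 0; i < n; i++)
		printf("piece %d: [%u, %u)\n", i, out[i].start, out[i].end);
	return 0;	/* prints [0, 40) and [60, 100) */
}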
+
+static bool btree_insert_key(struct btree *b, struct btree_op *op,
+ struct bkey *k)
+{
+ struct bset *i = b->sets[b->nsets].data;
+ struct bkey *m, *prev;
+ const char *status = "insert";
+
+ BUG_ON(bkey_cmp(k, &b->key) > 0);
+ BUG_ON(b->level && !KEY_PTRS(k));
+ BUG_ON(!b->level && !k->key);
+
+ if (!b->level) {
+ struct btree_iter iter;
+ struct bkey search = KEY(KEY_DEV(k), KEY_START(k), 0);
+
+ /*
+ * bset_search() returns the first key that is strictly greater
+ * than the search key - but for back merging, we want to find
+ * the first key that is greater than or equal to KEY_START(k) -
+ * unless KEY_START(k) is 0.
+ */
+ if (search.key)
+ search.key--;
+
+ prev = NULL;
+ m = btree_iter_init(b, &iter, &search);
+
+ if (fix_overlapping_extents(b, k, &iter, op))
+ return false;
+
+ while (m != end(i) &&
+ bkey_cmp(k, &START_KEY(m)) > 0)
+ prev = m, m = next(m);
+
+ if (key_merging_disabled(b->c))
+ goto insert;
+
+ /* prev is in the tree, if we merge we're done */
+ status = "back merging";
+ if (prev &&
+ bkey_try_merge(b, prev, k))
+ goto merged;
+
+ status = "overwrote front";
+ if (m != end(i) &&
+ KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
+ goto copy;
+
+ status = "front merge";
+ if (m != end(i) &&
+ bkey_try_merge(b, k, m))
+ goto copy;
+ } else
+ m = bset_search(b, &b->sets[b->nsets], k);
+
+insert: shift_keys(b, m, k);
+copy: bkey_copy(m, k);
+merged:
+ check_keys(b, "%s for %s at %s: %s", status,
+ op_type(op), pbtree(b), pkey(k));
+ check_key_order_msg(b, i, "%s for %s at %s: %s", status,
+ op_type(op), pbtree(b), pkey(k));
+
+ if (b->level && !k->key)
+ b->prio_blocked++;
+
+ pr_debug("%s for %s at %s: %s", status,
+ op_type(op), pbtree(b), pkey(k));
+
+ return true;
+}
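The search.key-- adjustment above works because, with a search that returns
the first key strictly greater than its argument, searching for
KEY_START(k) - 1 lands on the first key >= KEY_START(k). A toy demonstration
over integers; first_gt() is invented for the example.

#include <stdio.h>

/* Index of the first element strictly greater than x. */
static int first_gt(const int *a, int n, int x)
{
	int i = 0;

	while (i < n && a[i] <= x)
		i++;
	return i;
}

int main(void)
{
	int keys[] = { 10, 20, 30, 40 };

	/* "first key > (30 - 1)" is the same key as "first key >= 30" */
	printf("%d\n", keys[first_gt(keys, 4, 30 - 1)]);	/* prints 30 */
	return 0;
}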
+
+bool bcache_btree_insert_keys(struct btree *b, struct btree_op *op)
+{
+ /* If a read generates a cache miss, and a write to the same location
+ * finishes before the new data is added to the cache, the write will
+ * be overwritten with stale data. We can catch this by never
+ * overwriting good data if it came from a read.
+ */
+ bool ret = false;
+ struct bkey *k;
+ unsigned oldsize = count_data(b);
+
+ while ((k = keylist_pop(&op->keys))) {
+ bkey_put(b->c, k, b->level);
+ ret |= btree_insert_key(b, op, k);
+ }
+
+ BUG_ON(count_data(b) < oldsize);
+ return ret;
+}
+
+bool btree_insert_check_key(struct btree *b, struct btree_op *op,
+ struct bio *bio)
+{
+ bool ret = false;
+ uint64_t btree_ptr = b->key.ptr[0];
+ unsigned long seq = b->seq;
+ BKEY_PADDED(k) tmp;
+
+ rw_unlock(false, b);
+ rw_lock(true, b, b->level);
+
+ if (b->key.ptr[0] != btree_ptr ||
+ b->seq != seq + 1 ||
+ should_split(b))
+ goto out;
+
+ op->replace = KEY(op->d->id, bio_end(bio), bio_sectors(bio));
+
+ SET_KEY_PTRS(&op->replace, 1);
+ get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
+
+ SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);
+
+ bkey_copy(&tmp.k, &op->replace);
+
+ BUG_ON(op->type != BTREE_INSERT);
+ BUG_ON(!btree_insert_key(b, op, &tmp.k));
+ btree_write(b, false, NULL);
+ ret = true;
+out:
+ downgrade_write(&b->lock);
+ return ret;
+}
+
+static int btree_split(struct btree *b, struct btree_op *op)
+{
+ bool split, root = b == b->c->root;
+ struct btree *n1, *n2 = NULL, *n3 = NULL;
+ uint64_t start_time = local_clock();
+
+ if (b->level)
+ set_closure_blocking(&op->cl);
+
+ n1 = btree_alloc_replacement(b, &op->cl);
+ if (IS_ERR(n1))
+ goto err;
+
+ split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
+
+ pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
+ pbtree(b), n1->sets[0].data->keys);
+
+ if (split) {
+ unsigned keys = 0;
+
+ n2 = bcache_btree_alloc(b->c, b->level, &op->cl);
+ if (IS_ERR(n2))
+ goto err_free1;
+
+ if (root) {
+ n3 = bcache_btree_alloc(b->c, b->level + 1, &op->cl);
+ if (IS_ERR(n3))
+ goto err_free2;
+ }
+
+ bcache_btree_insert_keys(n1, op);
+
+ /* Has to be a linear search because we don't have an auxiliary
+ * search tree yet
+ */
+
+ while (keys < (n1->sets[0].data->keys * 3) / 5)
+ keys += bkey_u64s(node(n1->sets[0].data, keys));
+
+ bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
+ keys += bkey_u64s(node(n1->sets[0].data, keys));
+
+ n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
+ n1->sets[0].data->keys = keys;
+
+ memcpy(n2->sets[0].data->start,
+ end(n1->sets[0].data),
+ n2->sets[0].data->keys * sizeof(uint64_t));
+
+ bkey_copy_key(&n2->key, &b->key);
+
+ keylist_add(&op->keys, &n2->key);
+ btree_write(n2, true, op);
+ rw_unlock(true, n2);
+ } else
+ bcache_btree_insert_keys(n1, op);
+
+ keylist_add(&op->keys, &n1->key);
+ btree_write(n1, true, op);
+
+ if (n3) {
+ bkey_copy_key(&n3->key, &MAX_KEY);
+ bcache_btree_insert_keys(n3, op);
+ btree_write(n3, true, op);
+
+ closure_sync(&op->cl);
+ bcache_btree_set_root(n3);
+ rw_unlock(true, n3);
+ } else if (root) {
+ op->keys.top = op->keys.bottom;
+ closure_sync(&op->cl);
+ bcache_btree_set_root(n1);
+ } else {
+ bkey_copy(op->keys.top, &b->key);
+ bkey_copy_key(op->keys.top, &ZERO_KEY);
+
+ for (unsigned i = 0; i < KEY_PTRS(&b->key); i++) {
+ uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
+
+ SET_PTR_GEN(op->keys.top, i, g);
+ }
+
+ keylist_push(&op->keys);
+ closure_sync(&op->cl);
+ atomic_inc(&b->c->prio_blocked);
+ }
+
+ rw_unlock(true, n1);
+ btree_free(b, op);
+
+ time_stats_update(&b->c->btree_split_time, start_time);
+
+ return 0;
+err_free2:
+ __bkey_put(n2->c, &n2->key);
+ btree_free(n2, op);
+ rw_unlock(true, n2);
+err_free1:
+ __bkey_put(n1->c, &n1->key);
+ btree_free(n1, op);
+ rw_unlock(true, n1);
+err:
+ if (n3 == ERR_PTR(-EAGAIN) ||
+ n2 == ERR_PTR(-EAGAIN) ||
+ n1 == ERR_PTR(-EAGAIN))
+ return -EAGAIN;
+
+ printk(KERN_WARNING "bcache: couldn't split\n");
+ return -ENOMEM;
+}
+
+static int btree_insert_recurse(struct btree *b, struct btree_op *op,
+ struct keylist *stack_keys)
+{
+ if (b->level) {
+ int ret;
+ struct bkey *insert = op->keys.bottom;
+ struct bkey *k = next_recurse_key(b, &START_KEY(insert));
+
+ if (!k) {
+ btree_bug(b, "no key to recurse on at level %i/%i",
+ b->level, b->c->root->level);
+
+ op->keys.top = op->keys.bottom;
+ return -EIO;
+ }
+
+ if (bkey_cmp(insert, k) > 0) {
+ if (op->type == BTREE_REPLACE) {
+ __bkey_put(b->c, insert);
+ op->keys.top = op->keys.bottom;
+ op->insert_collision = true;
+ return 0;
+ }
+
+ for (unsigned i = 0; i < KEY_PTRS(insert); i++)
+ atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);
+
+ bkey_copy(stack_keys->top, insert);
+
+ cut_back(k, insert);
+ cut_front(k, stack_keys->top);
+
+ keylist_push(stack_keys);
+ }
+
+ ret = btree(insert_recurse, k, b, op, stack_keys);
+ if (ret)
+ return ret;
+ }
+
+ if (!keylist_empty(&op->keys)) {
+ BUG_ON(!current_is_writer(&b->lock));
+
+ if (should_split(b)) {
+ if (op->lock <= b->c->root->level) {
+ BUG_ON(b->level);
+ op->lock = b->c->root->level + 1;
+ return -EINTR;
+ }
+ return btree_split(b, op);
+ }
+
+ BUG_ON(write_block(b) != b->sets[b->nsets].data);
+
+ if (bcache_btree_insert_keys(b, op))
+ btree_write(b, false, op);
+ }
+
+ return 0;
+}
+
+int bcache_btree_insert(struct btree_op *op, struct cache_set *c)
+{
+ int ret = 0;
+ struct cache *ca;
+ struct keylist stack_keys;
+
+ /*
+ * Don't want to block with the btree locked unless we have to,
+ * otherwise we get deadlocks with try_harder and between split/gc
+ */
+ clear_closure_blocking(&op->cl);
+
+ BUG_ON(keylist_empty(&op->keys));
+ keylist_copy(&stack_keys, &op->keys);
+ keylist_init(&op->keys);
+
+ while (c->need_gc > MAX_NEED_GC) {
+ closure_lock(&c->gc, &c->cl);
+ btree_gc(&c->gc.cl);
+ }
+
+ for_each_cache(ca, c)
+ while (ca->need_save_prio > MAX_SAVE_PRIO) {
+ mutex_lock(&c->bucket_lock);
+ free_some_buckets(ca);
+ mutex_unlock(&c->bucket_lock);
+
+ closure_wait_event_sync(&c->bucket_wait, &op->cl,
+ ca->need_save_prio <= MAX_SAVE_PRIO ||
+ can_save_prios(ca));
+ }
+
+ while (!keylist_empty(&stack_keys) ||
+ !keylist_empty(&op->keys)) {
+ if (keylist_empty(&op->keys)) {
+ keylist_add(&op->keys, keylist_pop(&stack_keys));
+ op->lock = 0;
+ }
+
+ ret = btree_root(insert_recurse, c, op, &stack_keys);
+
+ if (ret == -EAGAIN) {
+ ret = 0;
+ closure_sync(&op->cl);
+ } else if (ret) {
+ struct bkey *k;
+
+ printk(KERN_WARNING "bcache: error %i trying to "
+ "insert key for %s\n", ret, op_type(op));
+
+ while ((k = keylist_pop(&stack_keys) ?:
+ keylist_pop(&op->keys)))
+ bkey_put(c, k, 0);
+ }
+ }
+
+ keylist_free(&stack_keys);
+
+ if (op->journal)
+ atomic_dec_bug(op->journal);
+ op->journal = NULL;
+ return ret;
+}
+
+void bcache_btree_set_root(struct btree *b)
+{
+ BUG_ON(!b->written);
+ BUG_ON(!current_is_writer(&b->c->root->lock));
+
+ for (unsigned i = 0; i < KEY_PTRS(&b->key); i++)
+ BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != btree_prio);
+
+ mutex_lock(&b->c->bucket_lock);
+ list_del_init(&b->list);
+ mutex_unlock(&b->c->bucket_lock);
+
+ b->c->root = b;
+ __bkey_put(b->c, &b->key);
+
+ bcache_journal_meta(b->c, NULL);
+ pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
+}
+
+/* Cache lookup */
+
+static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
+ struct bkey *k)
+{
+ struct search *s = container_of(op, struct search, op);
+ struct bio *bio = &s->bio.bio;
+ int ret = 0;
+
+ while (!ret &&
+ !op->lookup_done) {
+ unsigned sectors = INT_MAX;
+
+ if (KEY_DEV(k) == s->op.d->id) {
+ if (KEY_START(k) <= bio->bi_sector)
+ break;
+
+ sectors = min_t(uint64_t, sectors,
+ KEY_START(k) - bio->bi_sector);
+ }
+
+ ret = s->op.d->cache_miss(b, s, bio, sectors);
+ }
+
+ return ret;
+}
+
+/*
+ * Read from a single key, handling the initial cache miss if the key starts in
+ * the middle of the bio
+ */
+static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
+ struct bkey *k)
+{
+ struct search *s = container_of(op, struct search, op);
+ struct bio *bio = &s->bio.bio;
+
+ unsigned sectors, ptr;
+ struct bio *n;
+
+ int ret = submit_partial_cache_miss(b, op, k);
+ if (ret || op->lookup_done)
+ return ret;
+
+ /* XXX: figure out best pointer - for multiple cache devices */
+ ptr = 0;
+
+ PTR_BUCKET(b->c, k, ptr)->prio = initial_prio;
+
+ while (!op->lookup_done &&
+ KEY_DEV(k) == s->op.d->id &&
+ bio->bi_sector < k->key) {
+ struct bkey *bio_key;
+ struct block_device *bdev = PTR_CACHE(b->c, k, ptr)->bdev;
+
+ sector_t sector = PTR_OFFSET(k, ptr) +
+ (bio->bi_sector - KEY_START(k));
+
+ sectors = min_t(unsigned, k->key - bio->bi_sector,
+ __bio_max_sectors(bio, bdev, sector));
+
+ n = bio_split_get(bio, sectors, op->d);
+ if (!n)
+ return -EAGAIN;
+
+ if (n == bio)
+ op->lookup_done = true;
+
+ bio_key = &container_of(n, struct bbio, bio)->key;
+
+ /*
+ * The bucket we're reading from might be reused while our bio
+ * is in flight, and we could then end up reading the wrong
+ * data.
+ *
+ * We guard against this by checking (in cache_read_endio()) if
+ * the pointer is stale again; if so, we treat it as an error
+ * and reread from the backing device (but we don't pass that
+ * error up anywhere).
+ */
+
+ bkey_copy_single_ptr(bio_key, k, ptr);
+ SET_PTR_OFFSET(bio_key, 0, sector);
+
+ n->bi_end_io = cache_read_endio;
+
+ trace_bcache_cache_hit(n);
+ __submit_bbio(n, b->c);
+ }
+
+ return 0;
+}
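The sector arithmetic above translates a backing-device sector that falls
inside a cached extent into the corresponding sector on the cache device. A
tiny userspace sketch of the same calculation, with the bkey macros replaced
by plain integers; cache_sector() is an invented name.

#include <stdint.h>
#include <stdio.h>

static uint64_t cache_sector(uint64_t ptr_offset, uint64_t key_start,
			     uint64_t bio_sector)
{
	/* offset of the bio within the extent, added to the cached copy */
	return ptr_offset + (bio_sector - key_start);
}

int main(void)
{
	/* extent covering backing sectors [1000, 1064), cached at 52480 */
	printf("%llu\n",
	       (unsigned long long) cache_sector(52480, 1000, 1016));
	return 0;	/* prints 52496 */
}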
+
+int btree_search_recurse(struct btree *b, struct btree_op *op)
+{
+ struct search *s = container_of(op, struct search, op);
+ struct bio *bio = &s->bio.bio;
+
+ int ret = 0;
+ struct bkey *k;
+ struct btree_iter iter;
+ btree_iter_init(b, &iter, &KEY(op->d->id, bio->bi_sector, 0));
+
+ pr_debug("at %s searching for %u:%llu", pbtree(b), op->d->id,
+ (uint64_t) bio->bi_sector);
+
+ do {
+ k = btree_iter_next(&iter);
+ if (!k) {
+ ret = submit_partial_cache_miss(b, op,
+ &KEY(KEY_DEV(&b->key), b->key.key, 0));
+ break;
+ }
+
+ if (ptr_bad(b, k))
+ continue;
+
+ ret = b->level
+ ? btree(search_recurse, k, b, op)
+ : submit_partial_cache_hit(b, op, k);
+ } while (!ret &&
+ !op->lookup_done);
+
+ return ret;
+}
+
+void bcache_btree_exit(void)
+{
+ if (btree_wq)
+ destroy_workqueue(btree_wq);
+}
+
+int __init bcache_btree_init(void)
+{
+ btree_wq = create_singlethread_workqueue("bcache_btree_io");
+ if (!btree_wq)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/drivers/block/bcache/btree.h b/drivers/block/bcache/btree.h
new file mode 100644
index 0000000..8fa9b57
--- /dev/null
+++ b/drivers/block/bcache/btree.h
@@ -0,0 +1,272 @@
+#ifndef _BCACHE_BTREE_H
+#define _BCACHE_BTREE_H
+
+#include "bset.h"
+#include "debug.h"
+
+struct btree_write {
+ struct closure *owner;
+ atomic_t *journal;
+
+ /* If btree_split() frees a btree node, it writes a new pointer to that
+ * btree node indicating it was freed; it takes a refcount on
+ * c->prio_blocked because we can't write the gens until the new
+ * pointer is on disk. This allows btree_write_endio() to release the
+ * refcount that btree_split() took.
+ */
+ int prio_blocked;
+};
+
+struct btree {
+ /* Hottest entries first */
+ struct hlist_node hash;
+
+ /* Key/pointer for this btree node */
+ BKEY_PADDED(key);
+
+ /* Single bit - set when accessed, cleared by shrinker */
+ unsigned long accessed;
+ unsigned long seq;
+ struct rw_semaphore lock;
+ struct cache_set *c;
+
+ unsigned long flags;
+ uint16_t written; /* would be nice to kill */
+ uint8_t level;
+ uint8_t nsets;
+ uint8_t page_order;
+
+ /*
+ * Set of sorted keys - the real btree node - plus a binary search tree
+ *
+ * sets[0] is special; sets[0]->tree, sets[0]->prev and sets[0]->data point
+ * to the memory we have allocated for this btree node. Additionally,
+ * sets[0]->data points to the entire btree node as it exists on disk.
+ */
+ struct bset_tree sets[MAX_BSETS];
+
+ /* Used to refcount bio splits, also protects b->bio */
+ struct closure_with_waitlist io;
+
+ /* Gets transferred to w->prio_blocked - see the comment there */
+ int prio_blocked;
+
+ struct list_head list;
+ struct delayed_work work;
+
+ uint64_t io_start_time;
+ struct btree_write writes[2];
+ struct bio *bio;
+};
+
+#define BTREE_FLAG(flag) \
+static inline bool btree_node_ ## flag(struct btree *b) \
+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void set_btree_node_ ## flag(struct btree *b) \
+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
+
+enum btree_flags {
+ BTREE_NODE_read_done,
+ BTREE_NODE_io_error,
+ BTREE_NODE_dirty,
+ BTREE_NODE_write_idx,
+};
+
+BTREE_FLAG(read_done);
+BTREE_FLAG(io_error);
+BTREE_FLAG(dirty);
+BTREE_FLAG(write_idx);
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+ return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+ return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline unsigned bset_offset(struct btree *b, struct bset *i)
+{
+ return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
+}
+
+static inline struct bset *write_block(struct btree *b)
+{
+ return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
+}
+
+static inline bool bset_written(struct btree *b, struct bset_tree *t)
+{
+ return t->data < write_block(b);
+}
+
+static inline bool bkey_written(struct btree *b, struct bkey *k)
+{
+ return k < write_block(b)->start;
+}
+
+static inline void set_gc_sectors(struct cache_set *c)
+{
+ atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
+}
+
+/* Looping macros */
+
+#define for_each_sorted_set_start(b, i, start) \
+ for (int _i = start; i = (b)->sets[_i].data, _i <= (b)->nsets; _i++)
+
+#define for_each_sorted_set(b, i) for_each_sorted_set_start(b, i, 0)
+
+#define bkey_filter(b, i, k, filter) \
+({ \
+ while (k < end(i) && filter(b, k)) \
+ k = next(k); \
+ k; \
+})
+
+#define all_keys(b, k) 0
+
+#define for_each_key_filter(b, k, filter) \
+ for (struct bset_tree *_t = (b)->sets; \
+ _t <= &(b)->sets[(b)->nsets]; \
+ _t++) \
+ for (k = _t->data->start; \
+ (k = bkey_filter(b, _t->data, k, filter)) \
+ < end(_t->data); \
+ k = next(k))
+
+#define for_each_key(b, k) for_each_key_filter(b, k, all_keys)
+
+/* Recursing down the btree */
+
+struct btree_op {
+ struct closure cl;
+ struct bcache_device *d;
+
+ /* Journal entry we have a refcount on */
+ atomic_t *journal;
+
+ /* Btree level at which we start taking write locks */
+ short lock;
+
+ /* Btree insertion type */
+ enum {
+ BTREE_INSERT,
+ BTREE_REPLACE
+ } type:8;
+
+ unsigned lookup_done:1;
+ unsigned insert_collision:1;
+ unsigned flush_journal:1;
+
+ /* Anything after this point won't get zeroed in do_bio_hook() */
+
+ /* Keys to be inserted */
+ struct keylist keys;
+ BKEY_PADDED(replace);
+};
+
+void btree_op_init_stack(struct btree_op *);
+
+static inline void rw_lock(bool w, struct btree *b, int level)
+{
+ w ? down_write_nested(&b->lock, level + 1)
+ : down_read_nested(&b->lock, level + 1);
+ if (w)
+ b->seq++;
+}
+
+static inline void rw_unlock(bool w, struct btree *b)
+{
+#ifdef CONFIG_BCACHE_EDEBUG
+ if (w &&
+ b->key.ptr[0] &&
+ btree_node_read_done(b))
+ for (unsigned i = 0; i <= b->nsets; i++)
+ check_key_order(b, b->sets[i].data);
+#endif
+
+ if (w)
+ b->seq++;
+ (w ? up_write : up_read)(&b->lock);
+}
+
+#define insert_lock(s, b) ((b)->level <= (s)->lock)
+
+/*
+ * These macros are for recursing down the btree - they handle the details of
+ * locking and looking up nodes in the cache for you. They're best treated as
+ * mere syntax when reading code that uses them.
+ *
+ * op->lock determines whether we take a read or a write lock at a given depth.
+ * If you've got a read lock and find that you need a write lock (i.e. you're
+ * going to have to split), set op->lock and return -EINTR; btree_root() will
+ * call you again and you'll have the correct lock.
+ */
+#define btree(f, k, b, op, ...) \
+({ \
+ int _r, l = (b)->level - 1; \
+ bool _w = l <= (op)->lock; \
+ struct btree *_b = get_bucket((b)->c, k, l, op); \
+ if (!IS_ERR(_b)) { \
+ _r = btree_ ## f(_b, op, ##__VA_ARGS__); \
+ rw_unlock(_w, _b); \
+ } else \
+ _r = PTR_ERR(_b); \
+ _r; \
+})
+
+#define btree_root(f, c, op, ...) \
+({ \
+ int _r = -EINTR; \
+ do { \
+ struct btree *_b = (c)->root; \
+ bool _w = insert_lock(op, _b); \
+ rw_lock(_w, _b, _b->level); \
+ if (_b == (c)->root && \
+ _w == insert_lock(op, _b)) \
+ _r = btree_ ## f(_b, op, ##__VA_ARGS__); \
+ rw_unlock(_w, _b); \
+ } while (_r == -EINTR); \
+ \
+ if ((c)->try_harder == &(op)->cl) { \
+ time_stats_update(&(c)->try_harder_time, \
+ (c)->try_harder_start); \
+ (c)->try_harder = NULL; \
+ __closure_wake_up(&(c)->try_wait); \
+ } \
+ _r; \
+})
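A userspace sketch of the -EINTR retry convention described in the comment
above: the operation notices it needs a heavier lock, records that in op->lock
and returns -EINTR, and the caller simply retries from the root. struct op and
insert_recurse() are stand-ins, not the kernel API; only the control flow is
the point.

#include <errno.h>
#include <stdio.h>

struct op { int lock; };	/* stand-in for struct btree_op */

static int insert_recurse(struct op *op, int root_level)
{
	if (op->lock < root_level) {
		op->lock = root_level;	/* ask for the heavier lock */
		return -EINTR;		/* caller restarts from the root */
	}
	return 0;			/* insert succeeded */
}

int main(void)
{
	struct op op = { .lock = 0 };
	int r, root_level = 2;

	do {
		r = insert_recurse(&op, root_level);
	} while (r == -EINTR);

	printf("done with op.lock = %d\n", op.lock);	/* prints 2 */
	return 0;
}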
+
+static inline bool should_split(struct btree *b)
+{
+ struct bset *i = write_block(b);
+ return b->written >= btree_blocks(b) ||
+ (i->seq == b->sets[0].data->seq &&
+ b->written + __set_blocks(i, i->keys + 15, b->c)
+ > btree_blocks(b));
+}
+
+void btree_read_done(struct closure *);
+void btree_read(struct btree *);
+void btree_write(struct btree *b, bool now, struct btree_op *op);
+
+void bcache_btree_set_root(struct btree *);
+struct btree *bcache_btree_alloc(struct cache_set *, int, struct closure *);
+struct btree *get_bucket(struct cache_set *, struct bkey *,
+ int, struct btree_op *);
+
+bool bcache_btree_insert_keys(struct btree *, struct btree_op *);
+bool btree_insert_check_key(struct btree *, struct btree_op *, struct bio *);
+int bcache_btree_insert(struct btree_op *, struct cache_set *);
+int btree_search_recurse(struct btree *, struct btree_op *);
+
+void bcache_queue_gc(struct cache_set *);
+size_t btree_gc_finish(struct cache_set *);
+int btree_check(struct cache_set *, struct btree_op *);
+void __btree_mark_key(struct cache_set *, int, struct bkey *);
+
+#endif
--
1.7.9.rc2
^ permalink raw reply related [flat|nested] 87+ messages in thread
* [Bcache v13 12/16] bcache: Bset code (lookups within a btree node)
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (6 preceding siblings ...)
2012-05-10 3:10 ` [Bcache v13 11/16] bcache: Core btree code Kent Overstreet
@ 2012-05-10 3:11 ` Kent Overstreet
[not found] ` <5b5998d7d09ec36377acdb5d15665d1e4e818521.1336619038.git.koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-05-10 3:11 ` [Bcache v13 13/16] bcache: Journalling Kent Overstreet
` (5 subsequent siblings)
13 siblings, 1 reply; 87+ messages in thread
From: Kent Overstreet @ 2012-05-10 3:11 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/block/bcache/bset.c | 1149 +++++++++++++++++++++++++++++++++++++++++++
drivers/block/bcache/bset.h | 218 ++++++++
2 files changed, 1367 insertions(+), 0 deletions(-)
create mode 100644 drivers/block/bcache/bset.c
create mode 100644 drivers/block/bcache/bset.h
diff --git a/drivers/block/bcache/bset.c b/drivers/block/bcache/bset.c
new file mode 100644
index 0000000..5823c47
--- /dev/null
+++ b/drivers/block/bcache/bset.c
@@ -0,0 +1,1149 @@
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+
+#include <linux/random.h>
+
+/* Keylists */
+
+void keylist_copy(struct keylist *dest, struct keylist *src)
+{
+ *dest = *src;
+
+ if (src->list == src->d) {
+ size_t n = (uint64_t *) src->top - src->d;
+ dest->top = (struct bkey *) &dest->d[n];
+ dest->list = dest->d;
+ }
+}
+
+int keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
+{
+ unsigned oldsize = (uint64_t *) l->top - l->list;
+ unsigned newsize = oldsize + 2 + nptrs;
+ uint64_t *new;
+
+ /* The journalling code doesn't handle the case where the set of keys to
+ * insert is bigger than an empty write: if we just return -ENOMEM here,
+ * bio_insert() and bio_invalidate() will insert the keys created so far
+ * and finish the rest when the keylist is empty.
+ */
+ if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+ return -ENOMEM;
+
+ newsize = roundup_pow_of_two(newsize);
+
+ if (newsize <= KEYLIST_INLINE ||
+ roundup_pow_of_two(oldsize) == newsize)
+ return 0;
+
+ new = krealloc(l->list == l->d ? NULL : l->list,
+ sizeof(uint64_t) * newsize, GFP_NOIO);
+
+ if (!new)
+ return -ENOMEM;
+
+ if (l->list == l->d)
+ memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE);
+
+ l->list = new;
+ l->top = (struct bkey *) (&l->list[oldsize]);
+
+ return 0;
+}
+
+struct bkey *keylist_pop(struct keylist *l)
+{
+ struct bkey *k = l->bottom;
+
+ if (k == l->top)
+ return NULL;
+
+ while (next(k) != l->top)
+ k = next(k);
+
+ return l->top = k;
+}
+
+/* Pointer validation */
+
+bool __ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
+{
+ if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
+ goto bad;
+
+ if (!level && KEY_SIZE(k) > k->key)
+ goto bad;
+
+ if (!KEY_SIZE(k))
+ return true;
+
+ for (unsigned i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i)) {
+ struct cache *ca = PTR_CACHE(c, k, i);
+ size_t bucket = PTR_BUCKET_NR(c, k, i);
+ size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+ if (KEY_SIZE(k) + r > c->sb.bucket_size ||
+ bucket < ca->sb.first_bucket ||
+ bucket >= ca->sb.nbuckets)
+ goto bad;
+ }
+
+ return false;
+bad:
+ cache_bug(c, "spotted bad key %s: %s", pkey(k), ptr_status(c, k));
+ return true;
+}
+
+bool ptr_invalid(struct btree *b, const struct bkey *k)
+{
+ return __ptr_invalid(b->c, b->level, k);
+}
+
+bool ptr_bad(struct btree *b, const struct bkey *k)
+{
+ struct bucket *g;
+ unsigned i, stale;
+
+ if (!bkey_cmp(k, &ZERO_KEY) ||
+ !KEY_PTRS(k) ||
+ ptr_invalid(b, k))
+ return true;
+
+ if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV)
+ return true;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(b->c, k, i)) {
+ g = PTR_BUCKET(b->c, k, i);
+ stale = ptr_stale(b->c, k, i);
+
+ btree_bug_on(stale > 96, b,
+ "key too stale: %i, need_gc %u",
+ stale, b->c->need_gc);
+
+ btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
+ b, "stale dirty pointer");
+
+ if (stale)
+ return true;
+
+#ifdef CONFIG_BCACHE_EDEBUG
+ if (!mutex_trylock(&b->c->bucket_lock))
+ continue;
+
+ if (b->level) {
+ if (KEY_DIRTY(k) ||
+ g->prio != btree_prio ||
+ (b->c->gc_mark_valid &&
+ g->mark != GC_MARK_BTREE))
+ goto bug;
+
+ } else {
+ if (g->prio == btree_prio)
+ goto bug;
+
+ if (KEY_DIRTY(k) &&
+ b->c->gc_mark_valid &&
+ g->mark != GC_MARK_DIRTY)
+ goto bug;
+ }
+ mutex_unlock(&b->c->bucket_lock);
+#endif
+ }
+
+ return false;
+#ifdef CONFIG_BCACHE_EDEBUG
+bug:
+ mutex_unlock(&b->c->bucket_lock);
+ btree_bug(b, "inconsistent pointer %s: bucket %li pin %i "
+ "prio %i gen %i last_gc %i mark %i gc_gen %i", pkey(k),
+ PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+ g->prio, g->gen, g->last_gc, g->mark, g->gc_gen);
+ return true;
+#endif
+}
+
+/* Key/pointer manipulation */
+
+void bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, unsigned i)
+{
+ BUG_ON(i > KEY_PTRS(src));
+
+ /* Only copy the header, key, and one pointer. */
+ memcpy(dest, src, 2 * sizeof(uint64_t));
+ dest->ptr[0] = src->ptr[i];
+ SET_KEY_PTRS(dest, 1);
+ /* We didn't copy the checksum so clear that bit. */
+ SET_KEY_CSUM(dest, 0);
+}
+
+bool __cut_front(const struct bkey *where, struct bkey *k)
+{
+ unsigned len = 0;
+
+ if (bkey_cmp(where, &START_KEY(k)) <= 0)
+ return false;
+
+ if (bkey_cmp(where, k) < 0)
+ len = k->key - where->key;
+ else
+ bkey_copy_key(k, where);
+
+ for (unsigned i = 0; i < KEY_PTRS(k); i++)
+ SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);
+
+ BUG_ON(len > KEY_SIZE(k));
+ SET_KEY_SIZE(k, len);
+ return true;
+}
+
+bool __cut_back(const struct bkey *where, struct bkey *k)
+{
+ unsigned len = 0;
+
+ if (bkey_cmp(where, k) >= 0)
+ return false;
+
+ BUG_ON(KEY_DEV(where) != KEY_DEV(k));
+
+ if (bkey_cmp(where, &START_KEY(k)) > 0)
+ len = where->key - KEY_START(k);
+
+ bkey_copy_key(k, where);
+
+ BUG_ON(len > KEY_SIZE(k));
+ SET_KEY_SIZE(k, len);
+ return true;
+}
+
+static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
+{
+ return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
+ ~((uint64_t)1 << 63);
+}
+
+/* Tries to merge l and r: l should be lower than r.
+ * Returns true if we were able to merge. If we did merge, l will be the merged
+ * key, r will be untouched.
+ */
+bool bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
+{
+ if (key_merging_disabled(b->c))
+ return false;
+
+ if (KEY_PTRS(l) != KEY_PTRS(r) ||
+ KEY_DIRTY(l) != KEY_DIRTY(r) ||
+ bkey_cmp(l, &START_KEY(r)))
+ return false;
+
+ for (unsigned j = 0; j < KEY_PTRS(l); j++)
+ if (l->ptr[j] + PTR(0, KEY_SIZE(l), 0) != r->ptr[j] ||
+ PTR_BUCKET_NR(b->c, l, j) != PTR_BUCKET_NR(b->c, r, j))
+ return false;
+
+ /* Keys with no pointers aren't restricted to one bucket and could
+ * overflow KEY_SIZE
+ */
+ if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
+ l->key += USHRT_MAX - KEY_SIZE(l);
+ SET_KEY_SIZE(l, USHRT_MAX);
+
+ cut_front(l, r);
+ return false;
+ }
+
+ if (KEY_CSUM(l)) {
+ if (KEY_CSUM(r))
+ l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
+ else
+ SET_KEY_CSUM(l, 0);
+ }
+
+ SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
+ l->key += KEY_SIZE(r);
+
+ return true;
+}
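For intuition, here is the adjacency test at the heart of the merge, reduced
to plain integer extents in userspace; the real bkey_try_merge() above also
checks the dirty bit, per-pointer bucket membership, checksums and KEY_SIZE
overflow. struct extent and try_merge() are invented for the example.

#include <stdbool.h>
#include <stdio.h>

struct extent { unsigned dev, end, size; };	/* end and size in sectors */

/* Merge r into l when l ends exactly where r starts, on the same device. */
static bool try_merge(struct extent *l, const struct extent *r)
{
	if (l->dev != r->dev || l->end != r->end - r->size)
		return false;

	l->end = r->end;
	l->size += r->size;
	return true;
}

int main(void)
{
	struct extent l = { 0, 40, 40 };	/* covers [0, 40) */
	struct extent r = { 0, 100, 60 };	/* covers [40, 100) */

	if (try_merge(&l, &r))
		printf("merged: end %u size %u\n", l.end, l.size);
	return 0;	/* prints "merged: end 100 size 100" */
}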
+
+/* Binary tree stuff for auxiliary search trees */
+
+static unsigned inorder_next(unsigned j, unsigned size)
+{
+ if (j * 2 + 1 < size) {
+ j = j * 2 + 1;
+
+ while (j * 2 < size)
+ j *= 2;
+ } else
+ j >>= ffz(j) + 1;
+
+ return j;
+}
+
+static unsigned inorder_prev(unsigned j, unsigned size)
+{
+ if (j * 2 < size) {
+ j = j * 2;
+
+ while (j * 2 + 1 < size)
+ j = j * 2 + 1;
+ } else
+ j >>= ffs(j);
+
+ return j;
+}
+
+/* I have no idea why this code works... and I'm the one who wrote it
+ *
+ * However, I do know what it does:
+ * Given a binary tree constructed in an array (i.e. how you normally implement
+ * a heap), it converts a node in the tree - referenced by array index - to the
+ * index it would have if you did an inorder traversal.
+ *
+ * The binary tree starts at array index 1, not 0
+ * extra is a function of size:
+ * extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+ */
+static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
+{
+ unsigned b = fls(j);
+ unsigned shift = fls(size - 1) - b;
+
+ j ^= 1U << (b - 1);
+ j <<= 1;
+ j |= 1;
+ j <<= shift;
+
+ if (j > extra)
+ j -= (j - extra) >> 1;
+
+ return j;
+}
+
+static unsigned to_inorder(unsigned j, struct bset_tree *t)
+{
+ return __to_inorder(j, t->size, t->extra);
+}
+
+static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
+{
+ unsigned shift;
+
+ if (j > extra)
+ j += j - extra;
+
+ shift = ffs(j);
+
+ j >>= shift;
+ j |= roundup_pow_of_two(size) >> shift;
+
+ return j;
+}
+
+static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
+{
+ return __inorder_to_tree(j, t->size, t->extra);
+}
+
+#if 0
+void inorder_test(void)
+{
+ unsigned long done = 0;
+ ktime_t start = ktime_get();
+
+ for (unsigned size = 2;
+ size < 65536000;
+ size++) {
+ unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+ unsigned i = 1, j = rounddown_pow_of_two(size - 1);
+
+ if (!(size % 4096))
+ printk(KERN_NOTICE "loop %u, %llu per us\n", size,
+ done / ktime_us_delta(ktime_get(), start));
+
+ while (1) {
+ if (__inorder_to_tree(i, size, extra) != j)
+ panic("size %10u j %10u i %10u", size, j, i);
+
+ if (__to_inorder(j, size, extra) != i)
+ panic("size %10u j %10u i %10u", size, j, i);
+
+ if (j == rounddown_pow_of_two(size) - 1)
+ break;
+
+ BUG_ON(inorder_prev(inorder_next(j, size), size) != j);
+
+ j = inorder_next(j, size);
+ i++;
+ }
+
+ done += size - 1;
+ }
+}
+#endif
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * t->tree is a binary search tree in an array; each node corresponds to a key
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; to_inorder() gives us the cacheline, and then
+ * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
+ * of the previous key so we can walk backwards to it from t->tree[j]'s key.
+ */
+
+static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
+ unsigned offset)
+{
+ return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
+{
+ return ((void *) k - (void *) t->data) / BSET_CACHELINE;
+}
+
+static unsigned bkey_to_cacheline_offset(struct bkey *k)
+{
+ return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
+}
+
+static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
+{
+ return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
+}
+
+static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
+{
+ return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, we just keep a simple lookup table in t->prev.
+ */
+struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
+{
+ return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
+}
+
+/* Auxiliary search trees */
+
+static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
+{
+#ifdef CONFIG_X86_64
+ asm("shrd %[shift],%[high],%[low]"
+ : [low] "+Rm" (low)
+ : [high] "R" (high),
+ [shift] "ci" (shift)
+ : "cc");
+#else
+ low >>= shift;
+ low |= (high << 1) << (63U - shift);
+#endif
+ return low;
+}
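A quick userspace check, relying on the GCC/Clang __int128 extension, that the
portable branch above really computes the low 64 bits of a 128-bit right
shift; the test harness is not part of the patch.

#include <stdint.h>
#include <stdio.h>

static uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
{
	/* same as the non-x86_64 branch above */
	low >>= shift;
	low |= (high << 1) << (63U - shift);
	return low;
}

int main(void)
{
	uint64_t high = 0x0123456789abcdefULL, low = 0xfedcba9876543210ULL;
	unsigned __int128 v = ((unsigned __int128) high << 64) | low;

	for (unsigned s = 0; s < 64; s++)
		if ((uint64_t) (v >> s) != shrd128(high, low, (uint8_t) s))
			printf("mismatch at shift %u\n", s);

	printf("shrd128() matches a plain 128-bit shift for shifts 0..63\n");
	return 0;
}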
+
+static inline unsigned bfloat_mantissa(const struct bkey *k,
+ struct bkey_float *f)
+{
+ const uint64_t *p = &k->key - (f->exponent >> 6);
+ return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
+}
+
+static void make_bfloat(struct bset_tree *t, unsigned j)
+{
+ struct bkey_float *f = &t->tree[j];
+ struct bkey *m = tree_to_bkey(t, j);
+ struct bkey *p = tree_to_prev_bkey(t, j);
+
+ struct bkey *l = is_power_of_2(j)
+ ? t->data->start
+ : tree_to_prev_bkey(t, j >> ffs(j));
+
+ struct bkey *r = is_power_of_2(j + 1)
+ ? node(t->data, t->data->keys - bkey_u64s(&t->end))
+ : tree_to_bkey(t, j >> (ffz(j) + 1));
+
+ BUG_ON(m < l || m > r);
+ BUG_ON(next(p) != m);
+
+ if (KEY_DEV(l) != KEY_DEV(r))
+ f->exponent = fls64(KEY_DEV(r) ^ KEY_DEV(l)) + 64;
+ else
+ f->exponent = fls64(r->key ^ l->key);
+
+ f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);
+
+ if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
+ f->mantissa = bfloat_mantissa(m, f) - 1;
+ else
+ f->exponent = 127;
+}
+
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+{
+ if (t != b->sets) {
+ unsigned j = roundup(t[-1].size,
+ 64 / sizeof(struct bkey_float));
+
+ t->tree = t[-1].tree + j;
+ t->prev = t[-1].prev + j;
+ }
+
+ while (t < b->sets + MAX_BSETS)
+ t++->size = 0;
+}
+
+static void bset_build_unwritten_tree(struct btree *b)
+{
+ struct bset_tree *t = b->sets + b->nsets;
+
+ bset_alloc_tree(b, t);
+
+ if (t->tree != b->sets->tree + bset_tree_space(b)) {
+ t->prev[0] = bkey_to_cacheline_offset(t->data->start);
+ t->size = 1;
+ }
+}
+
+static void bset_build_written_tree(struct btree *b)
+{
+ struct bset_tree *t = b->sets + b->nsets;
+ struct bkey *k = t->data->start;
+ unsigned j, cacheline = 1;
+
+ bset_alloc_tree(b, t);
+
+ t->size = min_t(unsigned,
+ bkey_to_cacheline(t, end(t->data)),
+ b->sets->tree + bset_tree_space(b) - t->tree);
+
+ if (t->size < 2) {
+ t->size = 0;
+ return;
+ }
+
+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+ /* First we figure out where the first key in each cacheline is */
+ for (j = inorder_next(0, t->size);
+ j;
+ j = inorder_next(j, t->size)) {
+ while (bkey_to_cacheline(t, k) != cacheline)
+ k = next(k);
+
+ t->prev[j] = bkey_u64s(k);
+ k = next(k);
+ cacheline++;
+ t->tree[j].m = bkey_to_cacheline_offset(k);
+ }
+
+ while (next(k) != end(t->data))
+ k = next(k);
+
+ t->end = *k;
+
+ /* Then we build the tree */
+ for (j = inorder_next(0, t->size);
+ j;
+ j = inorder_next(j, t->size))
+ make_bfloat(t, j);
+}
+
+void bset_fix_invalidated_key(struct btree *b, struct bkey *k)
+{
+ struct bset_tree *t;
+ unsigned inorder, j = 1;
+
+ for (t = b->sets; t <= &b->sets[b->nsets]; t++)
+ if (k < end(t->data))
+ goto found_set;
+
+ BUG();
+found_set:
+ if (!t->size || !bset_written(b, t))
+ return;
+
+ inorder = bkey_to_cacheline(t, k);
+
+ if (k == t->data->start)
+ goto fix_left;
+
+ if (next(k) == end(t->data)) {
+ t->end = *k;
+ goto fix_right;
+ }
+
+ j = inorder_to_tree(inorder, t);
+
+ if (j &&
+ j < t->size &&
+ k == tree_to_bkey(t, j))
+fix_left: do {
+ make_bfloat(t, j);
+ j = j * 2;
+ } while (j < t->size);
+
+ j = inorder_to_tree(inorder + 1, t);
+
+ if (j &&
+ j < t->size &&
+ k == tree_to_prev_bkey(t, j))
+fix_right: do {
+ make_bfloat(t, j);
+ j = j * 2 + 1;
+ } while (j < t->size);
+}
+
+void bset_fix_lookup_table(struct btree *b, struct bkey *k)
+{
+ struct bset_tree *t = &b->sets[b->nsets];
+ unsigned shift = bkey_u64s(k);
+ unsigned j = bkey_to_cacheline(t, k);
+
+ /* We're getting called from btree_split() or btree_gc, just bail out */
+ if (!t->size)
+ return;
+
+ /* k is the key we just inserted; we need to find the entry in the
+ * lookup table for the first key that is strictly greater than k:
+ * it's either k's cacheline or the next one
+ */
+ if (j < t->size &&
+ table_to_bkey(t, j) <= k)
+ j++;
+
+ /* Adjust all the lookup table entries, and find a new key for any that
+ * have gotten too big
+ */
+ for (; j < t->size; j++) {
+ t->prev[j] += shift;
+
+ if (t->prev[j] > 7) {
+ k = table_to_bkey(t, j - 1);
+
+ while (k < cacheline_to_bkey(t, j, 0))
+ k = next(k);
+
+ t->prev[j] = bkey_to_cacheline_offset(k);
+ }
+ }
+
+ if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
+ return;
+
+ /* Possibly add a new entry to the end of the lookup table */
+
+ for (k = table_to_bkey(t, t->size - 1);
+ k != end(t->data);
+ k = next(k))
+ if (t->size == bkey_to_cacheline(t, k)) {
+ t->prev[t->size] = bkey_to_cacheline_offset(k);
+ t->size++;
+ }
+}
+
+void bset_init_next(struct btree *b)
+{
+ struct bset *i = write_block(b);
+
+ if (i != b->sets[0].data) {
+ b->sets[++b->nsets].data = i;
+ i->seq = b->sets[0].data->seq;
+ } else
+ get_random_bytes(&i->seq, sizeof(uint64_t));
+
+ i->magic = bset_magic(b->c);
+ i->version = 0;
+ i->keys = 0;
+
+ bset_build_unwritten_tree(b);
+}
+
+struct bset_search_iter {
+ struct bkey *l, *r;
+};
+
+__attribute__((optimize(3)))
+static struct bset_search_iter bset_search_write_set(struct btree *b,
+ struct bset_tree *t,
+ const struct bkey *search)
+{
+ unsigned li = 0, ri = t->size;
+
+ BUG_ON(!b->nsets &&
+ t->size < bkey_to_cacheline(t, end(t->data)));
+
+ while (li + 1 != ri) {
+ unsigned m = (li + ri) >> 1;
+
+ if (bkey_cmp(table_to_bkey(t, m), search) > 0)
+ ri = m;
+ else
+ li = m;
+ }
+
+ return (struct bset_search_iter) {
+ table_to_bkey(t, li),
+ ri < t->size ? table_to_bkey(t, ri) : end(t->data)
+ };
+}
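The same narrowing loop as bset_search_write_set(), reduced to a sorted array
of ints, may make the invariant easier to see: keep a[li] <= search and
everything at or beyond ri greater than search, then return ri. search_gt()
is invented for the example and, for the printed claim, assumes the first
element is <= the search key.

#include <stdio.h>

static int search_gt(const int *a, int n, int search)
{
	int li = 0, ri = n;

	while (li + 1 != ri) {
		int m = (li + ri) / 2;

		if (a[m] > search)
			ri = m;
		else
			li = m;
	}
	return ri;	/* index of the first element > search */
}

int main(void)
{
	int a[] = { 10, 20, 30, 40 };

	printf("%d\n", search_gt(a, 4, 25));	/* prints 2: a[2] == 30 */
	return 0;
}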
+
+__attribute__((optimize(3)))
+static struct bset_search_iter bset_search_tree(struct btree *b,
+ struct bset_tree *t,
+ const struct bkey *search)
+{
+ struct bkey *l, *r;
+ struct bkey_float *f;
+ unsigned inorder, j, n = 1;
+
+ do {
+ unsigned p = n << 4;
+ p &= ((int) (p - t->size)) >> 31;
+
+ prefetch(&t->tree[p]);
+
+ j = n;
+ f = &t->tree[j];
+
+ /*
+ * n = (f->mantissa > bfloat_mantissa())
+ * ? j * 2
+ * : j * 2 + 1;
+ *
+ * We need to subtract 1 from f->mantissa for the sign bit trick
+ * to work - that's done in make_bfloat()
+ */
+ if (likely(f->exponent != 127))
+ n = j * 2 + (((unsigned)
+ (f->mantissa -
+ bfloat_mantissa(search, f))) >> 31);
+ else
+ n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
+ ? j * 2
+ : j * 2 + 1;
+ } while (n < t->size);
+
+ inorder = to_inorder(j, t);
+
+ /*
+ * n would have been the node we recursed to - the low bit tells us if
+ * we recursed left or recursed right.
+ */
+ if (n & 1) {
+ l = cacheline_to_bkey(t, inorder, f->m);
+
+ if (++inorder != t->size) {
+ f = &t->tree[inorder_next(j, t->size)];
+ r = cacheline_to_bkey(t, inorder, f->m);
+ } else
+ r = end(t->data);
+ } else {
+ r = cacheline_to_bkey(t, inorder, f->m);
+
+ if (--inorder) {
+ f = &t->tree[inorder_prev(j, t->size)];
+ l = cacheline_to_bkey(t, inorder, f->m);
+ } else
+ l = t->data->start;
+ }
+
+ return (struct bset_search_iter) {l, r};
+}
+
+__attribute__((optimize(3)))
+struct bkey *__bset_search(struct btree *b, struct bset_tree *t,
+ const struct bkey *search)
+{
+ struct bset_search_iter i;
+
+ /*
+ * First, we search for a cacheline, and then we do a linear search
+ * within that cacheline.
+ *
+ * To search for the cacheline, there are three different possibilities:
+ * * The set is too small to have a search tree, so we just do a linear
+ * search over the whole set.
+ * * The set is the one we're currently inserting into; keeping a full
+ * auxiliary search tree up to date would be too expensive, so we
+ * use a much simpler lookup table to do a binary search -
+ * bset_search_write_set().
+ * * Or we use the auxiliary search tree we constructed earlier -
+ * bset_search_tree()
+ */
+
+ if (unlikely(!t->size)) {
+ i.l = t->data->start;
+ i.r = end(t->data);
+ } else if (bset_written(b, t)) {
+ /*
+ * Each node in the auxiliary search tree covers a certain range
+ * of bits, and keys above and below the set it covers might
+ * differ outside those bits - so we have to special case the
+ * start and end - handle that here:
+ */
+
+ if (unlikely(bkey_cmp(search, &t->end) >= 0))
+ return end(t->data);
+
+ if (unlikely(bkey_cmp(search, t->data->start) < 0))
+ return t->data->start;
+
+ i = bset_search_tree(b, t, search);
+ } else
+ i = bset_search_write_set(b, t, search);
+
+#ifdef CONFIG_BCACHE_EDEBUG
+ BUG_ON(bset_written(b, t) &&
+ i.l != t->data->start &&
+ bkey_cmp(tree_to_prev_bkey(t,
+ inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
+ search) > 0);
+
+ BUG_ON(i.r != end(t->data) &&
+ bkey_cmp(i.r, search) <= 0);
+#endif
+
+ while (likely(i.l != i.r) &&
+ bkey_cmp(i.l, search) <= 0)
+ i.l = next(i.l);
+
+ return i.l;
+}
+
+/* Btree iterator */
+
+static inline bool btree_iter_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
+{
+ int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
+
+ return c ? c > 0 : l.k < r.k;
+}
+
+static inline bool btree_iter_end(struct btree_iter *iter)
+{
+ return !iter->used;
+}
+
+void btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end)
+{
+ if (k != end)
+ BUG_ON(!heap_add(iter,
+ ((struct btree_iter_set) { k, end }),
+ btree_iter_cmp));
+}
+
+struct bkey *__btree_iter_init(struct btree *b, struct btree_iter *iter,
+ struct bkey *search, struct bset_tree *start)
+{
+ struct bkey *ret = NULL;
+ iter->size = ARRAY_SIZE(iter->data);
+ iter->used = 0;
+
+ for (; start <= &b->sets[b->nsets]; start++) {
+ ret = bset_search(b, start, search);
+ btree_iter_push(iter, ret, end(start->data));
+ }
+
+ return ret;
+}
+
+struct bkey *btree_iter_next(struct btree_iter *iter)
+{
+ struct btree_iter_set unused;
+ struct bkey *ret = NULL;
+
+ if (!btree_iter_end(iter)) {
+ ret = iter->data->k;
+ iter->data->k = next(iter->data->k);
+
+ if (iter->data->k > iter->data->end) {
+ __WARN();
+ iter->data->k = iter->data->end;
+ }
+
+ if (iter->data->k == iter->data->end)
+ heap_pop(iter, unused, btree_iter_cmp);
+ else
+ heap_sift(iter, 0, btree_iter_cmp);
+ }
+
+ return ret;
+}
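btree_iter_next() above is a k-way merge across the node's sorted sets, driven
by a small heap. A userspace sketch of the same idea over int arrays, with a
linear scan standing in for the heap; iter_set and iter_next() are invented
names.

#include <stdio.h>

struct iter_set { const int *k, *end; };

/* Return and consume the smallest head among the sets, NULL when drained. */
static const int *iter_next(struct iter_set *sets, int n)
{
	struct iter_set *best = NULL;

	for (int i = 0; i < n; i++)
		if (sets[i].k != sets[i].end &&
		    (!best || *sets[i].k < *best->k))
			best = &sets[i];
	return best ? best->k++ : NULL;
}

int main(void)
{
	int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 };
	struct iter_set sets[] = { { a, a + 3 }, { b, b + 3 } };
	const int *k;

	while ((k = iter_next(sets, 2)))
		printf("%d ", *k);	/* prints 1 2 3 4 9 10 */
	printf("\n");
	return 0;
}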
+
+struct bkey *next_recurse_key(struct btree *b, struct bkey *search)
+{
+ struct bkey *ret;
+ struct btree_iter iter;
+ btree_iter_init(b, &iter, search);
+
+ do
+ ret = btree_iter_next(&iter);
+ while (ret && ptr_bad(b, ret));
+
+ return ret;
+}
+
+/* Mergesort */
+
+static void btree_sort_fixup(struct btree_iter *iter)
+{
+ while (iter->used > 1) {
+ struct btree_iter_set *top = iter->data, *i = top + 1;
+ struct bkey *k;
+
+ if (iter->used > 2 &&
+ btree_iter_cmp(i[0], i[1]))
+ i++;
+
+ for (k = i->k;
+ k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
+ k = next(k))
+ if (top->k > i->k)
+ __cut_front(top->k, k);
+ else if (KEY_SIZE(k))
+ cut_back(&START_KEY(k), top->k);
+
+ if (top->k < i->k || k == i->k)
+ break;
+
+ heap_sift(iter, i - top, btree_iter_cmp);
+ }
+}
+
+static void btree_mergesort(struct btree *b, struct bset *out,
+ struct btree_iter *iter,
+ bool fixup, bool remove_stale)
+{
+ struct bkey *k, *last = NULL;
+ bool (*bad)(struct btree *, const struct bkey *) = remove_stale
+ ? ptr_bad
+ : ptr_invalid;
+
+ while (!btree_iter_end(iter)) {
+ if (fixup && !b->level)
+ btree_sort_fixup(iter);
+
+ k = btree_iter_next(iter);
+ if (bad(b, k))
+ continue;
+
+ if (!last) {
+ last = out->start;
+ bkey_copy(last, k);
+ } else if (b->level ||
+ !bkey_try_merge(b, last, k)) {
+ last = next(last);
+ bkey_copy(last, k);
+ }
+ }
+
+ out->keys = last ? (uint64_t *) next(last) - out->d : 0;
+
+ pr_debug("sorted %i keys", out->keys);
+ check_key_order(b, out);
+}
+
+static void __btree_sort(struct btree *b, struct btree_iter *iter,
+ unsigned start, unsigned order, bool fixup)
+{
+ uint64_t start_time;
+ bool remove_stale = !b->written;
+ struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
+ order);
+ if (!out) {
+ mutex_lock(&b->c->sort_lock);
+ out = b->c->sort;
+ order = ilog2(bucket_pages(b->c));
+ }
+
+ start_time = local_clock();
+
+ btree_mergesort(b, out, iter, fixup, remove_stale);
+ b->nsets = start;
+
+ if (!fixup && !start && b->written)
+ btree_verify(b, out);
+
+ if (!start && order == b->page_order) {
+ out->magic = bset_magic(b->c);
+ out->seq = b->sets[0].data->seq;
+ out->version = b->sets[0].data->version;
+ swap(out, b->sets[0].data);
+
+ if (b->c->sort == b->sets[0].data)
+ b->c->sort = out;
+ } else {
+ b->sets[start].data->keys = out->keys;
+ memcpy(b->sets[start].data->start, out->start,
+ (void *) end(out) - (void *) out->start);
+ }
+
+ if (out == b->c->sort)
+ mutex_unlock(&b->c->sort_lock);
+ else
+ free_pages((unsigned long) out, order);
+
+ if (b->written)
+ bset_build_written_tree(b);
+
+ if (!start) {
+ spin_lock(&b->c->sort_time_lock);
+ time_stats_update(&b->c->sort_time, start_time);
+ spin_unlock(&b->c->sort_time_lock);
+ }
+}
+
+void btree_sort_partial(struct btree *b, unsigned start)
+{
+ size_t oldsize = 0, order = b->page_order, keys = 0;
+ struct btree_iter iter;
+ __btree_iter_init(b, &iter, NULL, &b->sets[start]);
+
+ BUG_ON(b->sets[b->nsets].data == write_block(b) &&
+ (b->sets[b->nsets].size || b->nsets));
+
+ if (b->written)
+ oldsize = count_data(b);
+
+ if (start) {
+ struct bset *i;
+ for_each_sorted_set_start(b, i, start)
+ keys += i->keys;
+
+ order = roundup_pow_of_two(__set_bytes(i, keys)) / PAGE_SIZE;
+ if (order)
+ order = ilog2(order);
+ }
+
+ __btree_sort(b, &iter, start, order, false);
+
+ EBUG_ON(b->written && count_data(b) != oldsize);
+}
+
+void btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
+{
+ BUG_ON(!b->written);
+ __btree_sort(b, iter, 0, b->page_order, true);
+}
+
+void btree_sort_into(struct btree *b, struct btree *new)
+{
+ uint64_t start_time = local_clock();
+
+ struct btree_iter iter;
+ btree_iter_init(b, &iter, NULL);
+
+ btree_mergesort(b, new->sets->data, &iter, false, true);
+
+ spin_lock(&b->c->sort_time_lock);
+ time_stats_update(&b->c->sort_time, start_time);
+ spin_unlock(&b->c->sort_time_lock);
+
+ bkey_copy_key(&new->key, &b->key);
+ new->sets->size = 0;
+}
+
+void btree_sort_lazy(struct btree *b)
+{
+ if (b->nsets) {
+ struct bset *i;
+ unsigned keys = 0, total;
+
+ for_each_sorted_set(b, i)
+ keys += i->keys;
+ total = keys;
+
+ for (unsigned j = 0; j < b->nsets; j++) {
+ if (keys * 2 < total ||
+ keys < 1000) {
+ btree_sort_partial(b, j);
+ return;
+ }
+
+ keys -= b->sets[j].data->keys;
+ }
+
+ /* Must sort if b->nsets == 3 or we'll overflow */
+ if (b->nsets >= (MAX_BSETS - 1) - b->level) {
+ btree_sort(b);
+ return;
+ }
+ }
+
+ bset_build_written_tree(b);
+}
+
+/* Sysfs stuff */
+
+struct bset_stats {
+ size_t writes, sets, keys, trees, floats, failed, tree_space;
+};
+
+static int btree_bset_stats(struct btree *b, struct btree_op *op,
+ struct bset_stats *stats)
+{
+ struct bkey *k;
+
+ if (btree_node_dirty(b))
+ stats->writes++;
+ stats->sets += b->nsets + 1;
+ stats->tree_space += bset_tree_space(b);
+
+ for (int i = 0; i < MAX_BSETS && b->sets[i].size; i++) {
+ stats->trees++;
+ stats->keys += b->sets[i].data->keys * sizeof(uint64_t);
+ stats->floats += b->sets[i].size - 1;
+
+ for (size_t j = 1; j < b->sets[i].size; j++)
+ if (b->sets[i].tree[j].exponent == 127)
+ stats->failed++;
+ }
+
+ if (b->level)
+ for_each_key_filter(b, k, ptr_bad) {
+ int ret = btree(bset_stats, k, b, op, stats);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bset_print_stats(struct cache_set *c, char *buf)
+{
+ struct btree_op op;
+ struct bset_stats t;
+
+ btree_op_init_stack(&op);
+ memset(&t, 0, sizeof(struct bset_stats));
+
+ btree_root(bset_stats, c, &op, &t);
+
+ return snprintf(buf, PAGE_SIZE,
+ "sets: %zu\n"
+ "write sets: %zu\n"
+ "key bytes: %zu\n"
+ "trees: %zu\n"
+ "tree space: %zu\n"
+ "floats: %zu\n"
+ "bytes/float: %zu\n"
+ "failed: %zu\n",
+ t.sets, t.writes, t.keys, t.trees, t.tree_space,
+ t.floats, DIV_SAFE(t.keys, t.floats), t.failed);
+}
diff --git a/drivers/block/bcache/bset.h b/drivers/block/bcache/bset.h
new file mode 100644
index 0000000..47c959a
--- /dev/null
+++ b/drivers/block/bcache/bset.h
@@ -0,0 +1,218 @@
+#ifndef _BCACHE_BSET_H
+#define _BCACHE_BSET_H
+
+/* Btree key comparison/iteration */
+
+struct btree_iter {
+ size_t size, used;
+ struct btree_iter_set {
+ struct bkey *k, *end;
+ } data[MAX_BSETS];
+};
+
+struct bset_tree {
+ /*
+ * We construct a binary tree in an array as if the array
+ * started at 1, so that things line up on the same cachelines
+ * better: see comments in bset.c at cacheline_to_bkey() for
+ * details
+ */
+
+ /* size of the binary tree and prev array */
+ unsigned size;
+
+ /* function of size - precalculated for to_inorder() */
+ unsigned extra;
+
+ /* copy of the last key in the set */
+ struct bkey end;
+ struct bkey_float *tree;
+
+ /*
+ * The nodes in the bset tree point to specific keys - this
+ * array holds the size of the previous key for each node.
+ *
+ * Conceptually it's a member of struct bkey_float, but we want
+ * to keep bkey_float to 4 bytes and prev isn't used in the fast
+ * path.
+ */
+ uint8_t *prev;
+
+ /* The actual btree node, with pointers to each sorted set */
+ struct bset *data;
+};
+
+static __always_inline int64_t bkey_cmp(const struct bkey *l,
+ const struct bkey *r)
+{
+ return unlikely(KEY_DEV(l) != KEY_DEV(r))
+ ? (int64_t) KEY_DEV(l) - (int64_t) KEY_DEV(r)
+ : (int64_t) l->key - (int64_t) r->key;
+}
+
+static inline size_t bkey_u64s(const struct bkey *k)
+{
+ BUG_ON(KEY_CSUM(k) > 1);
+ return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0);
+}
+
+static inline size_t bkey_bytes(const struct bkey *k)
+{
+ return bkey_u64s(k) * sizeof(uint64_t);
+}
+
+static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
+{
+ memcpy(dest, src, bkey_bytes(src));
+}
+
+static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
+{
+ if (!src)
+ src = &KEY(0, 0, 0);
+
+ SET_KEY_DEV(dest, KEY_DEV(src));
+ dest->key = src->key;
+}
+
+static inline struct bkey *next(const struct bkey *k)
+{
+ uint64_t *d = (void *) k;
+ return (struct bkey *) (d + bkey_u64s(k));
+}
+
+/* Keylists */
+
+struct keylist {
+ struct bkey *top;
+ union {
+ uint64_t *list;
+ struct bkey *bottom;
+ };
+
+ /* Enough room for btree_split's keys without realloc */
+#define KEYLIST_INLINE 16
+ uint64_t d[KEYLIST_INLINE];
+};
+
+static inline void keylist_init(struct keylist *l)
+{
+ l->top = (void *) (l->list = l->d);
+}
+
+static inline void keylist_push(struct keylist *l)
+{
+ l->top = next(l->top);
+}
+
+static inline void keylist_add(struct keylist *l, struct bkey *k)
+{
+ bkey_copy(l->top, k);
+ keylist_push(l);
+}
+
+static inline bool keylist_empty(struct keylist *l)
+{
+ return l->top == (void *) l->list;
+}
+
+static inline void keylist_free(struct keylist *l)
+{
+ if (l->list != l->d)
+ kfree(l->list);
+}
+
+void keylist_copy(struct keylist *, struct keylist *);
+struct bkey *keylist_pop(struct keylist *);
+int keylist_realloc(struct keylist *, int, struct cache_set *);
+
+void bkey_copy_single_ptr(struct bkey *, const struct bkey *, unsigned);
+bool __cut_front(const struct bkey *, struct bkey *);
+bool __cut_back(const struct bkey *, struct bkey *);
+
+static inline bool cut_front(const struct bkey *where, struct bkey *k)
+{
+ BUG_ON(bkey_cmp(where, k) > 0);
+ return __cut_front(where, k);
+}
+
+static inline bool cut_back(const struct bkey *where, struct bkey *k)
+{
+ BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
+ return __cut_back(where, k);
+}
+
+const char *ptr_status(struct cache_set *, const struct bkey *);
+bool __ptr_invalid(struct cache_set *, int level, const struct bkey *);
+bool ptr_invalid(struct btree *, const struct bkey *);
+bool ptr_bad(struct btree *, const struct bkey *);
+
+static inline uint8_t gen_after(uint8_t a, uint8_t b)
+{
+ uint8_t r = a - b;
+ return r > 128U ? 0 : r;
+}
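gen_after() is the wraparound-safe "how much newer is a than b" comparison on
8-bit bucket generations. A small userspace example of the wrap being handled;
the function body is copied from above.

#include <stdint.h>
#include <stdio.h>

static uint8_t gen_after(uint8_t a, uint8_t b)
{
	uint8_t r = a - b;
	return r > 128U ? 0 : r;
}

int main(void)
{
	/* 2 is 7 generations newer than 251, despite the 8-bit wrap */
	printf("%u\n", gen_after(2, 251));	/* prints 7 */
	/* in the other direction a is not newer at all, so 0 */
	printf("%u\n", gen_after(251, 2));	/* prints 0 */
	return 0;
}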
+
+static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
+ unsigned i)
+{
+ return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
+}
+
+static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
+ unsigned i)
+{
+ return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+}
+
+struct bkey *next_recurse_key(struct btree *, struct bkey *);
+struct bkey *btree_iter_next(struct btree_iter *);
+void btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
+struct bkey *__btree_iter_init(struct btree *, struct btree_iter *,
+ struct bkey *, struct bset_tree *);
+
+#define btree_iter_init(b, iter, search) \
+ __btree_iter_init(b, iter, search, (b)->sets)
+
+#define BKEY_MID_BITS 3
+#define BKEY_MID_MAX (~(~0 << (BKEY_MID_BITS - 1)))
+#define BKEY_MID_MIN (-1 - BKEY_MID_MAX)
+
+#define BKEY_EXPONENT_BITS 7
+#define BKEY_MANTISSA_BITS 22
+#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
+
+struct bkey_float {
+ unsigned exponent:BKEY_EXPONENT_BITS;
+ unsigned m:BKEY_MID_BITS;
+ unsigned mantissa:BKEY_MANTISSA_BITS;
+} __packed;
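The three bitfields are sized so that bkey_float packs into exactly 4 bytes,
as the struct bset_tree comment notes. A userspace check of that packing; the
GCC/Clang attribute spelling stands in for the kernel's __packed.

#include <stdio.h>

struct bkey_float {
	unsigned exponent:7;
	unsigned m:3;
	unsigned mantissa:22;
} __attribute__((packed));

int main(void)
{
	printf("%zu\n", sizeof(struct bkey_float));	/* prints 4 */
	return 0;
}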
+
+#define BSET_CACHELINE 128
+#define BSET_CACHELINE_BITS ilog2(BSET_CACHELINE)
+
+#define bset_tree_space(b) (btree_data_space(b) >> BSET_CACHELINE_BITS)
+
+#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float))
+#define bset_prev_bytes(b) (bset_tree_bytes(b) >> 2)
+
+void bset_init_next(struct btree *);
+
+void bset_fix_invalidated_key(struct btree *, struct bkey *);
+void bset_fix_lookup_table(struct btree *, struct bkey *);
+
+struct bkey *__bset_search(struct btree *, struct bset_tree *,
+ const struct bkey *);
+#define bset_search(b, t, search) \
+ ((search) ? __bset_search(b, t, search) : (t)->data->start)
+
+bool bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
+void btree_sort_lazy(struct btree *);
+void btree_sort_into(struct btree *, struct btree *);
+void btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
+void btree_sort_partial(struct btree *, unsigned);
+#define btree_sort(b) btree_sort_partial(b, 0)
+
+int bset_print_stats(struct cache_set *, char *);
+
+#endif
--
1.7.9.rc2
^ permalink raw reply related [flat|nested] 87+ messages in thread
* [Bcache v13 13/16] bcache: Journalling
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (7 preceding siblings ...)
2012-05-10 3:11 ` [Bcache v13 12/16] bcache: Bset code (lookups within a btree node) Kent Overstreet
@ 2012-05-10 3:11 ` Kent Overstreet
2012-05-10 3:11 ` [Bcache v13 14/16] bcache: Request, io and allocation code Kent Overstreet
` (4 subsequent siblings)
13 siblings, 0 replies; 87+ messages in thread
From: Kent Overstreet @ 2012-05-10 3:11 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/block/bcache/journal.c | 722 ++++++++++++++++++++++++++++++++++++++++
drivers/block/bcache/journal.h | 113 +++++++
2 files changed, 835 insertions(+), 0 deletions(-)
create mode 100644 drivers/block/bcache/journal.c
create mode 100644 drivers/block/bcache/journal.h
diff --git a/drivers/block/bcache/journal.c b/drivers/block/bcache/journal.c
new file mode 100644
index 0000000..2e43ab6
--- /dev/null
+++ b/drivers/block/bcache/journal.c
@@ -0,0 +1,722 @@
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+/*
+ * Journal replay/recovery:
+ *
+ * This code is all driven from run_cache_set(); we first read the journal
+ * entries, do some other stuff, then we mark all the keys in the journal
+ * entries (same as garbage collection would), then we replay them - reinserting
+ * them into the cache in precisely the same order as they appear in the
+ * journal.
+ *
+ * We only journal keys that go in leaf nodes, which simplifies things quite a
+ * bit.
+ */
+
+static void journal_read_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ bio_put(bio);
+ closure_put(cl);
+}
+
+static int journal_read_bucket(struct cache *ca, struct list_head *list,
+ struct btree_op *op, unsigned bucket_index)
+{
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio;
+
+ struct journal_replay *i;
+ struct jset *j, *data = ca->set->journal.w[0].data;
+ unsigned len, left, offset = 0;
+ int ret = 0;
+ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
+
+ pr_debug("reading %llu", (uint64_t) bucket);
+
+ while (offset < ca->sb.bucket_size) {
+reread: left = ca->sb.bucket_size - offset;
+ len = min_t(unsigned, left, PAGE_SECTORS * 8);
+
+ bio_reset(bio);
+ bio->bi_sector = bucket + offset;
+ bio->bi_bdev = ca->bdev;
+ bio->bi_rw = READ;
+ bio->bi_size = len << 9;
+
+ bio->bi_end_io = journal_read_endio;
+ bio->bi_private = &op->cl;
+ bio_map(bio, data);
+
+ closure_bio_submit(bio, &op->cl, ca->set->bio_split);
+ closure_sync(&op->cl);
+
+ /* This function could be simpler now since we no longer write
+ * journal entries that overlap bucket boundaries; this means
+ * the start of a bucket will always have a valid journal entry
+ * if it has any journal entries at all.
+ */
+
+ j = data;
+ while (len) {
+ struct list_head *where;
+ size_t blocks, bytes = set_bytes(j);
+
+ if (j->magic != jset_magic(ca->set))
+ return ret;
+
+ if (bytes > left << 9)
+ return ret;
+
+ if (bytes > len << 9)
+ goto reread;
+
+ if (j->csum != csum_set(j))
+ return ret;
+
+ blocks = set_blocks(j, ca->set);
+
+ while (!list_empty(list)) {
+ i = list_first_entry(list,
+ struct journal_replay, list);
+ if (i->j.seq >= j->last_seq)
+ break;
+ list_del(&i->list);
+ kfree(i);
+ }
+
+ list_for_each_entry_reverse(i, list, list) {
+ if (j->seq == i->j.seq)
+ goto next_set;
+
+ if (j->seq < i->j.last_seq)
+ goto next_set;
+
+ if (j->seq > i->j.seq) {
+ where = &i->list;
+ goto add;
+ }
+ }
+
+ where = list;
+add:
+ i = kmalloc(offsetof(struct journal_replay, j) +
+ bytes, GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
+ memcpy(&i->j, j, bytes);
+ list_add(&i->list, where);
+ ret = 1;
+
+ ja->seq[bucket_index] = j->seq;
+next_set:
+ offset += blocks * ca->sb.block_size;
+ len -= blocks * ca->sb.block_size;
+ j = ((void *) j) + blocks * block_bytes(ca);
+ }
+ }
+
+ return ret;
+}
+
+int bcache_journal_read(struct cache_set *c, struct list_head *list,
+ struct btree_op *op)
+{
+#define read_bucket(b) \
+ ({ \
+ int ret = journal_read_bucket(ca, list, op, b); \
+ __set_bit(b, bitmap); \
+ if (ret < 0) \
+ return ret; \
+ ret; \
+ })
+
+ struct cache *ca;
+
+ for_each_cache(ca, c) {
+ struct journal_device *ja = &ca->journal;
+ unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
+ unsigned l, r, m;
+ uint64_t seq;
+
+ bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+ pr_debug("%u journal buckets", ca->sb.njournal_buckets);
+
+ /* Read journal buckets ordered by golden ratio hash to quickly
+ * find a sequence of buckets with valid journal entries
+ */
+ for (unsigned i = 0; i < ca->sb.njournal_buckets; i++) {
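+ /* 2654435769 ~= 2^32 / golden ratio (Knuth's multiplicative hash) */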
+ l = (i * 2654435769U) % ca->sb.njournal_buckets;
+
+ if (test_bit(l, bitmap))
+ break;
+
+ if (read_bucket(l))
+ goto bsearch;
+ }
+
+ /* If that fails, check all the buckets we haven't checked
+ * already
+ */
+ pr_debug("falling back to linear search");
+
+ for (l = 0; l < ca->sb.njournal_buckets; l++) {
+ if (test_bit(l, bitmap))
+ continue;
+
+ if (read_bucket(l))
+ goto bsearch;
+ }
+bsearch:
+ /* Binary search */
+ m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+ pr_debug("starting binary search, l %u r %u", l, r);
+
+ while (l + 1 < r) {
+ m = (l + r) >> 1;
+
+ if (read_bucket(m))
+ l = m;
+ else
+ r = m;
+ }
+
+ /* Read buckets in reverse order until we stop finding more
+ * journal entries
+ */
+ pr_debug("finishing up");
+ l = m;
+
+ while (1) {
+ if (!l--)
+ l = ca->sb.njournal_buckets - 1;
+
+ if (l == m)
+ break;
+
+ if (test_bit(l, bitmap))
+ continue;
+
+ if (!read_bucket(l))
+ break;
+ }
+
+ seq = 0;
+
+ for (unsigned i = 0; i < ca->sb.njournal_buckets; i++)
+ if (ja->seq[i] > seq) {
+ seq = ja->seq[i];
+ ja->cur = ja->last = i;
+ }
+ }
+
+ c->journal.seq = list_entry(list->prev,
+ struct journal_replay,
+ list)->j.seq;
+
+ return 0;
+#undef read_bucket
+}
+
+void bcache_journal_mark(struct cache_set *c, struct list_head *list)
+{
+ atomic_t p = { 0 };
+ struct journal_replay *i;
+ struct journal *j = &c->journal;
+ uint64_t last = j->seq;
+
+ /*
+ * journal.pin should never fill up - we never write a journal
+ * entry when it would fill up. But if for some reason it does, we
+ * iterate over the list in reverse order so that we can just skip that
+ * refcount instead of bugging.
+ */
+
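+ /*
+ * Walk the replay list newest to oldest, pushing one pin onto the front
+ * of journal.pin per sequence number (when there's room), so each
+ * replayed entry has a pin to hold while its keys are reinserted.
+ */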
+ list_for_each_entry_reverse(i, list, list) {
+ BUG_ON(last < i->j.seq);
+ i->pin = NULL;
+
+ while (last-- != i->j.seq)
+ if (fifo_free(&j->pin) > 1) {
+ fifo_push_front(&j->pin, p);
+ atomic_set(&fifo_front(&j->pin), 0);
+ }
+
+ if (fifo_free(&j->pin) > 1) {
+ fifo_push_front(&j->pin, p);
+ i->pin = &fifo_front(&j->pin);
+ atomic_set(i->pin, 1);
+ }
+
+ for (struct bkey *k = i->j.start; k < end(&i->j); k = next(k)) {
+ for (unsigned j = 0; j < KEY_PTRS(k); j++) {
+ struct bucket *g = PTR_BUCKET(c, k, j);
+ atomic_inc(&g->pin);
+
+ if (g->prio == btree_prio &&
+ !ptr_stale(c, k, j))
+ g->prio = initial_prio;
+ }
+
+ __btree_mark_key(c, 0, k);
+ }
+ }
+}
+
+int bcache_journal_replay(struct cache_set *s, struct list_head *list,
+ struct btree_op *op)
+{
+ int ret = 0, keys = 0, entries = 0;
+ struct journal_replay *i =
+ list_entry(list->prev, struct journal_replay, list);
+
+ uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
+
+ list_for_each_entry(i, list, list) {
+ BUG_ON(i->pin && atomic_read(i->pin) != 1);
+
+ if (n != i->j.seq)
+ err_printk("journal entries %llu-%llu "
+ "missing! (replaying %llu-%llu)\n",
+ n, i->j.seq - 1, start, end);
+
+ for (struct bkey *k = i->j.start; k < end(&i->j); k = next(k)) {
+ pr_debug("%s", pkey(k));
+ bkey_copy(op->keys.top, k);
+ keylist_push(&op->keys);
+
+ op->journal = i->pin;
+ atomic_inc(op->journal);
+
+ ret = bcache_btree_insert(op, s);
+ if (ret)
+ goto err;
+
+ BUG_ON(!keylist_empty(&op->keys));
+ keys++;
+ }
+
+ if (i->pin)
+ atomic_dec(i->pin);
+ n = i->j.seq + 1;
+ entries++;
+ }
+
+ printk(KERN_INFO "bcache: journal replay done, %i keys in %i "
+ "entries, seq %llu-%llu\n", keys, entries, start, end);
+
+ while (!list_empty(list)) {
+ i = list_first_entry(list, struct journal_replay, list);
+ list_del(&i->list);
+ kfree(i);
+ }
+err:
+ closure_sync(&op->cl);
+ return ret;
+}
+
+/* Journalling */
+
+static void btree_flush_write(struct cache_set *c)
+{
+ /*
+ * Try to find the btree node that references the oldest journal
+ * entry; best is our current candidate, and is locked if non-NULL:
+ */
+ struct btree *b, *best;
+
+ /*
+ * The root of the btree isn't on the lru list. Normally this is fine
+ * because only leaf nodes can have references to journal entries -
+ * unless the root _is_ a leaf node. So we have to special case that:
+ */
+
+ while (!c->root->level) {
+ best = c->root;
+ rw_lock(true, best, 0);
+
+ if (best == c->root && !best->level)
+ goto found;
+ rw_unlock(true, best);
+ }
+
+ mutex_lock(&c->bucket_lock);
+
+ best = NULL;
+ list_for_each_entry(b, &c->btree_cache, list) {
+ if (!down_write_trylock(&b->lock))
+ continue;
+
+ if (!btree_node_dirty(b) ||
+ !btree_current_write(b)->journal) {
+ rw_unlock(true, b);
+ continue;
+ }
+
+ if (!best)
+ best = b;
+ else if (journal_pin_cmp(c,
+ btree_current_write(best),
+ btree_current_write(b))) {
+ rw_unlock(true, best);
+ best = b;
+ } else
+ rw_unlock(true, b);
+ }
+
+ if (best)
+ goto out;
+
+ /* Couldn't find a node referencing a journal entry; just pick the first dirty leaf */
+ list_for_each_entry(b, &c->btree_cache, list)
+ if (!b->level && btree_node_dirty(b)) {
+ best = b;
+ mutex_unlock(&c->bucket_lock);
+ rw_lock(true, best, best->level);
+ goto found;
+ }
+
+out:
+ mutex_unlock(&c->bucket_lock);
+
+ if (!best)
+ return;
+found:
+ if (btree_node_dirty(best))
+ btree_write(best, true, NULL);
+ rw_unlock(true, best);
+}
+
+#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
+
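+/*
+ * Free up journal space: drop pins that have gone to zero, advance each
+ * cache's ja->last past journal buckets whose newest entry is older than
+ * last_seq, and if we're out of free blocks pick the next bucket(s) to
+ * write to.
+ */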
+static void journal_reclaim(struct cache_set *c)
+{
+ struct bkey *k = &c->journal.key;
+ struct cache *ca;
+ uint64_t last_seq;
+ unsigned n = 0;
+ atomic_t p;
+
+ while (!atomic_read(&fifo_front(&c->journal.pin)))
+ fifo_pop(&c->journal.pin, p);
+
+ last_seq = last_seq(&c->journal);
+
+ for_each_cache(ca, c) {
+ struct journal_device *ja = &ca->journal;
+
+ while (ja->last != ja->cur &&
+ ja->seq[ja->last] < last_seq)
+ if (++ja->last == ca->sb.njournal_buckets)
+ ja->last = 0;
+ }
+
+ if (c->journal.blocks_free)
+ return;
+
+ /*
+ * Now we allocate:
+ * XXX: Sort by free journal space
+ */
+
+ for_each_cache(ca, c) {
+ struct journal_device *ja = &ca->journal;
+ unsigned next = (ja->cur + 1) % ca->sb.njournal_buckets;
+
+ if (next == ja->last)
+ continue;
+
+ ja->cur = next;
+ k->ptr[n++] = PTR(0,
+ bucket_to_sector(c, ca->sb.d[ja->cur]),
+ ca->sb.nr_this_dev);
+ }
+
+ k->header = KEY_HEADER(0, 0);
+ SET_KEY_PTRS(k, n);
+
+ if (n)
+ c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+
+ if (!journal_full(&c->journal))
+ __closure_wake_up(&c->journal.wait);
+}
+
+void bcache_journal_next(struct journal *j)
+{
+ atomic_t p = { 1 };
+
+ j->cur = (j->cur == j->w)
+ ? &j->w[1]
+ : &j->w[0];
+
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for last_seq() to be calculated correctly
+ */
+ BUG_ON(!fifo_push(&j->pin, p));
+ atomic_set(&fifo_back(&j->pin), 1);
+
+ j->cur->data->seq = ++j->seq;
+ j->cur->need_write = false;
+ j->cur->data->keys = 0;
+
+ if (fifo_full(&j->pin))
+ pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
+}
+
+static void journal_write_endio(struct bio *bio, int error)
+{
+ struct journal_write *w = bio->bi_private;
+
+ cache_set_err_on(error, w->c, "journal io error");
+
+ bio_put(bio);
+ closure_put(&w->c->journal.io.cl);
+}
+
+static void journal_write(struct closure *);
+
+static void journal_write_done(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io.cl);
+ struct cache_set *c = container_of(j, struct cache_set, journal);
+
+ struct journal_write *w = (j->cur == j->w)
+ ? &j->w[1]
+ : &j->w[0];
+
+ __closure_wake_up(&w->wait);
+
+ if (c->journal_delay_ms)
+ closure_sleep(&j->io, msecs_to_jiffies(c->journal_delay_ms));
+
+ continue_at(cl, journal_write, system_wq);
+}
+
+static void journal_write_unlocked(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+ struct cache *ca;
+ struct journal_write *w = c->journal.cur;
+ struct bkey *k = &c->journal.key;
+ unsigned sectors = set_blocks(w->data, c) * c->sb.block_size;
+
+ struct bio *bio;
+ struct bio_list list;
+ bio_list_init(&list);
+
+ if (!w->need_write) {
+ /*
+ * XXX: have to unlock closure before we unlock journal lock,
+ * else we race with bcache_journal(). But this way we race
+ * against cache set unregister. Doh.
+ */
+ set_closure_fn(cl, NULL, NULL);
+ closure_sub(cl, CLOSURE_RUNNING + 1);
+ spin_unlock(&c->journal.lock);
+ return;
+ } else if (journal_full(&c->journal)) {
+ journal_reclaim(c);
+ spin_unlock(&c->journal.lock);
+
+ btree_flush_write(c);
+ continue_at(cl, journal_write, system_wq);
+ }
+
+ c->journal.blocks_free -= set_blocks(w->data, c);
+
+ w->data->btree_level = c->root->level;
+
+ bkey_copy(&w->data->btree_root, &c->root->key);
+ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
+
+ for_each_cache(ca, c)
+ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
+
+ w->data->magic = jset_magic(c);
+ w->data->version = BCACHE_JSET_VERSION;
+ w->data->last_seq = last_seq(&c->journal);
+ w->data->csum = csum_set(w->data);
+
+ for (unsigned i = 0; i < KEY_PTRS(k); i++) {
+ ca = PTR_CACHE(c, k, i);
+ bio = &ca->journal.bio;
+
+ atomic_long_add(sectors, &ca->meta_sectors_written);
+
+ bio_reset(bio);
+ bio->bi_sector = PTR_OFFSET(k, i);
+ bio->bi_bdev = ca->bdev;
+ bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
+ bio->bi_size = sectors << 9;
+
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = w;
+ bio_map(bio, w->data);
+
+ trace_bcache_journal_write(bio);
+ bio_list_add(&list, bio);
+
+ SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
+
+ ca->journal.seq[ca->journal.cur] = w->data->seq;
+ }
+
+ atomic_dec_bug(&fifo_back(&c->journal.pin));
+ bcache_journal_next(&c->journal);
+ journal_reclaim(c);
+
+ spin_unlock(&c->journal.lock);
+
+ while ((bio = bio_list_pop(&list)))
+ closure_bio_submit(bio, cl, c->bio_split);
+
+ continue_at(cl, journal_write_done, NULL);
+}
+
+static void journal_write(struct closure *cl)
+{
+ struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+
+ spin_lock(&c->journal.lock);
+ journal_write_unlocked(cl);
+}
+
+static void __journal_try_write(struct cache_set *c, bool noflush)
+{
+ struct closure *cl = &c->journal.io.cl;
+
+ if (!closure_trylock(cl, &c->cl))
+ spin_unlock(&c->journal.lock);
+ else if (noflush && journal_full(&c->journal)) {
+ spin_unlock(&c->journal.lock);
+ continue_at(cl, journal_write, system_wq);
+ } else
+ journal_write_unlocked(cl);
+}
+
+#define journal_try_write(c) __journal_try_write(c, false)
+
+void bcache_journal_meta(struct cache_set *c, struct closure *cl)
+{
+ struct journal_write *w;
+
+ if (CACHE_SYNC(&c->sb)) {
+ spin_lock(&c->journal.lock);
+
+ w = c->journal.cur;
+ w->need_write = true;
+
+ if (cl)
+ BUG_ON(!closure_wait(&w->wait, cl));
+
+ __journal_try_write(c, true);
+ }
+}
+
+/*
+ * Entry point to the journalling code - bio_insert() and btree_invalidate()
+ * pass bcache_journal() a list of keys to be journalled, and then
+ * bcache_journal() hands those same keys off to btree_insert_async()
+ */
+
+void bcache_journal(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+ struct cache_set *c = op->d->c;
+ struct journal_write *w;
+ size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
+
+ if (op->type != BTREE_INSERT ||
+ !CACHE_SYNC(&c->sb))
+ goto out;
+
+ /*
+ * If we're looping because we errored, might already be waiting on
+ * another journal write:
+ */
+ while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
+ closure_sync(cl->parent);
+
+ spin_lock(&c->journal.lock);
+
+ if (journal_full(&c->journal)) {
+ /* XXX: tracepoint */
+ closure_wait(&c->journal.wait, cl);
+
+ journal_reclaim(c);
+ spin_unlock(&c->journal.lock);
+
+ btree_flush_write(c);
+ continue_at(cl, bcache_journal, bcache_wq);
+ }
+
+ w = c->journal.cur;
+ w->need_write = true;
+ b = __set_blocks(w->data, w->data->keys + n, c);
+
+ if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
+ b > c->journal.blocks_free) {
+ /* XXX: If we're inserting so many keys that they won't fit in
+ * an _empty_ journal write, we'll deadlock. For now, handle
+ * this in keylist_realloc() - but something to think about.
+ */
+ BUG_ON(!w->data->keys);
+
+ /* XXX: tracepoint */
+ BUG_ON(!closure_wait(&w->wait, cl));
+
+ closure_flush(&c->journal.io);
+
+ journal_try_write(c);
+ continue_at(cl, bcache_journal, bcache_wq);
+ }
+
+ memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
+ w->data->keys += n;
+
+ op->journal = &fifo_back(&c->journal.pin);
+ atomic_inc(op->journal);
+
+ if (op->flush_journal) {
+ closure_flush(&c->journal.io);
+ closure_wait(&w->wait, cl->parent);
+ }
+
+ journal_try_write(c);
+out:
+ bcache_btree_insert_async(cl);
+}
+
+void bcache_journal_free(struct cache_set *c)
+{
+ free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
+ free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
+ free_fifo(&c->journal.pin);
+}
+
+int bcache_journal_alloc(struct cache_set *c)
+{
+ struct journal *j = &c->journal;
+
+ closure_init_unlocked(&j->io);
+ spin_lock_init(&j->lock);
+
+ c->journal_delay_ms = 100;
+
+ j->w[0].c = c;
+ j->w[1].c = c;
+
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
+ !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/drivers/block/bcache/journal.h b/drivers/block/bcache/journal.h
new file mode 100644
index 0000000..eb829e9
--- /dev/null
+++ b/drivers/block/bcache/journal.h
@@ -0,0 +1,113 @@
+#ifndef _BCACHE_JOURNAL_H
+#define _BCACHE_JOURNAL_H
+
+#define BCACHE_JSET_VERSION_UUIDv1 1
+/* Always latest UUID format */
+#define BCACHE_JSET_VERSION_UUID 1
+#define BCACHE_JSET_VERSION 1
+
+/*
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
+struct jset {
+ uint64_t csum;
+ uint64_t magic;
+ uint64_t seq;
+ uint32_t version;
+ uint32_t keys;
+
+ uint64_t last_seq;
+
+ BKEY_PADDED(uuid_bucket);
+ BKEY_PADDED(btree_root);
+ uint16_t btree_level;
+ uint16_t pad[3];
+
+ uint64_t prio_bucket[MAX_CACHES_PER_SET];
+
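+ /* the journalled keys; the keys field above counts these in u64s */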
+ union {
+ struct bkey start[0];
+ uint64_t d[0];
+ };
+};
+
+/*
+ * Only used for holding the journal entries we read in bcache_journal_read()
+ * during cache registration
+ */
+struct journal_replay {
+ struct list_head list;
+ atomic_t *pin;
+ struct jset j;
+};
+
+/*
+ * We put two of these in struct journal; we use them for writes to the
+ * journal that are being staged or in flight.
+ */
+struct journal_write {
+ struct jset *data;
+#define JSET_BITS 3
+
+ struct cache_set *c;
+ closure_list_t wait;
+ bool need_write;
+};
+
+struct journal {
+ spinlock_t lock;
+ /* used when waiting because the journal was full */
+ closure_list_t wait;
+ struct closure_with_timer io;
+
+ unsigned blocks_free;
+ uint64_t seq;
+ DECLARE_FIFO(atomic_t, pin);
+
+ BKEY_PADDED(key);
+
+ struct journal_write w[2], *cur;
+};
+
+struct journal_device {
+ unsigned cur;
+ unsigned last;
+ uint64_t seq[SB_JOURNAL_BUCKETS];
+
+ struct bio bio;
+ struct bio_vec bv[8];
+};
+
+#define journal_pin_cmp(c, l, r) \
+ (fifo_idx(&(c)->journal.pin, (l)->journal) > \
+ fifo_idx(&(c)->journal.pin, (r)->journal))
+
+#define JOURNAL_PIN 20000
+
+#define journal_full(j) \
+ (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
+
+struct closure;
+struct cache_set;
+struct btree_op;
+
+void bcache_journal(struct closure *);
+void bcache_journal_next(struct journal *);
+void bcache_journal_mark(struct cache_set *, struct list_head *);
+void bcache_journal_meta(struct cache_set *, struct closure *);
+int bcache_journal_read(struct cache_set *, struct list_head *,
+ struct btree_op *);
+int bcache_journal_replay(struct cache_set *, struct list_head *,
+ struct btree_op *);
+
+void bcache_journal_free(struct cache_set *);
+int bcache_journal_alloc(struct cache_set *);
+
+#endif /* _BCACHE_JOURNAL_H */
--
1.7.9.rc2
^ permalink raw reply related [flat|nested] 87+ messages in thread
* [Bcache v13 14/16] bcache: Request, io and allocation code
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (8 preceding siblings ...)
2012-05-10 3:11 ` [Bcache v13 13/16] bcache: Journalling Kent Overstreet
@ 2012-05-10 3:11 ` Kent Overstreet
[not found] ` <9ea33658f2a71b3b9bd2ec10bee959bef146f23c.1336619038.git.koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2012-05-10 3:11 ` [Bcache v13 15/16] bcache: Writeback Kent Overstreet
` (3 subsequent siblings)
13 siblings, 1 reply; 87+ messages in thread
From: Kent Overstreet @ 2012-05-10 3:11 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/block/bcache/alloc.c | 591 ++++++++++++++++
drivers/block/bcache/io.c | 198 ++++++
drivers/block/bcache/request.c | 1470 ++++++++++++++++++++++++++++++++++++++++
drivers/block/bcache/request.h | 58 ++
4 files changed, 2317 insertions(+), 0 deletions(-)
create mode 100644 drivers/block/bcache/alloc.c
create mode 100644 drivers/block/bcache/io.c
create mode 100644 drivers/block/bcache/request.c
create mode 100644 drivers/block/bcache/request.h
diff --git a/drivers/block/bcache/alloc.c b/drivers/block/bcache/alloc.c
new file mode 100644
index 0000000..b55392f
--- /dev/null
+++ b/drivers/block/bcache/alloc.c
@@ -0,0 +1,591 @@
+
+#include "bcache.h"
+#include "btree.h"
+
+#include <linux/random.h>
+
+/*
+ * Allocation in bcache is done in terms of buckets:
+ *
+ * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
+ * btree pointers - they must match for the pointer to be considered valid.
+ *
+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
+ * bucket simply by incrementing its gen.
+ *
+ * The gens (along with the priorities; it's really the gens that are important but
+ * the code is named as if it's the priorities) are written in an arbitrary list
+ * of buckets on disk, with a pointer to them in the journal header.
+ *
+ * When we invalidate a bucket, we have to write its new gen to disk and wait
+ * for that write to complete before we use it - otherwise after a crash we
+ * could have pointers that appeared to be good but pointed to data that had
+ * been overwritten.
+ *
+ * Since the gens and priorities are all stored contiguously on disk, we can
+ * batch this up: We fill up the free_inc list with freshly invalidated buckets,
+ * call prio_write() - and when prio_write() eventually finishes it toggles
+ * c->prio_written and the buckets in free_inc are now ready to be used. When
+ * the free_inc list empties, we toggle c->prio_written and the cycle repeats.
+ *
+ * free_inc isn't the only freelist - if it were, we'd often have to sleep while
+ * priorities and gens were being written before we could allocate. c->free is a
+ * smaller freelist, and buckets on that list are always ready to be used.
+ *
+ * If we've got discards enabled, that happens when a bucket moves from the
+ * free_inc list to the free list.
+ *
+ * There is another freelist, because sometimes we have buckets that we know
+ * have nothing pointing into them - these we can reuse without waiting for
+ * priorities to be rewritten. These come from freed btree nodes and buckets
+ * that garbage collection discovered no longer had valid keys pointing into
+ * them (because they were overwritten). That's the unused list - buckets on the
+ * unused list move to the free list, optionally being discarded in the process.
+ *
+ * It's also important to ensure that gens don't wrap around - with respect to
+ * either the oldest gen in the btree or the gen on disk. This is quite
+ * difficult to do in practice, but we explicitly guard against it anyway - if
+ * a bucket is in danger of wrapping around we simply skip invalidating it that
+ * time around, and we garbage collect or rewrite the priorities sooner than we
+ * would have otherwise.
+ *
+ * pop_bucket() allocates a single bucket from a specific cache.
+ *
+ * pop_bucket_set() allocates one or more buckets from different caches out of a
+ * cache set.
+ *
+ * free_some_buckets() drives all the processes described above. It's called
+ * from pop_bucket() and a few other places that need to make sure free buckets
+ * are ready.
+ *
+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be
+ * invalidated, and then invalidate them and stick them on the free_inc list -
+ * in either lru or fifo order.
+ */
+
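+/*
+ * In short, a bucket's path back to reuse is: invalidate_buckets() puts it on
+ * free_inc, prio_write() persists its new gen/prio, free_some_buckets() moves
+ * it onto the free list (issuing a discard first if discards are enabled), and
+ * pop_bucket() finally hands it out.
+ */
+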
+static void do_discard(struct cache *);
+
+/* Bucket heap / gen */
+
+uint8_t inc_gen(struct cache *c, struct bucket *b)
+{
+ uint8_t ret = ++b->gen;
+
+ c->set->need_gc = max(c->set->need_gc, bucket_gc_gen(b));
+ BUG_ON(c->set->need_gc > 97);
+
+ if (CACHE_SYNC(&c->set->sb)) {
+ c->need_save_prio = max(c->need_save_prio, bucket_disk_gen(b));
+ BUG_ON(c->need_save_prio > 96);
+ }
+
+ return ret;
+}
+
+void rescale_priorities(struct cache_set *c, int sectors)
+{
+ struct cache *ca;
+ struct bucket *b;
+ unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
+ int r;
+
+ atomic_sub(sectors, &c->rescale);
+
+ do {
+ r = atomic_read(&c->rescale);
+
+ if (r >= 0)
+ return;
+ } while (atomic_cmpxchg(&c->rescale, r, r + next) != r);
+
+ mutex_lock(&c->bucket_lock);
+
+ for_each_cache(ca, c)
+ for_each_bucket(b, ca)
+ if (b->prio &&
+ b->prio != btree_prio &&
+ !atomic_read(&b->pin)) {
+ b->prio--;
+ c->min_prio = min(c->min_prio, b->prio);
+ }
+
+ mutex_unlock(&c->bucket_lock);
+}
+
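+/*
+ * Return a bucket that's ready for reuse: one off the unused list, or one off
+ * free_inc whose new priorities/gens have been written out; -1 if nothing is
+ * available yet.
+ */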
+static long pop_freed(struct cache *c)
+{
+ long r;
+
+ if ((!CACHE_SYNC(&c->set->sb) ||
+ !atomic_read(&c->set->prio_blocked)) &&
+ fifo_pop(&c->unused, r))
+ return r;
+
+ if ((!CACHE_SYNC(&c->set->sb) ||
+ atomic_read(&c->prio_written) > 0) &&
+ fifo_pop(&c->free_inc, r))
+ return r;
+
+ return -1;
+}
+
+/* Discard/TRIM */
+
+struct discard {
+ struct list_head list;
+ struct work_struct work;
+ struct cache *c;
+ long bucket;
+
+ struct bio bio;
+ struct bio_vec bv;
+};
+
+static void discard_finish(struct work_struct *w)
+{
+ struct discard *d = container_of(w, struct discard, work);
+ struct cache *c = d->c;
+ char buf[BDEVNAME_SIZE];
+ bool run = false;
+
+ if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
+ printk(KERN_NOTICE "bcache: discard error on %s, disabling\n",
+ bdevname(c->bdev, buf));
+ d->c->discard = 0;
+ }
+
+ mutex_lock(&c->set->bucket_lock);
+ if (fifo_empty(&c->free) ||
+ fifo_used(&c->free) == 8)
+ run = true;
+
+ fifo_push(&c->free, d->bucket);
+
+ list_add(&d->list, &c->discards);
+
+ do_discard(c);
+ mutex_unlock(&c->set->bucket_lock);
+
+ if (run)
+ closure_wake_up(&c->set->bucket_wait);
+
+ closure_put(&c->set->cl);
+}
+
+static void discard_endio(struct bio *bio, int error)
+{
+ struct discard *d = container_of(bio, struct discard, bio);
+
+ PREPARE_WORK(&d->work, discard_finish);
+ schedule_work(&d->work);
+}
+
+static void discard_work(struct work_struct *w)
+{
+ struct discard *d = container_of(w, struct discard, work);
+ submit_bio(0, &d->bio);
+}
+
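+/*
+ * Issue discards for freed buckets using the preallocated struct discards;
+ * when each discard completes, discard_finish() moves its bucket onto the
+ * free list.
+ */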
+static void do_discard(struct cache *c)
+{
+ struct request_queue *q = bdev_get_queue(c->bdev);
+ int s = q->limits.logical_block_size;
+
+ while (c->discard &&
+ !atomic_read(&c->set->closing) &&
+ !list_empty(&c->discards) &&
+ fifo_free(&c->free) >= 8) {
+ struct discard *d = list_first_entry(&c->discards,
+ struct discard, list);
+
+ d->bucket = pop_freed(c);
+ if (d->bucket == -1)
+ break;
+
+ list_del(&d->list);
+ closure_get(&c->set->cl);
+
+ bio_init(&d->bio);
+ memset(&d->bv, 0, sizeof(struct bio_vec));
+ bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ d->bio.bi_sector = bucket_to_sector(c->set, d->bucket);
+ d->bio.bi_bdev = c->bdev;
+ d->bio.bi_rw = REQ_WRITE|(1 << BIO_RW_DISCARD);
+ d->bio.bi_max_vecs = 1;
+ d->bio.bi_io_vec = d->bio.bi_inline_vecs;
+ d->bio.bi_end_io = discard_endio;
+
+ if (bio_add_pc_page(q, &d->bio, c->discard_page, s, 0) < s) {
+ printk(KERN_DEBUG "bcache: bio_add_pc_page failed\n");
+ c->discard = 0;
+ fifo_push(&c->free, d->bucket);
+ list_add(&d->list, &c->discards);
+ break;
+ }
+
+ d->bio.bi_size = bucket_bytes(c);
+
+ schedule_work(&d->work);
+ }
+}
+
+void free_discards(struct cache *ca)
+{
+ struct discard *d;
+
+ while (!list_empty(&ca->discards)) {
+ d = list_first_entry(&ca->discards, struct discard, list);
+ cancel_work_sync(&d->work);
+ list_del(&d->list);
+ kfree(d);
+ }
+}
+
+int alloc_discards(struct cache *ca)
+{
+ for (int i = 0; i < 8; i++) {
+ struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->c = ca;
+ INIT_WORK(&d->work, discard_work);
+ list_add(&d->list, &ca->discards);
+ }
+
+ return 0;
+}
+
+/* Allocation */
+
+bool bucket_add_unused(struct cache *c, struct bucket *b)
+{
+ if (c->prio_alloc == prio_buckets(c) &&
+ CACHE_REPLACEMENT(&c->sb) == CACHE_REPLACEMENT_FIFO)
+ return false;
+
+ b->prio = 0;
+
+ if (bucket_gc_gen(b) < 96U &&
+ bucket_disk_gen(b) < 64U &&
+ fifo_push(&c->unused, b - c->buckets)) {
+ atomic_inc(&b->pin);
+ return true;
+ }
+
+ return false;
+}
+
+static bool can_invalidate_bucket(struct cache *c, struct bucket *b)
+{
+ return b->mark >= 0 &&
+ !atomic_read(&b->pin) &&
+ bucket_gc_gen(b) < 96U &&
+ bucket_disk_gen(b) < 64U;
+}
+
+static void invalidate_one_bucket(struct cache *c, struct bucket *b)
+{
+ inc_gen(c, b);
+ b->prio = initial_prio;
+ atomic_inc(&b->pin);
+ fifo_push(&c->free_inc, b - c->buckets);
+}
+
+static void invalidate_buckets_lru(struct cache *c)
+{
+ unsigned bucket_prio(struct bucket *b)
+ {
+ return ((unsigned) (b->prio - c->set->min_prio)) * b->mark;
+ }
+
+ bool bucket_max_cmp(struct bucket *l, struct bucket *r)
+ {
+ return bucket_prio(l) < bucket_prio(r);
+ }
+
+ bool bucket_min_cmp(struct bucket *l, struct bucket *r)
+ {
+ return bucket_prio(l) > bucket_prio(r);
+ }
+
+ struct bucket *b;
+
+ c->heap.used = 0;
+
+ for_each_bucket(b, c) {
+ if (!can_invalidate_bucket(c, b))
+ continue;
+
+ if (!b->mark) {
+ if (!bucket_add_unused(c, b))
+ return;
+ } else {
+ if (!heap_full(&c->heap))
+ heap_add(&c->heap, b, bucket_max_cmp);
+ else if (bucket_max_cmp(b, heap_peek(&c->heap))) {
+ c->heap.data[0] = b;
+ heap_sift(&c->heap, 0, bucket_max_cmp);
+ }
+ }
+ }
+
+ if (c->heap.used * 2 < c->heap.size)
+ bcache_queue_gc(c->set);
+
+ for (ssize_t i = c->heap.used / 2 - 1; i >= 0; --i)
+ heap_sift(&c->heap, i, bucket_min_cmp);
+
+ while (!fifo_full(&c->free_inc)) {
+ if (!heap_pop(&c->heap, b, bucket_min_cmp)) {
+ /* We don't want to be calling invalidate_buckets()
+ * multiple times when it can't do anything
+ */
+ c->invalidate_needs_gc = 1;
+ bcache_queue_gc(c->set);
+ return;
+ }
+
+ invalidate_one_bucket(c, b);
+ }
+}
+
+static void invalidate_buckets_fifo(struct cache *c)
+{
+ struct bucket *b;
+ size_t checked = 0;
+
+ while (!fifo_full(&c->free_inc)) {
+ if (c->fifo_last_bucket < c->sb.first_bucket ||
+ c->fifo_last_bucket >= c->sb.nbuckets)
+ c->fifo_last_bucket = c->sb.first_bucket;
+
+ b = c->buckets + c->fifo_last_bucket++;
+
+ if (can_invalidate_bucket(c, b))
+ invalidate_one_bucket(c, b);
+
+ if (++checked >= c->sb.nbuckets) {
+ c->invalidate_needs_gc = 1;
+ bcache_queue_gc(c->set);
+ return;
+ }
+ }
+}
+
+static void invalidate_buckets_random(struct cache *c)
+{
+ struct bucket *b;
+ size_t checked = 0;
+
+ while (!fifo_full(&c->free_inc)) {
+ size_t n;
+ get_random_bytes(&n, sizeof(n));
+
+ n %= (size_t) (c->sb.nbuckets - c->sb.first_bucket);
+ n += c->sb.first_bucket;
+
+ b = c->buckets + n;
+
+ if (can_invalidate_bucket(c, b))
+ invalidate_one_bucket(c, b);
+
+ if (++checked >= c->sb.nbuckets / 2) {
+ c->invalidate_needs_gc = 1;
+ bcache_queue_gc(c->set);
+ return;
+ }
+ }
+}
+
+static void invalidate_buckets(struct cache *c)
+{
+ /* free_some_buckets() may just need to write priorities to keep gens
+ * from wrapping around
+ */
+ if (!c->set->gc_mark_valid ||
+ c->invalidate_needs_gc)
+ return;
+
+ switch (CACHE_REPLACEMENT(&c->sb)) {
+ case CACHE_REPLACEMENT_LRU:
+ invalidate_buckets_lru(c);
+ break;
+ case CACHE_REPLACEMENT_FIFO:
+ invalidate_buckets_fifo(c);
+ break;
+ case CACHE_REPLACEMENT_RANDOM:
+ invalidate_buckets_random(c);
+ break;
+ }
+}
+
+bool can_save_prios(struct cache *c)
+{
+ return ((c->need_save_prio > 64 ||
+ (c->set->gc_mark_valid &&
+ !c->invalidate_needs_gc)) &&
+ !atomic_read(&c->prio_written) &&
+ !atomic_read(&c->set->prio_blocked));
+}
+
+void free_some_buckets(struct cache *c)
+{
+ long r;
+
+ /*
+ * XXX: do_discard(), prio_write() take refcounts on the cache set. How
+ * do we know that refcount is nonzero?
+ */
+
+ do_discard(c);
+
+ while (!fifo_full(&c->free) &&
+ (fifo_used(&c->free) <= 8 ||
+ !c->discard) &&
+ (r = pop_freed(c)) != -1)
+ fifo_push(&c->free, r);
+
+ while (c->prio_alloc != prio_buckets(c) &&
+ fifo_pop(&c->free, r)) {
+ struct bucket *b = c->buckets + r;
+ c->prio_next[c->prio_alloc++] = r;
+
+ b->mark = GC_MARK_BTREE;
+ atomic_dec_bug(&b->pin);
+ }
+
+ if (!CACHE_SYNC(&c->set->sb)) {
+ if (fifo_empty(&c->free_inc))
+ invalidate_buckets(c);
+ return;
+ }
+
+ /* XXX: tracepoint for when c->need_save_prio > 64 */
+
+ if (c->need_save_prio <= 64 &&
+ fifo_used(&c->unused) > c->unused.size / 2)
+ return;
+
+ if (atomic_read(&c->prio_written) > 0 &&
+ (fifo_empty(&c->free_inc) ||
+ c->need_save_prio > 64))
+ atomic_set(&c->prio_written, 0);
+
+ if (!can_save_prios(c))
+ return;
+
+ invalidate_buckets(c);
+
+ if (!fifo_empty(&c->free_inc) ||
+ c->need_save_prio > 64)
+ prio_write(c);
+}
+
+static long pop_bucket(struct cache *c, uint16_t priority, struct closure *cl)
+{
+ long r = -1;
+again:
+ free_some_buckets(c);
+
+ if ((priority == btree_prio ||
+ fifo_used(&c->free) > 8) &&
+ fifo_pop(&c->free, r)) {
+ struct bucket *b = c->buckets + r;
+#ifdef CONFIG_BCACHE_EDEBUG
+ long i;
+ for (unsigned j = 0; j < prio_buckets(c); j++)
+ BUG_ON(c->prio_buckets[j] == (uint64_t) r);
+ for (unsigned j = 0; j < c->prio_alloc; j++)
+ BUG_ON(c->prio_next[j] == (uint64_t) r);
+
+ fifo_for_each(i, &c->free)
+ BUG_ON(i == r);
+ fifo_for_each(i, &c->free_inc)
+ BUG_ON(i == r);
+ fifo_for_each(i, &c->unused)
+ BUG_ON(i == r);
+#endif
+ BUG_ON(atomic_read(&b->pin) != 1);
+
+ b->prio = priority;
+ b->mark = priority == btree_prio
+ ? GC_MARK_BTREE
+ : c->sb.bucket_size;
+ return r;
+ }
+
+ pr_debug("no free buckets, prio_written %i, blocked %i, "
+ "free %zu, free_inc %zu, unused %zu",
+ atomic_read(&c->prio_written),
+ atomic_read(&c->set->prio_blocked), fifo_used(&c->free),
+ fifo_used(&c->free_inc), fifo_used(&c->unused));
+
+ if (cl) {
+ if (closure_blocking(cl))
+ mutex_unlock(&c->set->bucket_lock);
+
+ closure_wait_event(&c->set->bucket_wait, cl,
+ atomic_read(&c->prio_written) > 0 ||
+ can_save_prios(c));
+
+ if (closure_blocking(cl)) {
+ mutex_lock(&c->set->bucket_lock);
+ goto again;
+ }
+ }
+
+ return -1;
+}
+
+void unpop_bucket(struct cache_set *c, struct bkey *k)
+{
+ for (unsigned i = 0; i < KEY_PTRS(k); i++) {
+ struct bucket *b = PTR_BUCKET(c, k, i);
+
+ b->mark = 0;
+ bucket_add_unused(PTR_CACHE(c, k, i), b);
+ }
+}
+
+int __pop_bucket_set(struct cache_set *c, uint16_t prio,
+ struct bkey *k, int n, struct closure *cl)
+{
+ lockdep_assert_held(&c->bucket_lock);
+ BUG_ON(!n || n > c->caches_loaded || n > 8);
+
+ k->header = KEY_HEADER(0, 0);
+
+ /* sort by free space/prio of oldest data in caches */
+
+ for (int i = 0; i < n; i++) {
+ struct cache *ca = c->cache_by_alloc[i];
+ long b = pop_bucket(ca, prio, cl);
+
+ if (b == -1)
+ goto err;
+
+ k->ptr[i] = PTR(ca->buckets[b].gen,
+ bucket_to_sector(c, b),
+ ca->sb.nr_this_dev);
+
+ SET_KEY_PTRS(k, i + 1);
+ }
+
+ return 0;
+err:
+ unpop_bucket(c, k);
+ __bkey_put(c, k);
+ return -1;
+}
+
+int pop_bucket_set(struct cache_set *c, uint16_t prio,
+ struct bkey *k, int n, struct closure *cl)
+{
+ int ret;
+ mutex_lock(&c->bucket_lock);
+ ret = __pop_bucket_set(c, prio, k, n, cl);
+ mutex_unlock(&c->bucket_lock);
+ return ret;
+}
diff --git a/drivers/block/bcache/io.c b/drivers/block/bcache/io.c
new file mode 100644
index 0000000..736d996
--- /dev/null
+++ b/drivers/block/bcache/io.c
@@ -0,0 +1,198 @@
+
+#include "bcache.h"
+#include "bset.h"
+#include "debug.h"
+
+/* Bios with headers */
+
+void bbio_free(struct bio *bio, struct cache_set *c)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ mempool_free(b, c->bio_meta);
+}
+
+struct bio *bbio_alloc(struct cache_set *c)
+{
+ struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
+ struct bio *bio = &b->bio;
+
+ bio_init(bio);
+ bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
+ bio->bi_max_vecs = bucket_pages(c);
+ bio->bi_io_vec = bio->bi_inline_vecs;
+
+ return bio;
+}
+
+static void bbio_destructor(struct bio *bio)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ kfree(b);
+}
+
+struct bio *bbio_kmalloc(gfp_t gfp, int vecs)
+{
+ struct bio *bio;
+ struct bbio *b;
+
+ b = kmalloc(sizeof(struct bbio) + sizeof(struct bio_vec) * vecs, gfp);
+ if (!b)
+ return NULL;
+
+ bio = &b->bio;
+ bio_init(bio);
+ bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
+ bio->bi_max_vecs = vecs;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bio->bi_destructor = bbio_destructor;
+
+ return bio;
+}
+
+struct bio *__bio_split_get(struct bio *bio, int len, struct bio_set *bs)
+{
+ struct bio *ret = bio_split_front(bio, len, bbio_kmalloc, GFP_NOIO, bs);
+
+ if (ret && ret != bio) {
+ closure_get(ret->bi_private);
+ ret->bi_rw &= ~REQ_UNPLUG;
+ }
+
+ return ret;
+}
+
+void __submit_bbio(struct bio *bio, struct cache_set *c)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+
+ bio->bi_sector = PTR_OFFSET(&b->key, 0);
+ bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev;
+
+ b->submit_time_us = local_clock_us();
+ generic_make_request(bio);
+}
+
+void submit_bbio(struct bio *bio, struct cache_set *c,
+ struct bkey *k, unsigned ptr)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ bkey_copy_single_ptr(&b->key, k, ptr);
+ __submit_bbio(bio, c);
+}
+
+int submit_bbio_split(struct bio *bio, struct cache_set *c,
+ struct bkey *k, unsigned ptr)
+{
+ struct closure *cl = bio->bi_private;
+ struct bbio *b;
+ struct bio *n;
+ unsigned sectors_done = 0;
+
+ closure_get(cl);
+
+ bio->bi_sector = PTR_OFFSET(k, ptr);
+ bio->bi_bdev = PTR_CACHE(c, k, ptr)->bdev;
+
+ do {
+ n = bio_split_get(bio, bio_max_sectors(bio), c);
+ if (!n) {
+ closure_put(cl);
+ return -ENOMEM;
+ }
+
+ b = container_of(n, struct bbio, bio);
+
+ bkey_copy_single_ptr(&b->key, k, ptr);
+ SET_KEY_SIZE(&b->key, KEY_SIZE(k) - sectors_done);
+ SET_PTR_OFFSET(&b->key, 0, PTR_OFFSET(k, ptr) + sectors_done);
+
+ b->submit_time_us = local_clock_us();
+ generic_make_request(n);
+ } while (n != bio);
+
+ return 0;
+}
+
+/* IO errors */
+
+void count_io_errors(struct cache *c, int error, const char *m)
+{
+ /*
+ * The halflife of an error is:
+ * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
+ */
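+ /*
+ * i.e. each time io_count passes error_decay we scale the error count
+ * by 127/128, and (127/128)^88 ~= 0.5.
+ */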
+
+ if (c->set->error_decay) {
+ unsigned count = atomic_inc_return(&c->io_count);
+
+ while (count > c->set->error_decay) {
+ unsigned errors;
+ unsigned old = count;
+ unsigned new = count - c->set->error_decay;
+
+ /*
+ * First we subtract refresh from count; each time we
+ * successfully do so, we rescale the errors once:
+ */
+
+ count = atomic_cmpxchg(&c->io_count, old, new);
+
+ if (count == old) {
+ count = new;
+
+ errors = atomic_read(&c->io_errors);
+ do {
+ old = errors;
+ new = ((uint64_t) errors * 127) / 128;
+ errors = atomic_cmpxchg(&c->io_errors,
+ old, new);
+ } while (old != errors);
+ }
+ }
+ }
+
+ if (error) {
+ char buf[BDEVNAME_SIZE];
+ unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
+ &c->io_errors);
+ errors >>= IO_ERROR_SHIFT;
+
+ if (errors < c->set->error_limit)
+ err_printk("IO error on %s %s, recovering\n",
+ bdevname(c->bdev, buf), m);
+ else
+ cache_set_error(c->set, "too many IO errors", m);
+ }
+}
+
+void bcache_endio(struct cache_set *c, struct bio *bio,
+ int error, const char *m)
+{
+ struct closure *cl = bio->bi_private;
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ struct cache *ca = PTR_CACHE(c, &b->key, 0);
+
+ unsigned threshold = bio->bi_rw & REQ_WRITE
+ ? c->congested_write_threshold_us
+ : c->congested_read_threshold_us;
+
+ if (threshold) {
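+ /*
+ * c->congested is driven negative by roughly the IO's latency in ms
+ * whenever an IO exceeds the threshold, and creeps back toward zero
+ * one step per fast IO.
+ */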
+ unsigned t = local_clock_us();
+
+ int us = t - b->submit_time_us;
+ int congested = atomic_read(&c->congested);
+
+ if (us > (int) threshold) {
+ int ms = us / 1024;
+ c->congested_last_us = t;
+
+ ms = min(ms, CONGESTED_MAX + congested);
+ atomic_sub(ms, &c->congested);
+ } else if (congested < 0)
+ atomic_inc(&c->congested);
+ }
+
+ count_io_errors(ca, error, m);
+ bio_put(bio);
+ closure_put(cl);
+}
diff --git a/drivers/block/bcache/request.c b/drivers/block/bcache/request.c
new file mode 100644
index 0000000..691fe8d
--- /dev/null
+++ b/drivers/block/bcache/request.c
@@ -0,0 +1,1470 @@
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <linux/cgroup.h>
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/random.h>
+#include "blk-cgroup.h"
+
+#include <trace/events/bcache.h>
+
+#define CUTOFF_CACHE_ADD 95
+#define CUTOFF_CACHE_READA 90
+#define CUTOFF_WRITEBACK 50
+#define CUTOFF_WRITEBACK_SYNC 75
+
+struct bio_passthrough {
+ struct closure cl;
+ struct cached_dev *d;
+ struct bio *bio;
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+};
+
+struct kmem_cache *passthrough_cache;
+struct kmem_cache *search_cache;
+
+static void check_should_skip(struct cached_dev *, struct search *);
+
+static const char *search_type(struct search *s)
+{
+ return s->writeback ? "writeback"
+ : s->write ? "write" : "read";
+}
+
+/* Cgroup interface */
+
+#ifdef CONFIG_CGROUP_BCACHE
+static struct bcache_cgroup bcache_default_cgroup = { .cache_mode = -1 };
+
+struct bcache_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
+{
+ struct cgroup_subsys_state *css;
+ return cgroup &&
+ (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
+ ? container_of(css, struct bcache_cgroup, css)
+ : &bcache_default_cgroup;
+}
+
+struct bcache_cgroup *bio_to_cgroup(struct bio *bio)
+{
+ return cgroup_to_bcache(get_bio_cgroup(bio));
+}
+
+static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ char __user *buf, size_t nbytes, loff_t *ppos)
+{
+ char tmp[1024];
+ int len = sprint_string_list(tmp, bcache_cache_modes,
+ cgroup_to_bcache(cgrp)->cache_mode + 1);
+
+ if (len < 0)
+ return len;
+
+ return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
+
+static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buf)
+{
+ int v = read_string_list(buf, bcache_cache_modes);
+ if (v < 0)
+ return v;
+
+ cgroup_to_bcache(cgrp)->cache_mode = v - 1;
+ return 0;
+}
+
+static u64 bcache_verify_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ return cgroup_to_bcache(cgrp)->verify;
+}
+
+static int bcache_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+ cgroup_to_bcache(cgrp)->verify = val;
+ return 0;
+}
+
+static u64 bcache_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct bcache_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+ return atomic_read(&bcachecg->stats.cache_hits);
+}
+
+static u64 bcache_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct bcache_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+ return atomic_read(&bcachecg->stats.cache_misses);
+}
+
+static u64 bcache_cache_bypass_hits_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ struct bcache_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+ return atomic_read(&bcachecg->stats.cache_bypass_hits);
+}
+
+static u64 bcache_cache_bypass_misses_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ struct bcache_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+ return atomic_read(&bcachecg->stats.cache_bypass_misses);
+}
+
+struct cftype bcache_files[] = {
+ {
+ .name = "cache_mode",
+ .read = cache_mode_read,
+ .write_string = cache_mode_write,
+ },
+ {
+ .name = "verify",
+ .read_u64 = bcache_verify_read,
+ .write_u64 = bcache_verify_write,
+ },
+ {
+ .name = "cache_hits",
+ .read_u64 = bcache_cache_hits_read,
+ },
+ {
+ .name = "cache_misses",
+ .read_u64 = bcache_cache_misses_read,
+ },
+ {
+ .name = "cache_bypass_hits",
+ .read_u64 = bcache_cache_bypass_hits_read,
+ },
+ {
+ .name = "cache_bypass_misses",
+ .read_u64 = bcache_cache_bypass_misses_read,
+ },
+};
+
+static void init_bcache_cgroup(struct bcache_cgroup *cg)
+{
+ cg->cache_mode = -1;
+}
+
+static struct cgroup_subsys_state *
+bcachecg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+ struct bcache_cgroup *cg;
+
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+ init_bcache_cgroup(cg);
+ return &cg->css;
+}
+
+static void bcachecg_destroy(struct cgroup_subsys *subsys,
+ struct cgroup *cgroup)
+{
+ struct bcache_cgroup *cg = cgroup_to_bcache(cgroup);
+ free_css_id(&bcache_subsys, &cg->css);
+ kfree(cg);
+}
+
+static int bcachecg_populate(struct cgroup_subsys *subsys,
+ struct cgroup *cgroup)
+{
+ return cgroup_add_files(cgroup, subsys, bcache_files,
+ ARRAY_SIZE(bcache_files));
+}
+
+struct cgroup_subsys bcache_subsys = {
+ .create = bcachecg_create,
+ .destroy = bcachecg_destroy,
+ .populate = bcachecg_populate,
+ .subsys_id = bcache_subsys_id,
+ .name = "bcache",
+ .module = THIS_MODULE,
+};
+EXPORT_SYMBOL_GPL(bcache_subsys);
+#endif
+
+static unsigned cache_mode(struct cached_dev *d, struct bio *bio)
+{
+#ifdef CONFIG_CGROUP_BCACHE
+ int r = bio_to_cgroup(bio)->cache_mode;
+ if (r >= 0)
+ return r;
+#endif
+ return BDEV_CACHE_MODE(&d->sb);
+}
+
+static bool verify(struct cached_dev *d, struct bio *bio)
+{
+#ifdef CONFIG_CGROUP_BCACHE
+ if (bio_to_cgroup(bio)->verify)
+ return true;
+#endif
+ return d->verify;
+}
+
+static void bio_csum(struct bio *bio, struct bkey *k)
+{
+ struct bio_vec *bv;
+ uint64_t csum = 0;
+ int i;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *d = kmap(bv->bv_page) + bv->bv_offset;
+ csum = crc64_update(csum, d, bv->bv_len);
+ kunmap(bv->bv_page);
+ }
+
+ k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
+}
+
+/* Insert data into cache */
+
+static void bio_invalidate(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+ struct search *s = container_of(op, struct search, op);
+ struct bio *bio = s->cache_bio;
+
+ pr_debug("invalidating %i sectors from %llu",
+ bio_sectors(bio), (uint64_t) bio->bi_sector);
+
+ while (bio_sectors(bio)) {
+ unsigned len = min(bio_sectors(bio), 1U << 14);
+ if (keylist_realloc(&s->op.keys, 0, s->op.d->c))
+ goto out;
+
+ bio->bi_sector += len;
+ bio->bi_size -= len << 9;
+
+ keylist_add(&s->op.keys,
+ &KEY(s->op.d->id, bio->bi_sector, len));
+ }
+
+ s->bio_insert_done = true;
+out:
+ continue_at(cl, bcache_journal, bcache_wq);
+}
+
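+/*
+ * An open_bucket is a bucket we're currently writing into sequentially; we keep
+ * a small list of them so that writes from the same task, or contiguous with a
+ * previous write, land in the same bucket.
+ */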
+struct open_bucket {
+ struct list_head list;
+ struct task_struct *last;
+ unsigned sectors_free;
+ BKEY_PADDED(key);
+};
+
+void bcache_open_buckets_free(struct cache_set *c)
+{
+ struct open_bucket *b;
+
+ while (!list_empty(&c->data_buckets)) {
+ b = list_first_entry(&c->data_buckets,
+ struct open_bucket, list);
+ list_del(&b->list);
+ kfree(b);
+ }
+}
+
+int bcache_open_buckets_alloc(struct cache_set *c)
+{
+ spin_lock_init(&c->data_bucket_lock);
+
+ for (int i = 0; i < 6; i++) {
+ struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ list_add(&b->list, &c->data_buckets);
+ }
+
+ return 0;
+}
+
+static void put_data_bucket(struct open_bucket *b, struct cache_set *c,
+ struct bkey *k, struct bio *bio)
+{
+ unsigned split = min(bio_sectors(bio), b->sectors_free);
+
+ for (unsigned i = 0; i < KEY_PTRS(&b->key); i++)
+ split = min(split, __bio_max_sectors(bio,
+ PTR_CACHE(c, &b->key, i)->bdev,
+ PTR_OFFSET(&b->key, i)));
+
+ b->key.key += split;
+
+ bkey_copy(k, &b->key);
+ SET_KEY_SIZE(k, split);
+
+ b->sectors_free -= split;
+
+ /* If we're closing this open bucket, get_data_bucket()'s refcount now
+ * belongs to the key that's being inserted
+ */
+ if (b->sectors_free < c->sb.block_size)
+ b->sectors_free = 0;
+ else
+ for (unsigned i = 0; i < KEY_PTRS(&b->key); i++)
+ atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
+
+ for (unsigned i = 0; i < KEY_PTRS(&b->key); i++) {
+ atomic_long_add(split,
+ &PTR_CACHE(c, &b->key, i)->sectors_written);
+
+ SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + split);
+ }
+
+ spin_unlock(&c->data_bucket_lock);
+}
+
+static struct open_bucket *get_data_bucket(struct bkey *search,
+ struct search *s)
+{
+ struct closure cl, *w = NULL;
+ struct cache_set *c = s->op.d->c;
+ struct open_bucket *l, *ret, *ret_task;
+
+ BKEY_PADDED(key) alloc;
+ struct bkey *k = NULL;
+
+ if (s->writeback) {
+ closure_init_stack(&cl);
+ w = &cl;
+ }
+again:
+ ret = ret_task = NULL;
+
+ spin_lock(&c->data_bucket_lock);
+ list_for_each_entry_reverse(l, &c->data_buckets, list)
+ if (!bkey_cmp(&l->key, search)) {
+ ret = l;
+ goto found;
+ } else if (l->last == s->task)
+ ret_task = l;
+
+ ret = ret_task ?: list_first_entry(&c->data_buckets,
+ struct open_bucket, list);
+found:
+ if (!ret->sectors_free) {
+ if (!k) {
+ spin_unlock(&c->data_bucket_lock);
+ k = &alloc.key;
+
+ if (pop_bucket_set(c, initial_prio, k, 1, w))
+ return NULL;
+
+ goto again;
+ }
+
+ bkey_copy(&ret->key, k);
+ k = NULL;
+
+ ret->sectors_free = c->sb.bucket_size;
+ } else
+ for (unsigned i = 0; i < KEY_PTRS(&ret->key); i++)
+ EBUG_ON(ptr_stale(c, &ret->key, i));
+
+ if (k)
+ __bkey_put(c, k);
+
+ if (w)
+ for (unsigned i = 0; i < KEY_PTRS(&ret->key); i++)
+ PTR_BUCKET(c, &ret->key, i)->mark = GC_MARK_DIRTY;
+
+ ret->last = s->task;
+ bkey_copy_key(&ret->key, search);
+
+ list_move_tail(&ret->list, &c->data_buckets);
+ return ret;
+}
+
+static void bio_insert_error(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+
+ /*
+ * Our data write just errored, which means we've got a bunch of keys to
+ * insert that point to data that wasn't successfully written.
+ *
+ * We don't have to insert those keys but we still have to invalidate
+ * that region of the cache - so, if we just strip off all the pointers
+ * from the keys we'll accomplish just that.
+ */
+
+ struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
+
+ while (src != op->keys.top) {
+ struct bkey *n = next(src);
+
+ SET_KEY_PTRS(src, 0);
+ bkey_copy(dst, src);
+
+ dst = next(dst);
+ src = n;
+ }
+
+ op->keys.top = dst;
+
+ bcache_journal(cl);
+}
+
+static void bio_insert_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+ struct search *s = container_of(op, struct search, op);
+
+ if (error) {
+ /* TODO: We could try to recover from this. */
+ if (s->writeback)
+ s->error = error;
+ else if (s->write)
+ set_closure_fn(cl, bio_insert_error, bcache_wq);
+ else
+ set_closure_fn(cl, NULL, NULL);
+ }
+
+ bcache_endio(op->d->c, bio, error, "writing data to cache");
+}
+
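+/*
+ * Carve the data bio into chunks that fit in open buckets, generate a key for
+ * each chunk and submit the write; the accumulated keys are then handed to the
+ * journalling code.
+ */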
+static void bio_insert_loop(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+ struct search *s = container_of(op, struct search, op);
+ struct bio *bio = s->cache_bio, *n;
+ unsigned sectors = bio_sectors(bio);
+
+ if (s->skip)
+ return bio_invalidate(cl);
+
+ if (atomic_sub_return(bio_sectors(bio), &op->d->c->sectors_to_gc) < 0) {
+ set_gc_sectors(op->d->c);
+ bcache_queue_gc(op->d->c);
+ }
+
+ do {
+ struct open_bucket *b;
+ struct bkey *k;
+
+ /* 1 for the device pointer and 1 for the chksum */
+ if (keylist_realloc(&op->keys,
+ 1 + (op->d->data_csum ? 1 : 0),
+ op->d->c))
+ continue_at(cl, bcache_journal, bcache_wq);
+
+ k = op->keys.top;
+
+ b = get_data_bucket(&KEY(op->d->id, bio->bi_sector, 0), s);
+ if (!b)
+ goto err;
+
+ put_data_bucket(b, op->d->c, k, bio);
+
+ n = bio_split_get(bio, KEY_SIZE(k), op->d);
+ if (!n) {
+ __bkey_put(op->d->c, k);
+ continue_at(cl, bio_insert_loop, bcache_wq);
+ }
+
+ if (s->writeback)
+ SET_KEY_DIRTY(k, true);
+
+ SET_KEY_CSUM(k, op->d->data_csum);
+ if (op->d->data_csum)
+ bio_csum(n, k);
+
+ pr_debug("%s", pkey(k));
+ keylist_push(&op->keys);
+
+ n->bi_rw |= REQ_WRITE;
+
+ if (n == bio)
+ closure_get(cl);
+
+ trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
+ submit_bbio(n, op->d->c, k, 0);
+ } while (n != bio);
+
+ s->bio_insert_done = true;
+ continue_at(cl, bcache_journal, bcache_wq);
+err:
+ /* IO never happened, so bbio key isn't set up, so we can't call
+ * bio_endio()
+ */
+ bio_put(bio);
+
+ pr_debug("error for %s, %i/%i sectors done, bi_sector %llu",
+ search_type(s), sectors - bio_sectors(bio), sectors,
+ (uint64_t) bio->bi_sector);
+
+ if (s->writeback) {
+ /* This is dead code now, since we handle all memory allocation
+ * failures and block if we don't have free buckets
+ */
+ BUG();
+ /* Lookup in in_writeback rb tree, wait on appropriate
+ * closure, then invalidate in btree and do normal
+ * write
+ */
+ s->bio_insert_done = true;
+ s->error = -ENOMEM;
+ } else if (s->write) {
+ s->skip = true;
+ return bio_invalidate(cl);
+ } else
+ s->bio_insert_done = true;
+
+ if (!keylist_empty(&op->keys))
+ continue_at(cl, bcache_journal, bcache_wq);
+ else
+ closure_return(cl);
+}
+
+static void bio_insert(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+ struct search *s = container_of(op, struct search, op);
+ struct bio *bio = s->cache_bio;
+
+ if (!s->skip) {
+ bio->bi_end_io = bio_insert_endio;
+ bio->bi_private = cl;
+ bio_get(bio);
+ }
+
+ keylist_init(&op->keys);
+ bio_insert_loop(cl);
+}
+
+void bcache_btree_insert_async(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+ struct search *s = container_of(op, struct search, op);
+
+ if (bcache_btree_insert(op, op->d->c)) {
+ s->error = -ENOMEM;
+ s->bio_insert_done = true;
+ }
+
+ if (s->bio_insert_done) {
+ keylist_free(&op->keys);
+ closure_return(cl);
+ } else
+ continue_at(cl, bio_insert_loop, bcache_wq);
+}
+
+/* Common code for the make_request functions */
+
+static void __bio_complete(struct search *s)
+{
+ if (s->orig_bio) {
+ if (s->error)
+ clear_bit(BIO_UPTODATE, &s->orig_bio->bi_flags);
+
+ trace_bcache_request_end(&s->op, s->orig_bio);
+ bio_endio(s->orig_bio, s->error);
+ s->orig_bio = NULL;
+ }
+}
+
+static void request_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+
+ if (error) {
+ struct search *s = container_of(cl, struct search, cl);
+ s->error = error;
+ /* Only cache read errors are recoverable */
+ s->recoverable = false;
+ }
+
+ bio_put(bio);
+ closure_put(cl);
+}
+
+void cache_read_endio(struct bio *bio, int error)
+{
+ struct bbio *b = container_of(bio, struct bbio, bio);
+ struct closure *cl = bio->bi_private;
+ struct search *s = container_of(cl, struct search, cl);
+
+ /*
+ * If the bucket was reused while our bio was in flight, we might have
+ * read the wrong data. In that case we set s->error but not error, so the
+ * failure isn't counted against the cache device, and we'll still reread
+ * the data from the backing device.
+ */
+
+ if (error)
+ s->error = error;
+ else if (ptr_stale(s->op.d->c, &b->key, 0)) {
+ atomic_long_inc(&s->op.d->c->cache_read_races);
+ s->error = -EINTR;
+ }
+
+ bcache_endio(s->op.d->c, bio, error, "reading from cache");
+}
+
+static void __do_bio_hook(struct search *s)
+{
+ struct bio *bio = &s->bio.bio;
+ memcpy(bio, s->orig_bio, sizeof(struct bio));
+
+#ifdef CONFIG_DISKMON
+ bio->bi_flowid = NULL;
+#endif
+ bio->bi_end_io = request_endio;
+ bio->bi_private = &s->cl;
+ bio->bi_destructor = NULL;
+ atomic_set(&bio->bi_cnt, 3);
+}
+
+static struct search *do_bio_hook(struct bio *bio, struct bcache_device *d)
+{
+ struct bio_vec *bv;
+ struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
+ memset(s, 0, offsetof(struct search, op.keys));
+
+ __closure_init(&s->cl, NULL);
+ __closure_init(&s->op.cl, &s->cl);
+
+ s->op.d = d;
+ s->op.lock = -1;
+ s->task = get_current();
+ s->orig_bio = bio;
+ s->write = bio->bi_rw & REQ_WRITE;
+ s->op.flush_journal = bio->bi_rw & REQ_FLUSH;
+ s->recoverable = 1;
+ __do_bio_hook(s);
+
+ if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
+ bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
+ memcpy(bv, bio_iovec(bio),
+ sizeof(struct bio_vec) * bio_segments(bio));
+
+ s->bio.bio.bi_io_vec = bv;
+ s->unaligned_bvec = 1;
+ }
+
+ return s;
+}
+
+static void btree_read_async(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+
+ int ret = btree_root(search_recurse, op->d->c, op);
+
+ if (ret == -EAGAIN)
+ continue_at(cl, btree_read_async, bcache_wq);
+
+ closure_return(cl);
+}
+
+/* Cached devices */
+
+static void cached_dev_bio_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->op.d, struct cached_dev, disk);
+
+ if (s->cache_bio)
+ bio_put(s->cache_bio);
+
+ if (s->unaligned_bvec)
+ mempool_free(s->bio.bio.bi_io_vec, dc->disk.unaligned_bvec);
+
+ __bio_complete(s);
+
+ closure_debug_destroy(&s->cl);
+ mempool_free(s, dc->disk.c->search);
+ cached_dev_put(dc);
+}
+
+/* Process reads */
+
+static void cached_dev_read_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+
+ if (s->cache_miss)
+ bio_put(s->cache_miss);
+
+ if (s->cache_bio) {
+ int i;
+ struct bio_vec *bv;
+
+ __bio_for_each_segment(bv, s->cache_bio, i, 0)
+ __free_page(bv->bv_page);
+ }
+
+ cached_dev_bio_complete(cl);
+}
+
+static void request_read_error(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct bio_vec *bv;
+ int i;
+
+ if (s->recoverable) {
+ /* The cache read failed, but we can retry from the backing
+ * device.
+ */
+ pr_debug("recovering at sector %llu",
+ (uint64_t) s->orig_bio->bi_sector);
+
+ s->error = 0;
+ bv = s->bio.bio.bi_io_vec;
+ __do_bio_hook(s);
+ s->bio.bio.bi_io_vec = bv;
+
+ if (!s->unaligned_bvec)
+ bio_for_each_segment(bv, s->orig_bio, i)
+ bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
+ else
+ memcpy(s->bio.bio.bi_io_vec,
+ bio_iovec(s->orig_bio),
+ sizeof(struct bio_vec) *
+ bio_segments(s->orig_bio));
+
+ /* XXX: invalidate cache */
+
+ trace_bcache_read_retry(&s->bio.bio);
+ closure_bio_submit(&s->bio.bio, &s->cl, s->op.d->c->bio_split);
+ }
+
+ continue_at(cl, cached_dev_read_complete, NULL);
+}
+
+static void request_read_done(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *d = container_of(s->op.d, struct cached_dev, disk);
+
+ /*
+ * s->cache_bio != NULL implies that we had a cache miss; cache_bio now
+ * contains data ready to be inserted into the cache.
+ *
+ * First, we copy the data we just read from cache_bio's bounce buffers
+ * to the buffers the original bio pointed to:
+ */
+
+ if (s->cache_bio) {
+ struct bio_vec *src, *dst;
+ unsigned src_offset, dst_offset, bytes;
+ void *dst_ptr;
+
+ bio_reset(s->cache_bio);
+ atomic_set(&s->cache_bio->bi_cnt, 1);
+ s->cache_bio->bi_sector = s->cache_miss->bi_sector;
+ s->cache_bio->bi_bdev = s->cache_miss->bi_bdev;
+ s->cache_bio->bi_size = s->cache_bio_sectors << 9;
+ bio_map(s->cache_bio, NULL);
+
+ src = bio_iovec(s->cache_bio);
+ dst = bio_iovec(s->cache_miss);
+ src_offset = src->bv_offset;
+ dst_offset = dst->bv_offset;
+ dst_ptr = kmap(dst->bv_page);
+
+ while (1) {
+ if (dst_offset == dst->bv_offset + dst->bv_len) {
+ kunmap(dst->bv_page);
+ dst++;
+ if (dst == bio_iovec_idx(s->cache_miss,
+ s->cache_miss->bi_vcnt))
+ break;
+
+ dst_offset = dst->bv_offset;
+ dst_ptr = kmap(dst->bv_page);
+ }
+
+ if (src_offset == src->bv_offset + src->bv_len) {
+ src++;
+ if (src == bio_iovec_idx(s->cache_bio,
+ s->cache_bio->bi_vcnt))
+ BUG();
+
+ src_offset = src->bv_offset;
+ }
+
+ bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
+ src->bv_offset + src->bv_len - src_offset);
+
+ memcpy(dst_ptr + dst_offset,
+ page_address(src->bv_page) + src_offset,
+ bytes);
+
+ src_offset += bytes;
+ dst_offset += bytes;
+ }
+ }
+
+ if (verify(d, &s->bio.bio) && s->recoverable)
+ data_verify(s);
+
+ __bio_complete(s);
+
+ if (s->cache_bio && !atomic_read(&s->op.d->c->closing)) {
+ s->op.type = BTREE_REPLACE;
+ closure_init(&s->op.cl, &s->cl);
+ bio_insert(&s->op.cl);
+ }
+
+ continue_at(cl, cached_dev_read_complete, NULL);
+}
+
+static void request_read_done_bh(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *d = container_of(s->op.d, struct cached_dev, disk);
+
+ if (s->cache_miss && s->op.insert_collision)
+ mark_cache_miss_collision(&s->op);
+
+ mark_cache_accounting(s, !s->cache_miss, s->skip);
+
+ if (s->error)
+ set_closure_fn(cl, request_read_error, bcache_wq);
+ else if (s->cache_bio || verify(d, &s->bio.bio))
+ set_closure_fn(cl, request_read_done, bcache_wq);
+ else
+ set_closure_fn(cl, cached_dev_read_complete, NULL);
+
+ closure_queue(cl);
+}
+
+static int cached_dev_cache_miss(struct btree *b, struct search *s,
+ struct bio *bio, unsigned sectors)
+{
+ int ret = 0;
+ unsigned reada;
+ struct cached_dev *d = container_of(s->op.d, struct cached_dev, disk);
+ struct bio *n;
+
+ sectors = min(sectors, bio_max_sectors(bio));
+
+ n = bio_split_get(bio, sectors, s->op.d);
+ if (!n)
+ return -EAGAIN;
+
+ if (n == bio)
+ s->op.lookup_done = true;
+
+ if (s->cache_miss || s->skip)
+ goto out_submit;
+
+ if (n != bio ||
+ (bio->bi_rw & REQ_RAHEAD) ||
+ (bio->bi_rw & REQ_META) ||
+ s->op.d->c->gc_stats.in_use >= CUTOFF_CACHE_READA)
+ reada = 0;
+ else
+ reada = min(d->readahead >> 9, sectors - bio_sectors(n));
+
+ s->cache_bio_sectors = bio_sectors(n) + reada;
+ s->cache_bio = bbio_kmalloc(GFP_NOIO,
+ DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS));
+
+ if (!s->cache_bio)
+ goto out_submit;
+
+ s->cache_bio->bi_sector = n->bi_sector;
+ s->cache_bio->bi_bdev = n->bi_bdev;
+ s->cache_bio->bi_size = s->cache_bio_sectors << 9;
+
+ s->cache_bio->bi_end_io = request_endio;
+ s->cache_bio->bi_private = &s->cl;
+
+ /* btree_search_recurse()'s btree iterator is no good anymore */
+ ret = -EINTR;
+ if (!btree_insert_check_key(b, &s->op, s->cache_bio))
+ goto out_put;
+
+ bio_map(s->cache_bio, NULL);
+ if (bio_alloc_pages(s->cache_bio, __GFP_NOWARN|GFP_NOIO))
+ goto out_put;
+
+ s->cache_miss = n;
+ bio_get(s->cache_bio);
+
+ trace_bcache_cache_miss(s->orig_bio);
+ generic_make_request(s->cache_bio);
+
+ return ret;
+out_put:
+ bio_put(s->cache_bio);
+ s->cache_bio = NULL;
+out_submit:
+ generic_make_request(n);
+ return ret;
+}
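To put numbers on the readahead sizing above (all inputs here are invented): the
readahead knob is in bytes, hence the >> 9, and the prefetch is capped so it never
runs past the end of the missed range before the next cached extent; the resulting
cache_bio is then allocated in whole pages.

        #include <stdio.h>

        #define PAGE_SECTORS            8       /* 4 KiB pages, 512-byte sectors */
        #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

        static unsigned min_u(unsigned a, unsigned b)
        {
                return a < b ? a : b;
        }

        int main(void)
        {
                unsigned readahead_bytes = 128 * 1024;  /* assumed tuning value */
                unsigned request_sectors = 8;           /* the 4 KiB read that missed */
                unsigned hole_sectors = 512;            /* gap until the next cached extent */

                unsigned reada = min_u(readahead_bytes >> 9,
                                       hole_sectors - request_sectors);
                unsigned cache_bio_sectors = request_sectors + reada;

                printf("%u sectors in %u pages\n", cache_bio_sectors,
                       DIV_ROUND_UP(cache_bio_sectors, PAGE_SECTORS));
                /* prints: 264 sectors in 33 pages */
                return 0;
        }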
+
+static void request_read(struct cached_dev *d, struct search *s)
+{
+ check_should_skip(d, s);
+
+ set_closure_fn(&s->cl, request_read_done_bh, NULL);
+ closure_set_stopped(&s->cl);
+
+ btree_read_async(&s->op.cl);
+}
+
+/* Process writes */
+
+static void cached_dev_write_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->op.d, struct cached_dev, disk);
+
+ up_read_non_owner(&dc->writeback_lock);
+ cached_dev_bio_complete(cl);
+}
+
+static bool should_writeback(struct cached_dev *d, struct bio *bio)
+{
+ return !atomic_read(&d->disk.detaching) &&
+ cache_mode(d, bio) == CACHE_MODE_WRITEBACK &&
+ (d->disk.c->gc_stats.in_use < ((bio->bi_rw & REQ_SYNC)
+ ? CUTOFF_WRITEBACK_SYNC
+ : CUTOFF_WRITEBACK));
+}
+
+static void request_write_resubmit(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+ struct search *s = container_of(op, struct search, op);
+ struct bio *bio = &s->bio.bio;
+
+ closure_bio_submit(bio, &s->cl, op->d->c->bio_split);
+
+ bio_insert(&s->op.cl);
+ continue_at(&s->cl, cached_dev_write_complete, NULL);
+}
+
+static void request_write(struct cached_dev *d, struct search *s)
+{
+ struct closure *cl = &s->cl;
+ struct bio *bio = &s->bio.bio;
+
+ check_should_skip(d, s);
+ down_read_non_owner(&d->writeback_lock);
+
+ if (bcache_in_writeback(d, bio->bi_sector, bio_sectors(bio))) {
+ s->skip = false;
+ s->writeback = true;
+ }
+
+ if (s->skip) {
+skip: s->cache_bio = s->orig_bio;
+ bio_get(s->cache_bio);
+ trace_bcache_write_skip(s->orig_bio);
+
+ if (bio->bi_rw & (1 << BIO_RW_DISCARD)) {
+ closure_get(cl);
+
+ if (blk_queue_discard(bdev_get_queue(d->bdev)))
+ generic_make_request(bio);
+ else
+ bio_endio(bio, 0);
+
+ goto out;
+ } else
+ goto submit;
+ }
+
+ if (should_writeback(d, s->orig_bio))
+ s->writeback = true;
+
+ if (!s->writeback) {
+ s->cache_bio = bbio_kmalloc(GFP_NOIO, bio->bi_max_vecs);
+ if (!s->cache_bio) {
+ s->skip = true;
+ goto skip;
+ }
+
+ __bio_clone(s->cache_bio, bio);
+ trace_bcache_writethrough(s->orig_bio);
+submit:
+ if (closure_bio_submit(bio, cl, s->op.d->c->bio_split))
+ continue_at(&s->op.cl,
+ request_write_resubmit,
+ bcache_wq);
+ } else {
+ s->cache_bio = bio;
+ trace_bcache_writeback(s->orig_bio);
+ bcache_writeback_add(d, bio_sectors(bio));
+ }
+out:
+ bio_insert(&s->op.cl);
+ continue_at(cl, cached_dev_write_complete, NULL);
+}
+
+static void request_nodata(struct cached_dev *d, struct search *s)
+{
+ struct closure *cl = &s->cl;
+ struct bio *bio = &s->bio.bio;
+
+ if (bio->bi_rw & (1 << BIO_RW_DISCARD)) {
+ request_write(d, s);
+ return;
+ }
+
+ if (s->op.flush_journal)
+ bcache_journal_meta(s->op.d->c, cl);
+
+ closure_get(cl);
+ generic_make_request(bio);
+
+ closure_set_stopped(&s->op.cl);
+ closure_put(&s->op.cl);
+
+ continue_at(cl, cached_dev_bio_complete, NULL);
+}
+
+/* Split bios in passthrough mode */
+
+static void bio_passthrough_done(struct closure *cl)
+{
+ struct bio_passthrough *s = container_of(cl, struct bio_passthrough,
+ cl);
+
+ s->bio->bi_end_io = s->bi_end_io;
+ s->bio->bi_private = s->bi_private;
+ bio_endio(s->bio, 0);
+
+ closure_debug_destroy(&s->cl);
+ mempool_free(s, s->d->bio_passthrough);
+}
+
+static void bio_passthrough_endio(struct bio *bio, int error)
+{
+ struct closure *cl = bio->bi_private;
+ struct bio_passthrough *s = container_of(cl, struct bio_passthrough,
+ cl);
+
+ if (error)
+ clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
+
+ bio_put(bio);
+ closure_put(cl);
+}
+
+static void bio_passthrough_submit(struct closure *cl)
+{
+ struct bio_passthrough *s = container_of(cl, struct bio_passthrough,
+ cl);
+ struct bio *bio = s->bio, *n;
+
+ do {
+ n = bio_split_get(bio, bio_max_sectors(bio), &s->d->disk);
+ if (!n)
+ continue_at(cl, bio_passthrough_submit, bcache_wq);
+
+ if (n == bio) {
+ set_closure_fn(cl, bio_passthrough_done, NULL);
+ closure_set_stopped(cl);
+ }
+
+ trace_bcache_passthrough(n);
+ generic_make_request(n);
+ } while (n != bio);
+}
+
+static void bio_passthrough(struct cached_dev *d, struct bio *bio)
+{
+ struct bio_passthrough *s;
+
+ if (bio->bi_rw & (1 << BIO_RW_DISCARD)) {
+ if (!blk_queue_discard(bdev_get_queue(d->bdev)))
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
+
+ return;
+ }
+
+ if (!bio_has_data(bio) ||
+ bio->bi_size <= bio_max_sectors(bio) << 9) {
+ generic_make_request(bio);
+ return;
+ }
+
+ s = mempool_alloc(d->bio_passthrough, GFP_NOIO);
+
+ closure_init(&s->cl, NULL);
+ s->d = d;
+ s->bio = bio;
+ s->bi_end_io = bio->bi_end_io;
+ s->bi_private = bio->bi_private;
+
+ bio_get(bio);
+ bio->bi_end_io = bio_passthrough_endio;
+ bio->bi_private = &s->cl;
+
+ bio_passthrough_submit(&s->cl);
+}
+
+/* Cached devices - read & write stuff */
+
+int bcache_get_congested(struct cache_set *c)
+{
+ int i;
+
+ if (!c->congested_read_threshold_us &&
+ !c->congested_write_threshold_us)
+ return 0;
+
+ i = (local_clock_us() - c->congested_last_us) / 1024;
+ if (i < 0)
+ return 0;
+
+ i += atomic_read(&c->congested);
+ if (i >= 0)
+ return 0;
+
+ i += CONGESTED_MAX;
+
+ return i <= 0 ? 1 : fract_exp_two(i, 6);
+}
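To make the arithmetic above concrete: a nonzero result requires c->congested to
have gone negative (presumably it is driven negative elsewhere in the series when
cache I/Os blow past the configured latency thresholds), and the elapsed-time term
pays that debt back at roughly one unit per 1024 microseconds, so the bypass cutoff
shrinks as the congestion ages out. A standalone toy model of that decay -
CONGESTED_MAX and the fract_exp_two() scaling are stubbed, so treat the exact
numbers as assumptions:

        #include <stdio.h>

        #define CONGESTED_MAX 1024      /* assumed; defined elsewhere in the series */

        static int toy_get_congested(long long elapsed_us, long long congested)
        {
                long long i = elapsed_us / 1024;        /* decay: ~1 unit per ms */

                if (i < 0)
                        return 0;

                i += congested;
                if (i >= 0)
                        return 0;               /* congestion has fully decayed */

                i += CONGESTED_MAX;
                return i <= 0 ? 1 : (int)i;     /* real code scales via fract_exp_two(i, 6) */
        }

        int main(void)
        {
                /* 5 ms after the counter hit -200: cutoff is still large. */
                printf("%d\n", toy_get_congested(5000, -200));          /* 828 */
                /* 300 ms later the backlog has decayed away entirely. */
                printf("%d\n", toy_get_congested(300000, -200));        /* 0 */
                return 0;
        }

In check_should_skip() below, this value stands in for the sequential cutoff, so
heavier congestion (a smaller returned value) bypasses more I/O.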
+
+static void check_should_skip(struct cached_dev *d, struct search *s)
+{
+ void add_sequential(struct task_struct *t)
+ {
+ ewma_add(t->sequential_io_avg,
+ t->sequential_io, 8, 0);
+
+ t->sequential_io = 0;
+ }
+
+ struct hlist_head *iohash(uint64_t k)
+ { return &d->io_hash[hash_64(k, RECENT_IO_BITS)]; }
+
+ struct cache_set *c = s->op.d->c;
+ struct bio *bio = &s->bio.bio;
+
+ int cutoff = bcache_get_congested(c);
+ unsigned mode = cache_mode(d, bio);
+
+ if (atomic_read(&d->disk.detaching) ||
+ c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
+ (bio->bi_rw & (1 << BIO_RW_DISCARD)))
+ goto skip;
+
+ if (mode == CACHE_MODE_NONE ||
+ (mode == CACHE_MODE_WRITEAROUND &&
+ (bio->bi_rw & REQ_WRITE)))
+ goto skip;
+
+ if (bio->bi_sector & (c->sb.block_size - 1) ||
+ bio_sectors(bio) & (c->sb.block_size - 1)) {
+ pr_debug("skipping unaligned io");
+ goto skip;
+ }
+
+ if (!cutoff) {
+ cutoff = d->sequential_cutoff >> 9;
+
+ if (!cutoff)
+ goto rescale;
+
+ if (mode == CACHE_MODE_WRITEBACK &&
+ (bio->bi_rw & REQ_WRITE) &&
+ (bio->bi_rw & REQ_SYNC))
+ goto rescale;
+ }
+
+ if (d->sequential_merge) {
+ struct hlist_node *cursor;
+ struct io *i;
+
+ spin_lock(&d->io_lock);
+
+ hlist_for_each_entry(i, cursor, iohash(bio->bi_sector), hash)
+ if (i->last == bio->bi_sector &&
+ time_before(jiffies, i->jiffies))
+ goto found;
+
+ i = list_first_entry(&d->io_lru, struct io, lru);
+
+ add_sequential(s->task);
+ i->sequential = 0;
+found:
+ if (i->sequential + bio->bi_size > i->sequential)
+ i->sequential += bio->bi_size;
+
+ i->last = bio_end(bio);
+ i->jiffies = jiffies + msecs_to_jiffies(5000);
+ s->task->sequential_io = i->sequential;
+
+ hlist_del(&i->hash);
+ hlist_add_head(&i->hash, iohash(i->last));
+ list_move_tail(&i->lru, &d->io_lru);
+
+ spin_unlock(&d->io_lock);
+ } else {
+ s->task->sequential_io = bio->bi_size;
+
+ add_sequential(s->task);
+ }
+
+ cutoff -= popcount_32(get_random_int());
+
+ if (cutoff <= (int) (max(s->task->sequential_io,
+ s->task->sequential_io_avg) >> 9))
+ goto skip;
+
+rescale:
+ rescale_priorities(c, bio_sectors(bio));
+ return;
+skip:
+ mark_sectors_bypassed(s, bio_sectors(bio));
+ s->skip = true;
+}
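On the popcount_32(get_random_int()) term above: the population count of a uniformly
random 32-bit word follows a Binomial(32, 1/2) distribution, mean 16 with a standard
deviation of about 2.8, so the sequential cutoff is nudged down by a small
bell-shaped amount rather than acting as a hard edge. A standalone demonstration
(build with something like cc demo.c -lm; the RNG mixing is just for illustration):

        #include <math.h>
        #include <stdio.h>
        #include <stdlib.h>

        static unsigned popcount_32(unsigned v)
        {
                unsigned c = 0;

                for (; v; v &= v - 1)   /* clear lowest set bit */
                        c++;
                return c;
        }

        int main(void)
        {
                const int n = 1000000;
                double sum = 0, sumsq = 0;

                srand(1);
                for (int i = 0; i < n; i++) {
                        unsigned r = ((unsigned)rand() << 16) ^ (unsigned)rand();
                        unsigned p = popcount_32(r);

                        sum += p;
                        sumsq += (double)p * p;
                }

                double mean = sum / n;

                printf("mean %.2f stddev %.2f\n", mean, sqrt(sumsq / n - mean * mean));
                /* prints roughly: mean 16.00 stddev 2.83 */
                return 0;
        }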
+
+static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct search *s;
+ struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
+ bio->bi_bdev = dc->bdev;
+ bio->bi_sector += 16;
+
+ if (cached_dev_get(dc)) {
+ s = do_bio_hook(bio, d);
+ trace_bcache_request_start(&s->op, bio);
+
+ (!bio_has_data(bio) ? request_nodata :
+ bio->bi_rw & REQ_WRITE ? request_write :
+ request_read)(dc, s);
+ } else
+ bio_passthrough(dc, bio);
+}
+
+static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
+}
+
+static int cached_dev_congested(void *data, int bits)
+{
+ struct bcache_device *d = data;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ struct request_queue *q = bdev_get_queue(dc->bdev);
+ int ret = 0;
+
+ if (bdi_congested(&q->backing_dev_info, bits))
+ return 1;
+
+ if (cached_dev_get(dc)) {
+ struct cache *ca;
+
+ for_each_cache(ca, d->c) {
+ q = bdev_get_queue(ca->bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+
+ cached_dev_put(dc);
+ }
+
+ return ret;
+}
+
+void cached_dev_request_init(struct cached_dev *d)
+{
+ struct gendisk *g = d->disk.disk;
+
+ g->queue->make_request_fn = cached_dev_make_request;
+ g->queue->backing_dev_info.congested_fn = cached_dev_congested;
+ d->disk.cache_miss = cached_dev_cache_miss;
+ d->disk.ioctl = cached_dev_ioctl;
+}
+
+/* Flash backed devices */
+
+static void flash_dev_bio_complete(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct bcache_device *d = s->op.d;
+
+ __bio_complete(s);
+
+ if (s->cache_bio) {
+ int i;
+ struct bio_vec *bv;
+
+ if (!s->write)
+ __bio_for_each_segment(bv, s->cache_bio, i, 0)
+ __free_page(bv->bv_page);
+ bio_put(s->cache_bio);
+ }
+
+ if (s->unaligned_bvec)
+ mempool_free(s->bio.bio.bi_io_vec, d->unaligned_bvec);
+
+ closure_debug_destroy(&s->cl);
+ mempool_free(s, d->c->search);
+}
+
+static int flash_dev_cache_miss(struct btree *b, struct search *s,
+ struct bio *bio, unsigned sectors)
+{
+ sectors = min(sectors, bio_sectors(bio));
+
+ /* Zero fill bio */
+
+ while (sectors) {
+ struct bio_vec *bv = bio_iovec(bio);
+ unsigned j = min(bv->bv_len >> 9, sectors);
+
+ void *p = kmap(bv->bv_page);
+ memset(p + bv->bv_offset, 0, j << 9);
+ kunmap(bv->bv_page);
+
+ bv->bv_len -= j << 9;
+ bv->bv_offset += j << 9;
+
+ bio->bi_sector += j;
+ bio->bi_size -= j << 9;
+
+ bio->bi_idx++;
+ sectors -= j;
+ }
+
+ if (sectors >= bio_sectors(bio)) {
+ s->op.lookup_done = true;
+ bio_endio(bio, 0);
+ }
+ return 0;
+}
+
+static void flash_dev_read(struct search *s)
+{
+ set_closure_fn(&s->cl, flash_dev_bio_complete, NULL);
+ closure_set_stopped(&s->cl);
+
+ btree_read_async(&s->op.cl);
+}
+
+static void flash_dev_write(struct search *s)
+{
+ struct closure *cl = &s->cl;
+ struct bio *bio = &s->bio.bio;
+
+ if (bio->bi_rw & (1 << BIO_RW_DISCARD)) {
+ s->cache_bio = s->orig_bio;
+ s->skip = true;
+
+ closure_get(cl);
+ bio_get(s->cache_bio);
+ bio_endio(bio, 0);
+ } else {
+ s->writeback = true;
+ s->cache_bio = bio;
+ }
+
+ bio_insert(&s->op.cl);
+ continue_at(&s->cl, flash_dev_bio_complete, NULL);
+}
+
+static void flash_dev_req_nodata(struct search *s)
+{
+ struct closure *cl = &s->cl;
+ struct bio *bio = &s->bio.bio;
+
+ if (bio->bi_rw & (1 << BIO_RW_DISCARD)) {
+ flash_dev_write(s);
+ return;
+ }
+
+ if (s->op.flush_journal)
+ bcache_journal_meta(s->op.d->c, cl);
+
+ closure_get(cl);
+ generic_make_request(bio);
+
+ closure_set_stopped(&s->op.cl);
+ closure_put(&s->op.cl);
+
+ continue_at(&s->cl, flash_dev_bio_complete, NULL);
+}
+
+static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct search *s;
+ struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+
+ s = do_bio_hook(bio, d);
+ trace_bcache_request_start(&s->op, bio);
+
+ (!bio_has_data(bio) ? flash_dev_req_nodata :
+ bio->bi_rw & REQ_WRITE ? flash_dev_write :
+ flash_dev_read)(s);
+}
+
+static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ return -ENOTTY;
+}
+
+static int flash_dev_congested(void *data, int bits)
+{
+ struct bcache_device *d = data;
+ struct request_queue *q;
+ struct cache *ca;
+ int ret = 0;
+
+ for_each_cache(ca, d->c) {
+ q = bdev_get_queue(ca->bdev);
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ }
+
+ return ret;
+}
+
+void flash_dev_request_init(struct bcache_device *d)
+{
+ struct gendisk *g = d->disk;
+
+ g->queue->make_request_fn = flash_dev_make_request;
+ g->queue->backing_dev_info.congested_fn = flash_dev_congested;
+ d->cache_miss = flash_dev_cache_miss;
+ d->ioctl = flash_dev_ioctl;
+}
+
+void bcache_request_exit(void)
+{
+#ifdef CONFIG_CGROUP_BCACHE
+ cgroup_unload_subsys(&bcache_subsys);
+#endif
+ if (passthrough_cache)
+ kmem_cache_destroy(passthrough_cache);
+ if (search_cache)
+ kmem_cache_destroy(search_cache);
+}
+
+int __init bcache_request_init(void)
+{
+ if (!(search_cache = KMEM_CACHE(search, 0)) ||
+ !(passthrough_cache = KMEM_CACHE(bio_passthrough, 0)))
+ goto err;
+
+#ifdef CONFIG_CGROUP_BCACHE
+ cgroup_load_subsys(&bcache_subsys);
+ init_bcache_cgroup(&bcache_default_cgroup);
+#endif
+ return 0;
+err:
+ bcache_request_exit();
+ return -ENOMEM;
+}
diff --git a/drivers/block/bcache/request.h b/drivers/block/bcache/request.h
new file mode 100644
index 0000000..7e1b11a
--- /dev/null
+++ b/drivers/block/bcache/request.h
@@ -0,0 +1,58 @@
+#ifndef _BCACHE_REQUEST_H_
+#define _BCACHE_REQUEST_H_
+#include <linux/cgroup.h>
+
+struct search {
+ /* Stack frame for bio_complete */
+ struct closure cl;
+
+ struct task_struct *task;
+
+ struct bbio bio;
+ struct bio *orig_bio;
+ struct bio *cache_bio;
+ struct bio *cache_miss;
+ unsigned cache_bio_sectors;
+
+ unsigned recoverable:1;
+ unsigned unaligned_bvec:1;
+ unsigned skip:1;
+ unsigned write:1;
+ unsigned writeback:1;
+
+ unsigned bio_insert_done:1;
+
+ /* IO error returned to s->bio */
+ short error;
+
+ /* Anything past op->keys won't get zeroed in do_bio_hook */
+ struct btree_op op;
+};
+
+void cache_read_endio(struct bio *, int);
+int bcache_get_congested(struct cache_set *);
+void bcache_btree_insert_async(struct closure *);
+
+void bcache_open_buckets_free(struct cache_set *);
+int bcache_open_buckets_alloc(struct cache_set *);
+
+void cached_dev_request_init(struct cached_dev *d);
+void flash_dev_request_init(struct bcache_device *d);
+
+extern struct kmem_cache *search_cache, *passthrough_cache;
+
+struct bcache_cgroup {
+#ifdef CONFIG_CGROUP_BCACHE
+ struct cgroup_subsys_state css;
+#endif
+ /*
+ * We subtract one from the index into bcache_cache_modes[], so that
+ * default == -1; this makes it so the rest match up with d->cache_mode,
+ * and we use d->cache_mode if cgrp->cache_mode < 0
+ */
+ short cache_mode;
+ bool verify;
+ struct cache_stat_collector stats;
+};
+
+#endif /* _BCACHE_REQUEST_H_ */
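As a reading aid for the cache_mode comment in struct bcache_cgroup above, the
fallback rule it describes amounts to the following; the helper is hypothetical,
not part of the patch:

        #include <stdio.h>

        /* Hypothetical restatement of the described fallback rule. */
        static short effective_cache_mode(short cgroup_mode, short device_mode)
        {
                /* cgroup_mode < 0 means "default": defer to the device setting. */
                return cgroup_mode >= 0 ? cgroup_mode : device_mode;
        }

        int main(void)
        {
                printf("%d\n", effective_cache_mode(-1, 2));    /* default -> 2 */
                printf("%d\n", effective_cache_mode(0, 2));     /* explicit -> 0 */
                return 0;
        }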
--
1.7.9.rc2
* [Bcache v13 15/16] bcache: Writeback
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (9 preceding siblings ...)
2012-05-10 3:11 ` [Bcache v13 14/16] bcache: Request, io and allocation code Kent Overstreet
@ 2012-05-10 3:11 ` Kent Overstreet
2012-05-10 13:54 ` [Bcache v13 00/16] Vivek Goyal
` (2 subsequent siblings)
13 siblings, 0 replies; 87+ messages in thread
From: Kent Overstreet @ 2012-05-10 3:11 UTC (permalink / raw)
To: linux-bcache, linux-kernel, dm-devel; +Cc: tejun, agk
Signed-off-by: Kent Overstreet <koverstreet@google.com>
---
drivers/block/bcache/writeback.c | 518 ++++++++++++++++++++++++++++++++++++++
1 files changed, 518 insertions(+), 0 deletions(-)
create mode 100644 drivers/block/bcache/writeback.c
diff --git a/drivers/block/bcache/writeback.c b/drivers/block/bcache/writeback.c
new file mode 100644
index 0000000..cfcfe52
--- /dev/null
+++ b/drivers/block/bcache/writeback.c
@@ -0,0 +1,518 @@
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+
+static struct workqueue_struct *dirty_wq;
+
+static void read_dirty(struct cached_dev *);
+
+/* Background writeback */
+
+static void dirty_init(struct dirty *w)
+{
+ struct bio *bio = &w->io->bio;
+
+ bio_init(bio);
+ bio_get(bio);
+ if (!w->io->d->writeback_percent)
+ bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ bio->bi_size = KEY_SIZE(&w->key) << 9;
+ bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
+ bio->bi_private = w;
+ bio_map(bio, NULL);
+}
+
+static int dirty_cmp(struct dirty *r, struct dirty *l)
+{
+ /* Overlapping keys must compare equal */
+ if (KEY_START(&r->key) >= l->key.key)
+ return 1;
+ if (KEY_START(&l->key) >= r->key.key)
+ return -1;
+ return 0;
+}
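dirty_cmp() deliberately reports any two overlapping extents as equal; that is what
lets the RB_SEARCH() in bcache_in_writeback() further down find a pending writeback
that intersects an incoming write with a single ordinary tree lookup. The same trick
on plain integer ranges, as a standalone sketch with invented names:

        #include <stdio.h>

        struct range {
                unsigned long long start, end;  /* [start, end), like KEY_START..key */
        };

        /* Usable as an rbtree comparator: overlap counts as a match. */
        static int range_cmp(const struct range *a, const struct range *b)
        {
                if (a->start >= b->end)
                        return 1;       /* a lies entirely after b */
                if (b->start >= a->end)
                        return -1;      /* a lies entirely before b */
                return 0;               /* overlap */
        }

        int main(void)
        {
                struct range pending = { 100, 200 };    /* extent queued for writeback */
                struct range incoming = { 150, 160 };   /* new write */

                printf("%d\n", range_cmp(&incoming, &pending)); /* 0: search stops here */
                return 0;
        }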
+
+static int btree_refill_dirty_leaf(struct btree *b, struct btree_op *op,
+ struct cached_dev *dc)
+{
+ struct dirty *w;
+ struct btree_iter iter;
+ btree_iter_init(b, &iter, &KEY(op->d->id, dc->last_found, 0));
+
+ /* To protect rb tree access vs. read_dirty() */
+ spin_lock(&dc->dirty_lock);
+
+ while (!array_freelist_empty(&dc->dirty_freelist)) {
+ struct bkey *k = btree_iter_next(&iter);
+ if (!k || KEY_DEV(k) != op->d->id)
+ break;
+
+ if (ptr_bad(b, k))
+ continue;
+
+ if (KEY_DIRTY(k)) {
+ w = array_alloc(&dc->dirty_freelist);
+
+ dc->last_found = k->key;
+ pr_debug("%s", pkey(k));
+ w->io = NULL;
+ bkey_copy(&w->key, k);
+ SET_KEY_DIRTY(&w->key, false);
+
+ if (RB_INSERT(&dc->dirty, w, node, dirty_cmp))
+ array_free(&dc->dirty_freelist, w);
+ }
+ }
+
+ spin_unlock(&dc->dirty_lock);
+
+ return 0;
+}
+
+static int btree_refill_dirty(struct btree *b, struct btree_op *op,
+ struct cached_dev *dc)
+{
+ int r;
+ struct btree_iter iter;
+ btree_iter_init(b, &iter, &KEY(op->d->id, dc->last_found, 0));
+
+ if (!b->level)
+ return btree_refill_dirty_leaf(b, op, dc);
+
+ while (!array_freelist_empty(&dc->dirty_freelist)) {
+ struct bkey *k = btree_iter_next(&iter);
+ if (!k)
+ break;
+
+ if (ptr_bad(b, k))
+ continue;
+
+ r = btree(refill_dirty, k, b, op, dc);
+ if (r) {
+ char buf[BDEVNAME_SIZE];
+ bdevname(dc->bdev, buf);
+
+ printk(KERN_WARNING "Error trying to read the btree "
+ "for background writeback on %s: "
+ "dirty data may have been lost!\n", buf);
+ }
+
+ if (KEY_DEV(k) != op->d->id)
+ break;
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static void refill_dirty(struct work_struct *work)
+{
+ struct cached_dev *dc = container_of(to_delayed_work(work),
+ struct cached_dev, refill_dirty);
+ uint64_t start;
+
+ struct btree_op op;
+ btree_op_init_stack(&op);
+ op.d = &dc->disk;
+
+ if (!atomic_read(&dc->disk.detaching) &&
+ !dc->writeback_running)
+ return;
+
+ down_write(&dc->writeback_lock);
+ start = dc->last_found;
+
+ if (!atomic_read(&dc->has_dirty)) {
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+ write_bdev_super(dc, NULL);
+ up_write(&dc->writeback_lock);
+ return;
+ }
+
+ btree_root(refill_dirty, dc->disk.c, &op, dc);
+ closure_sync(&op.cl);
+
+ pr_debug("found %s keys on %i from %llu to %llu, %i%% used",
+ RB_EMPTY_ROOT(&dc->dirty) ? "no" :
+ array_freelist_empty(&dc->dirty_freelist) ? "some" : "a few",
+ dc->disk.id, start, (uint64_t) dc->last_found,
+ dc->disk.c->gc_stats.in_use);
+
+ /* Got to the end of the btree */
+ if (!array_freelist_empty(&dc->dirty_freelist))
+ dc->last_found = 0;
+
+ /* Searched the entire btree - delay for awhile */
+ if (!array_freelist_empty(&dc->dirty_freelist) && !start)
+ queue_delayed_work(dirty_wq, &dc->refill_dirty,
+ dc->writeback_delay * HZ);
+
+ spin_lock(&dc->dirty_lock);
+
+ if (!RB_EMPTY_ROOT(&dc->dirty)) {
+ struct dirty *w;
+ w = RB_FIRST(&dc->dirty, struct dirty, node);
+ dc->writeback_start = KEY_START(&w->key);
+
+ w = RB_LAST(&dc->dirty, struct dirty, node);
+ dc->writeback_end = w->key.key;
+ } else {
+ dc->writeback_start = 0;
+ dc->writeback_end = 0;
+
+ if (!start) {
+ atomic_set(&dc->has_dirty, 0);
+ cached_dev_put(dc);
+ }
+ }
+
+ up_write(&dc->writeback_lock);
+
+ dc->next_writeback_io = local_clock();
+ read_dirty(dc);
+}
+
+bool bcache_in_writeback(struct cached_dev *dc, sector_t offset, unsigned len)
+{
+ struct dirty *w, s;
+ s.key = KEY(dc->disk.id, offset + len, len);
+
+ if (offset >= dc->writeback_end ||
+ offset + len <= dc->writeback_start)
+ return false;
+
+ spin_lock(&dc->dirty_lock);
+ w = RB_SEARCH(&dc->dirty, s, node, dirty_cmp);
+ if (w && !w->io) {
+ rb_erase(&w->node, &dc->dirty);
+ array_free(&dc->dirty_freelist, w);
+ w = NULL;
+ }
+
+ spin_unlock(&dc->dirty_lock);
+ return w != NULL;
+}
+
+void bcache_writeback_queue(struct cached_dev *d)
+{
+ queue_delayed_work(dirty_wq, &d->refill_dirty, 0);
+}
+
+void bcache_writeback_add(struct cached_dev *d, unsigned sectors)
+{
+ atomic_long_add(sectors, &d->disk.sectors_dirty);
+
+ if (!atomic_read(&d->has_dirty) &&
+ !atomic_xchg(&d->has_dirty, 1)) {
+ if (BDEV_STATE(&d->sb) != BDEV_STATE_DIRTY) {
+ SET_BDEV_STATE(&d->sb, BDEV_STATE_DIRTY);
+ /* XXX: should do this synchronously */
+ write_bdev_super(d, NULL);
+ }
+
+ atomic_inc(&d->count);
+ queue_delayed_work(dirty_wq, &d->refill_dirty,
+ d->writeback_delay * HZ);
+
+ if (d->writeback_percent)
+ schedule_delayed_work(&d->writeback_rate_update,
+ d->writeback_rate_update_seconds * HZ);
+ }
+}
+
+static void __update_writeback_rate(struct cached_dev *dc)
+{
+ struct cache_set *c = dc->disk.c;
+ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+ uint64_t cache_dirty_target =
+ div_u64(cache_sectors * dc->writeback_percent, 100);
+
+ int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
+ c->cached_dev_sectors);
+
+ /* PD controller */
+
+ int change = 0;
+ int64_t error;
+ int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
+ int64_t derivative = dirty - dc->disk.sectors_dirty_last;
+
+ dc->disk.sectors_dirty_last = dirty;
+
+ derivative *= dc->writeback_rate_d_term;
+ derivative = clamp(derivative, -dirty, dirty);
+
+ derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
+ dc->writeback_rate_d_smooth, 0);
+
+ /* Avoid divide by zero */
+ if (!target)
+ goto out;
+
+ error = div64_s64((dirty + derivative - target) << 8, target);
+
+ change = div_s64((dc->writeback_rate * error) >> 8,
+ dc->writeback_rate_p_term_inverse);
+
+ /* Don't increase writeback rate if the device isn't keeping up */
+ if (change > 0 &&
+ time_after64(local_clock(),
+ dc->next_writeback_io + 10 * NSEC_PER_MSEC))
+ change = 0;
+
+ dc->writeback_rate = clamp_t(int64_t, dc->writeback_rate + change,
+ 1, NSEC_PER_MSEC);
+out:
+ dc->writeback_rate_derivative = derivative;
+ dc->writeback_rate_change = change;
+ dc->writeback_rate_target = target;
+
+ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
+}
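Plugging numbers into the controller above may help. With the defaults set in
bcache_writeback_init_cached_dev() at the end of this file (writeback_rate 1024,
d_term 16, p_term_inverse 64), a cache that is over its dirty target and still
growing gets its rate nudged up, and one that has fallen back under target gets
throttled. The standalone model below keeps the proportional and derivative terms
but drops the EWMA smoothing and the "device not keeping up" guard, and uses
*256 / /256 in place of the original shifts, so treat it as a sketch:

        #include <stdint.h>
        #include <stdio.h>

        #define D_TERM                  16
        #define P_TERM_INVERSE          64
        #define RATE_MAX                1000000         /* NSEC_PER_MSEC */

        static int64_t clamp64(int64_t v, int64_t lo, int64_t hi)
        {
                return v < lo ? lo : v > hi ? hi : v;
        }

        /* One simplified control step: returns the new writeback rate. */
        static int64_t pd_step(int64_t rate, int64_t dirty, int64_t last_dirty,
                               int64_t target)
        {
                int64_t derivative = clamp64(D_TERM * (dirty - last_dirty),
                                             -dirty, dirty);
                int64_t error = (dirty + derivative - target) * 256 / target;
                int64_t change = rate * error / 256 / P_TERM_INVERSE;

                return clamp64(rate + change, 1, RATE_MAX);
        }

        int main(void)
        {
                /* 10% over target and still growing: rate ramps up (1024 -> 1030). */
                printf("%lld\n", (long long)pd_step(1024, 110000, 108000, 100000));
                /* Back under target and shrinking: rate backs off (1024 -> 1010). */
                printf("%lld\n", (long long)pd_step(1024, 90000, 95000, 100000));
                return 0;
        }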
+
+static void update_writeback_rate(struct work_struct *work)
+{
+ struct cached_dev *dc = container_of(to_delayed_work(work),
+ struct cached_dev,
+ writeback_rate_update);
+
+ down_read(&dc->writeback_lock);
+
+ if (atomic_read(&dc->has_dirty) &&
+ dc->writeback_percent)
+ __update_writeback_rate(dc);
+
+ up_read(&dc->writeback_lock);
+}
+
+static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
+{
+ uint64_t now = local_clock();
+
+ if (atomic_read(&dc->disk.detaching) ||
+ !dc->writeback_percent)
+ return 0;
+
+ /* writeback_rate = sectors per 10 ms */
+ dc->next_writeback_io += div_u64(sectors * 10000000ULL,
+ dc->writeback_rate);
+
+ return time_after64(dc->next_writeback_io, now)
+ ? div_u64(dc->next_writeback_io - now, NSEC_PER_SEC / HZ)
+ : 0;
+}
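Concretely, writeback_rate is in sectors per 10 ms (hence the 10000000ULL nanosecond
factor). With the default rate of 1024 set below, writing back one 256 KiB dirty
extent pushes next_writeback_io out by about 5 ms; the HZ value in the sketch is an
assumption:

        #include <stdio.h>

        int main(void)
        {
                unsigned long long rate = 1024;         /* sectors per 10 ms (default below) */
                unsigned long long sectors = 512;       /* one 256 KiB dirty extent */
                unsigned long long hz = 250;            /* assumed CONFIG_HZ */

                unsigned long long delta_ns = sectors * 10000000ULL / rate;

                printf("%llu ns, ~%llu jiffies at HZ=%llu\n",
                       delta_ns, delta_ns / (1000000000ULL / hz), hz);
                /* prints: 5000000 ns, ~1 jiffies at HZ=250 */
                return 0;
        }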
+
+/* Background writeback - IO loop */
+
+static void write_dirty_finish(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ struct dirty *w = io->bio.bi_private;
+ struct cached_dev *dc = io->d;
+ struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
+
+ while (bv-- != w->io->bio.bi_io_vec)
+ __free_page(bv->bv_page);
+
+ closure_debug_destroy(cl);
+ kfree(io);
+
+ /* This is kind of a dumb way of signalling errors. */
+ if (!KEY_DIRTY(&w->key)) {
+ struct btree_op op;
+ btree_op_init_stack(&op);
+
+ op.type = BTREE_REPLACE;
+ bkey_copy(&op.replace, &w->key);
+ SET_KEY_DIRTY(&op.replace, true);
+
+ keylist_add(&op.keys, &w->key);
+
+ for (unsigned i = 0; i < KEY_PTRS(&w->key); i++)
+ atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
+
+ pr_debug("clearing %s", pkey(&w->key));
+ bcache_btree_insert(&op, dc->disk.c);
+ closure_sync(&op.cl);
+
+ atomic_long_inc(op.insert_collision
+ ? &dc->disk.c->writeback_keys_failed
+ : &dc->disk.c->writeback_keys_done);
+ }
+
+ spin_lock(&dc->dirty_lock);
+ rb_erase(&w->node, &dc->dirty);
+ array_free(&dc->dirty_freelist, w);
+ atomic_dec_bug(&dc->in_flight);
+
+ read_dirty(dc);
+}
+
+static void dirty_endio(struct bio *bio, int error)
+{
+ struct dirty *w = bio->bi_private;
+
+ if (error)
+ SET_KEY_DIRTY(&w->key, true);
+
+ bio_put(bio);
+ closure_put(&w->io->cl);
+}
+
+static void write_dirty(struct closure *cl)
+{
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+ struct dirty *w = io->bio.bi_private;
+
+ dirty_init(w);
+ io->bio.bi_rw = WRITE|REQ_UNPLUG;
+ io->bio.bi_sector = KEY_START(&w->key);
+ io->bio.bi_bdev = io->d->bdev;
+ io->bio.bi_end_io = dirty_endio;
+
+ trace_bcache_write_dirty(&w->io->bio);
+ closure_bio_submit(&w->io->bio, cl, io->d->disk.bio_split);
+
+ continue_at(&io->cl, write_dirty_finish, dirty_wq);
+}
+
+static void read_dirty_endio(struct bio *bio, int error)
+{
+ struct dirty *w = bio->bi_private;
+
+ count_io_errors(PTR_CACHE(w->io->d->disk.c, &w->key, 0),
+ error, "reading dirty data from cache");
+
+ dirty_endio(bio, error);
+}
+
+static void read_dirty(struct cached_dev *dc)
+{
+ unsigned delay = writeback_delay(dc, 0);
+ struct dirty *w;
+ struct dirty_io *io;
+
+ /* XXX: if we error, background writeback could stall indefinitely */
+
+ while (1) {
+ w = RB_FIRST(&dc->dirty, struct dirty, node);
+
+ while (w && w->io)
+ w = RB_NEXT(w, node);
+
+ if (!w)
+ break;
+
+ BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
+
+ if (delay > 0 &&
+ (KEY_START(&w->key) != dc->last_read ||
+ jiffies_to_msecs(delay) > 50)) {
+ queue_delayed_work(dirty_wq, &dc->read_dirty, delay);
+ break;
+ }
+
+ dc->last_read = w->key.key;
+ w->io = ERR_PTR(-EINTR);
+ spin_unlock(&dc->dirty_lock);
+
+ io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
+ * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ goto err;
+
+ w->io = io;
+ w->io->d = dc;
+
+ dirty_init(w);
+ w->io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
+ w->io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
+ &w->key, 0)->bdev;
+ w->io->bio.bi_rw = READ|REQ_UNPLUG;
+ w->io->bio.bi_end_io = read_dirty_endio;
+
+ if (bio_alloc_pages(&w->io->bio, GFP_KERNEL))
+ goto err;
+
+ pr_debug("%s", pkey(&w->key));
+
+ closure_init(&w->io->cl, NULL);
+ set_closure_fn(&w->io->cl, write_dirty, dirty_wq);
+ closure_set_stopped(&w->io->cl);
+
+ trace_bcache_read_dirty(&w->io->bio);
+ closure_bio_submit_put(&w->io->bio, &w->io->cl,
+ dc->disk.bio_split);
+
+ delay = writeback_delay(dc, KEY_SIZE(&w->key));
+
+ if (atomic_inc_return(&dc->in_flight) >= 128)
+ return;
+
+ spin_lock(&dc->dirty_lock);
+ }
+
+ if (0) {
+err: spin_lock(&dc->dirty_lock);
+ if (!IS_ERR_OR_NULL(w->io))
+ kfree(w->io);
+ rb_erase(&w->node, &dc->dirty);
+ array_free(&dc->dirty_freelist, w);
+ }
+
+ if (RB_EMPTY_ROOT(&dc->dirty))
+ queue_delayed_work(dirty_wq, &dc->refill_dirty, 0);
+
+ spin_unlock(&dc->dirty_lock);
+}
+
+static void read_dirty_work(struct work_struct *work)
+{
+ struct cached_dev *dc = container_of(to_delayed_work(work),
+ struct cached_dev, read_dirty);
+
+ spin_lock(&dc->dirty_lock);
+ read_dirty(dc);
+}
+
+void bcache_writeback_init_cached_dev(struct cached_dev *d)
+{
+ INIT_DELAYED_WORK(&d->refill_dirty, refill_dirty);
+ INIT_DELAYED_WORK(&d->read_dirty, read_dirty_work);
+ init_rwsem(&d->writeback_lock);
+ array_allocator_init(&d->dirty_freelist);
+
+ d->dirty = RB_ROOT;
+ d->writeback_metadata = true;
+ d->writeback_running = true;
+ d->writeback_delay = 30;
+ d->writeback_rate = 1024;
+
+ d->writeback_rate_update_seconds = 30;
+ d->writeback_rate_d_term = 16;
+ d->writeback_rate_p_term_inverse = 64;
+ d->writeback_rate_d_smooth = 8;
+
+ INIT_DELAYED_WORK(&d->writeback_rate_update, update_writeback_rate);
+ schedule_delayed_work(&d->writeback_rate_update,
+ d->writeback_rate_update_seconds * HZ);
+}
+
+void bcache_writeback_exit(void)
+{
+ if (dirty_wq)
+ destroy_workqueue(dirty_wq);
+}
+
+int __init bcache_writeback_init(void)
+{
+ dirty_wq = create_singlethread_workqueue("bcache_writeback");
+ if (!dirty_wq)
+ return -ENOMEM;
+
+ return 0;
+}
--
1.7.9.rc2
* Re: [Bcache v13 00/16]
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (10 preceding siblings ...)
2012-05-10 3:11 ` [Bcache v13 15/16] bcache: Writeback Kent Overstreet
@ 2012-05-10 13:54 ` Vivek Goyal
2012-05-10 15:03 ` [dm-devel] " Vivek Goyal
[not found] ` <1188908028.170.1336674698865.JavaMail.mail@webmail09>
13 siblings, 0 replies; 87+ messages in thread
From: Vivek Goyal @ 2012-05-10 13:54 UTC (permalink / raw)
To: Kent Overstreet; +Cc: linux-bcache, agk, tejun, linux-kernel, dm-devel
On Wed, May 09, 2012 at 11:07:29PM -0400, Kent Overstreet wrote:
> bcache: a cache for arbitrary block devices using an SSD.
>
> Short overview:
> Bcache does both writethrough and writeback caching. It presents itself as a
> new block device, a bit like say md. You can cache an arbitrary number of
> block devices with a single cache device, and attach and detach things at
> runtime - it's quite flexible.
So it is still not a device mapper target. I somehow had the impression
that consensus at LSF was to convert it into a device mapper target.
Thanks
Vivek
* Re: [dm-devel] [Bcache v13 00/16]
2012-05-10 3:07 [Bcache v13 00/16] Kent Overstreet
` (11 preceding siblings ...)
2012-05-10 13:54 ` [Bcache v13 00/16] Vivek Goyal
@ 2012-05-10 15:03 ` Vivek Goyal
[not found] ` <20120510150353.GI23768-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
[not found] ` <1188908028.170.1336674698865.JavaMail.mail@webmail09>
13 siblings, 1 reply; 87+ messages in thread
From: Vivek Goyal @ 2012-05-10 15:03 UTC (permalink / raw)
To: Kent Overstreet; +Cc: linux-bcache, linux-kernel, dm-devel, tejun, agk
On Wed, May 09, 2012 at 11:07:29PM -0400, Kent Overstreet wrote:
[..]
> The userspace interface is going to change before it goes in. The general
> consensus at LSF was that we don't want yet another interface for
> probing/managing block devices, and dm exists so we may as well use that. I
> don't think anyone's started on that yet, though.
>
Ok, I missed the above lines and was pointed to them. So the idea of this posting
is to review the core changes, and then somebody else needs to take the core, wrap
it with dm APIs, and repost?
Thanks
Vivek
* Re: [Bcache v13 11/16] bcache: Core btree code
[not found] ` <1188908028.170.1336674698865.JavaMail.mail@webmail09>
@ 2012-05-10 18:49 ` Joe Perches
2012-05-10 21:48 ` Kent Overstreet
0 siblings, 1 reply; 87+ messages in thread
From: Joe Perches @ 2012-05-10 18:49 UTC (permalink / raw)
To: Kent Overstreet; +Cc: linux-bcache, linux-kernel, dm-devel, tejun, agk
On Wed, 2012-05-09 at 23:10 -0400, Kent Overstreet wrote:
> Signed-off-by: Kent Overstreet <koverstreet@google.com>
[]
> +
> +void btree_read_done(struct closure *cl)
> +{
[]
> + if (b->written < btree_blocks(b))
> + bset_init_next(b);
> +
> + if (0) {
> +err: set_btree_node_io_error(b);
> + cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys",
> + err, PTR_BUCKET_NR(b->c, &b->key, 0),
> + index(i, b), i->keys);
> + }
Hi Kent
trivia: This if (0) is an exceedingly ugly style.
I'd much prefer:
if (foo)
bar();
goto exit;
err:
set_btree_node_io_error(b);
cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys",
err, PTR_BUCKET_NR(b->c, &b->key, 0),
index(i, b), i->keys);
exit:
etc...
* Re: [Bcache v13 11/16] bcache: Core btree code
2012-05-10 18:49 ` [Bcache v13 11/16] bcache: Core btree code Joe Perches
@ 2012-05-10 21:48 ` Kent Overstreet
0 siblings, 0 replies; 87+ messages in thread
From: Kent Overstreet @ 2012-05-10 21:48 UTC (permalink / raw)
To: Joe Perches
Cc: linux-bcache-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA, tejun-hpIqsD4AKlfQT0dZR+AlfA,
agk-H+wXaHxf7aLQT0dZR+AlfA
I don't feel strongly one way or the other about it, but I do think
it's mostly a matter of taste - the if (0) is ugly, I'll certainly
grant you that, but IMO it makes the weird control flow harder to miss,
and the indentation more or less matches the control flow.
But I haven't come up with a way of writing it that I actually like;
I dislike them all almost equally.
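For readers following this exchange, the conventional alternative being argued for
is the out/err label layout; a standalone stand-in (purely illustrative, not a
proposed change to btree_read_done()) looks like this:

        #include <stdio.h>

        static int example_read_done(const char *err)
        {
                int ret = 0;

                if (err)
                        goto err;

                printf("normal completion\n");
        out:
                return ret;
        err:
                ret = -1;
                fprintf(stderr, "failed: %s\n", err);   /* error path out of the main flow */
                goto out;
        }

        int main(void)
        {
                example_read_done(NULL);
                return example_read_done("bad checksum") ? 1 : 0;
        }

Both shapes keep a single exit path; the difference is whether the error handling
reads top to bottom or is parked behind the unreachable if (0) block.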
On Thu, May 10, 2012 at 11:49 AM, Joe Perches <joe-6d6DIl74uiNBDgjK7y7TUQ@public.gmane.org> wrote:
> On Wed, 2012-05-09 at 23:10 -0400, Kent Overstreet wrote:
>> Signed-off-by: Kent Overstreet <koverstreet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> []
>> +
>> +void btree_read_done(struct closure *cl)
>> +{
> []
>> + if (b->written < btree_blocks(b))
>> + bset_init_next(b);
>> +
>> + if (0) {
>> +err: set_btree_node_io_error(b);
>> + cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys",
>> + err, PTR_BUCKET_NR(b->c, &b->key, 0),
>> + index(i, b), i->keys);
>> + }
>
> Hi Kent
>
> trivia: This if (0) is an exceedingly ugly style.
>
> I'd much prefer:
>
> if (foo)
> bar();
>
> goto exit;
>
> err:
> set_btree_node_io_error(b);
> cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys",
> err, PTR_BUCKET_NR(b->c, &b->key, 0),
> index(i, b), i->keys);
>
> exit:
> etc...
>
>