linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] bcache: add the deferred_flush IO processing path in the writeback mode
@ 2025-04-25  3:50 Zhou Jifeng
  2025-04-25  5:46 ` Coly Li
  0 siblings, 1 reply; 9+ messages in thread
From: Zhou Jifeng @ 2025-04-25  3:50 UTC (permalink / raw)
  To: colyli, kent.overstreet
  Cc: linux-bcache, linux-kernel, xiahua, dengwangbo, Zhou Jifeng

In some scenarios with high requirements for both data reliability and
write performance, the various cache modes of the current bcache cannot
fully match the requirements. deferred_flush aims to increase the
reliability of write-back in the writeback mode, and reduce the sending of
PREFLUSH requests to the backing device to enhance data security and dsync
write performance in writeback mode.

deferred_flush supports three selectable modes:
none: do nothing (default)
normal: sequential I/O bypasses the cache disk
force: sequential I/O cannot bypass the cache disk

Signed-off-by: Zhou Jifeng <zhoujifeng@kylinos.com.cn>
---
 drivers/md/bcache/bcache.h        |  6 ++++
 drivers/md/bcache/bcache_ondisk.h |  5 +++
 drivers/md/bcache/request.c       | 32 ++++++++++++++++--
 drivers/md/bcache/sysfs.c         | 54 +++++++++++++++++++++++++++++++
 drivers/md/bcache/writeback.c     |  7 ++++
 drivers/md/bcache/writeback.h     |  4 +++
 6 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 785b0d9008fa..d2654c449d1c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -405,6 +405,12 @@ struct cached_dev {
 	 */
 #define BCH_WBRATE_UPDATE_MAX_SKIPS	15
 	unsigned int		rate_update_retry;
+
+	/*
+	 * In the deferred flush mode, 0 indicates that there is no
+	 * need to send flush to the backing device.
+	 */
+	atomic_t		need_flush;
 };
 
 enum alloc_reserve {
diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h
index 6620a7f8fffc..822dcdc0caaf 100644
--- a/drivers/md/bcache/bcache_ondisk.h
+++ b/drivers/md/bcache/bcache_ondisk.h
@@ -294,6 +294,11 @@ BITMASK(BDEV_CACHE_MODE,		struct cache_sb, flags, 0, 4);
 #define CACHE_MODE_WRITEBACK		1U
 #define CACHE_MODE_WRITEAROUND		2U
 #define CACHE_MODE_NONE			3U
+BITMASK(BDEV_DEFERRED_FLUSH,		struct cache_sb, flags, 4, 3);
+#define DEFERRED_FLUSH_NONE		0U
+#define DEFERRED_FLUSH_NORMAL		1U
+#define DEFERRED_FLUSH_FORCE		2U
+
 BITMASK(BDEV_STATE,			struct cache_sb, flags, 61, 2);
 #define BDEV_STATE_NONE			0U
 #define BDEV_STATE_CLEAN		1U
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index af345dc6fde1..8dc17d9c5f75 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1026,16 +1026,28 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 		bio->bi_end_io = backing_request_endio;
 		closure_bio_submit(s->iop.c, bio, cl);
 
+		if (BDEV_DEFERRED_FLUSH(&dc->sb))
+			atomic_set(&dc->need_flush, 1);
+
 	} else if (s->iop.writeback) {
 		bch_writeback_add(dc);
 		s->iop.bio = bio;
 
 		if (bio->bi_opf & REQ_PREFLUSH) {
+			struct bio *flush;
+
+			/*
+			 * When DEFERRED_FLUSH is enabled, if need_flush is 0,
+			 * there is no need to send a flush to the backing device.
+			 */
+			if (BDEV_DEFERRED_FLUSH(&dc->sb) &&
+				 (!atomic_cmpxchg(&dc->need_flush, 1, 0)))
+				goto insert_data;
+
 			/*
 			 * Also need to send a flush to the backing
 			 * device.
 			 */
-			struct bio *flush;
 
 			flush = bio_alloc_bioset(bio->bi_bdev, 0,
 						 REQ_OP_WRITE | REQ_PREFLUSH,
@@ -1050,6 +1062,9 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 			closure_bio_submit(s->iop.c, flush, cl);
 		}
 	} else {
+		if (BDEV_DEFERRED_FLUSH(&dc->sb))
+			atomic_set(&dc->need_flush, 1);
+
 		s->iop.bio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
 					     &dc->disk.bio_split);
 		/* I/O request sent to backing device */
@@ -1066,14 +1081,27 @@ static CLOSURE_CALLBACK(cached_dev_nodata)
 {
 	closure_type(s, struct search, cl);
 	struct bio *bio = &s->bio.bio;
+	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
-	if (s->iop.flush_journal)
+	if (s->iop.flush_journal) {
 		bch_journal_meta(s->iop.c, cl);
 
+		/*
+		 * When deferred flush is enabled, it is necessary to determine
+		 * whether the flush request can be sent to the backing device.
+		 */
+		if (BDEV_DEFERRED_FLUSH(&dc->sb) &&
+				 (!atomic_cmpxchg(&dc->need_flush, 1, 0))) {
+			s->iop.status = BLK_STS_OK;
+			goto end;
+		}
+	}
+
 	/* If it's a flush, we send the flush to the backing device too */
 	bio->bi_end_io = backing_request_endio;
 	closure_bio_submit(s->iop.c, bio, cl);
 
+end:
 	continue_at(cl, cached_dev_bio_complete, NULL);
 }
 
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e8f696cb58c0..3f343fba2f96 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -28,6 +28,25 @@ static const char * const bch_cache_modes[] = {
 	NULL
 };
 
+/*
+ * Deferred flush: In writeback mode, reduce unnecessary PREFLUSH
+ * passed to the backend disk to speed up the performance of dsync
+ * requests.Enhance data writeback security through FUA when dirty
+ * data is written back
+ *
+ * Default is 0 ("none")
+ * none: Do nothing
+ * normal: Sequential I/O bypasses the cache disk
+ * force: Sequential I/O cannot bypass the cache disk
+ */
+static const char * const bch_deferred_flush[] = {
+	"none",
+	"normal",
+	"force",
+	NULL
+};
+
+
 static const char * const bch_reada_cache_policies[] = {
 	"all",
 	"meta-only",
@@ -151,6 +170,7 @@ rw_attribute(copy_gc_enabled);
 rw_attribute(idle_max_writeback_rate);
 rw_attribute(gc_after_writeback);
 rw_attribute(size);
+rw_attribute(deferred_flush);
 
 static ssize_t bch_snprint_string_list(char *buf,
 				       size_t size,
@@ -283,6 +303,11 @@ SHOW(__bch_cached_dev)
 		return strlen(buf);
 	}
 
+	if (attr == &sysfs_deferred_flush)
+		return bch_snprint_string_list(buf, PAGE_SIZE,
+					       bch_deferred_flush,
+					       BDEV_DEFERRED_FLUSH(&dc->sb));
+
 #undef var
 	return 0;
 }
@@ -295,6 +320,7 @@ STORE(__cached_dev)
 	ssize_t v;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
+	struct bio flush;
 
 	/* no user space access if system is rebooting */
 	if (bcache_is_reboot)
@@ -383,6 +409,12 @@ STORE(__cached_dev)
 			SET_BDEV_CACHE_MODE(&dc->sb, v);
 			bch_write_bdev_super(dc, NULL);
 		}
+
+		/* It's not the writeback mode that can't enable deferred_flush */
+		if (BDEV_DEFERRED_FLUSH(&dc->sb) && ((unsigned int) v != CACHE_MODE_WRITEBACK)) {
+			SET_BDEV_DEFERRED_FLUSH(&dc->sb, 0);
+			bch_write_bdev_super(dc, NULL);
+		}
 	}
 
 	if (attr == &sysfs_readahead_cache_policy) {
@@ -451,6 +483,27 @@ STORE(__cached_dev)
 	if (attr == &sysfs_stop)
 		bcache_device_stop(&dc->disk);
 
+	if (attr == &sysfs_deferred_flush) {
+		v = __sysfs_match_string(bch_deferred_flush, -1, buf);
+		if (v < 0)
+			return v;
+
+		if ((unsigned int) v != BDEV_DEFERRED_FLUSH(&dc->sb)) {
+			if (v && (BDEV_CACHE_MODE(&dc->sb) != CACHE_MODE_WRITEBACK)) {
+				pr_err("It's not the writeback mode that can't enable deferred_flush.\n");
+				return -EINVAL;
+			}
+
+			SET_BDEV_DEFERRED_FLUSH(&dc->sb, v);
+			bch_write_bdev_super(dc, NULL);
+			if (v) {
+				bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
+				/* I/O request sent to backing device */
+				submit_bio_wait(&flush);
+			}
+		}
+	}
+
 	return size;
 }
 
@@ -541,6 +594,7 @@ static struct attribute *bch_cached_dev_attrs[] = {
 #endif
 	&sysfs_backing_dev_name,
 	&sysfs_backing_dev_uuid,
+	&sysfs_deferred_flush,
 	NULL
 };
 ATTRIBUTE_GROUPS(bch_cached_dev);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 453efbbdc8ee..68bf655f3b96 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -435,6 +435,13 @@ static CLOSURE_CALLBACK(write_dirty)
 	if (KEY_DIRTY(&w->key)) {
 		dirty_init(w);
 		io->bio.bi_opf = REQ_OP_WRITE;
+
+		/* When DEFERRED_FLUSH is enabled, you need to ensure that
+		 * data is flushed to disk.
+		 */
+		if (BDEV_DEFERRED_FLUSH(&dc->sb))
+			io->bio.bi_opf |= REQ_FUA | REQ_SYNC | REQ_PREFLUSH;
+
 		io->bio.bi_iter.bi_sector = KEY_START(&w->key);
 		bio_set_dev(&io->bio, io->dc->bdev);
 		io->bio.bi_end_io	= dirty_endio;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 31df716951f6..0c92a607a875 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -117,6 +117,10 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    bio_sectors(bio)))
 		return true;
 
+	/* Prevent IO from bypassing the cache disk */
+	if (BDEV_DEFERRED_FLUSH(&dc->sb) == DEFERRED_FLUSH_FORCE)
+		return true;
+
 	if (would_skip)
 		return false;
 
-- 
2.18.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] bcache: add the deferred_flush IO processing path in the writeback mode
  2025-04-25  3:50 [PATCH] bcache: add the deferred_flush IO processing path in the writeback mode Zhou Jifeng
@ 2025-04-25  5:46 ` Coly Li
  2025-04-25  8:18   ` Zhou Jifeng
  2025-04-28  7:34   ` [PATCH v2] " Zhou Jifeng
  0 siblings, 2 replies; 9+ messages in thread
From: Coly Li @ 2025-04-25  5:46 UTC (permalink / raw)
  To: Zhou Jifeng
  Cc: kent.overstreet, linux-bcache, linux-kernel, xiahua, dengwangbo

Hi Jifeng,

Thanks for posting the patch.

On Fri, Apr 25, 2025 at 11:50:21AM +0800, Zhou Jifeng wrote:
> In some scenarios with high requirements for both data reliability and
> write performance, the various cache modes of the current bcache cannot

Could you provide the detailed workload or circumstance which requires both
data reliability and write performance that current bcache cannot serve?


> fully match the requirements. deferred_flush aims to increase the
> reliability of writeback write-back. And reduce the sending of PREFLUSH
> requests to the backing device to enhance data security and dsync write
> performance in wrieback mode.

I'd like to see the detailed description on how deferred flush is defined,
and how it works. And why deferred flush may provide the data reliability
and performance better than current bcache code.

And an explicit and clear benchmark for general workloads is quite helpful
for me to understand your idea and how it works better.

I don't look into the patch yet, just with my intuition the overall
performance won't be quite optimized by setting FUA on writeback I/Os.

And the cache mode can switch arbitrarily at run time; if the cache mode was
none or writethrough and is then switched to writeback, I don't see how your
patch handles such a situation.

Anyway, for a performance optimization patch, an explicit benchmark is
helpful. If the result is ideal, I'd like to reproduce it on my side.

Thanks.

Coly Li


 
> deferred_flush supports three selectable modes:
> none: do nothing (default )
> normal: sequential I/O bypasses the cache disk
> force: sequential I/O cannot bypass the cache disk
> 
> Signed-off-by: Zhou Jifeng <zhoujifeng@kylinos.com.cn>
> ---
>  drivers/md/bcache/bcache.h        |  6 ++++
>  drivers/md/bcache/bcache_ondisk.h |  5 +++
>  drivers/md/bcache/request.c       | 32 ++++++++++++++++--
>  drivers/md/bcache/sysfs.c         | 54 +++++++++++++++++++++++++++++++
>  drivers/md/bcache/writeback.c     |  7 ++++
>  drivers/md/bcache/writeback.h     |  4 +++
>  6 files changed, 106 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 785b0d9008fa..d2654c449d1c 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -405,6 +405,12 @@ struct cached_dev {
>  	 */
>  #define BCH_WBRATE_UPDATE_MAX_SKIPS	15
>  	unsigned int		rate_update_retry;
> +
> +	/*
> +	 * In the deferred flush mode, 0 indicates that there is no
> +	 * need to send flush to the backing device.
> +	 */
> +	atomic_t		need_flush;
>  };
>  
>  enum alloc_reserve {
> diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h
> index 6620a7f8fffc..822dcdc0caaf 100644
> --- a/drivers/md/bcache/bcache_ondisk.h
> +++ b/drivers/md/bcache/bcache_ondisk.h
> @@ -294,6 +294,11 @@ BITMASK(BDEV_CACHE_MODE,		struct cache_sb, flags, 0, 4);
>  #define CACHE_MODE_WRITEBACK		1U
>  #define CACHE_MODE_WRITEAROUND		2U
>  #define CACHE_MODE_NONE			3U
> +BITMASK(BDEV_DEFERRED_FLUSH,		struct cache_sb, flags, 4, 3);
> +#define DEFERRED_FLUSH_NONE		0U
> +#define DEFERRED_FLUSH_NORMAL		1U
> +#define DEFERRED_FLUSH_FORCE		2U
> +
>  BITMASK(BDEV_STATE,			struct cache_sb, flags, 61, 2);
>  #define BDEV_STATE_NONE			0U
>  #define BDEV_STATE_CLEAN		1U
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index af345dc6fde1..8dc17d9c5f75 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -1026,16 +1026,28 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
>  		bio->bi_end_io = backing_request_endio;
>  		closure_bio_submit(s->iop.c, bio, cl);
>  
> +		if (BDEV_DEFERRED_FLUSH(&dc->sb))
> +			atomic_set(&dc->need_flush, 1);
> +
>  	} else if (s->iop.writeback) {
>  		bch_writeback_add(dc);
>  		s->iop.bio = bio;
>  
>  		if (bio->bi_opf & REQ_PREFLUSH) {
> +			struct bio *flush;
> +
> +			/*
> +			 * When DEFERRED_FLUSH is enabled, if need_flush is 0,
> +			 * there is no need to send a flush to the backing device.
> +			 */
> +			if (BDEV_DEFERRED_FLUSH(&dc->sb) &&
> +				 (!atomic_cmpxchg(&dc->need_flush, 1, 0)))
> +				goto insert_data;
> +
>  			/*
>  			 * Also need to send a flush to the backing
>  			 * device.
>  			 */
> -			struct bio *flush;
>  
>  			flush = bio_alloc_bioset(bio->bi_bdev, 0,
>  						 REQ_OP_WRITE | REQ_PREFLUSH,
> @@ -1050,6 +1062,9 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
>  			closure_bio_submit(s->iop.c, flush, cl);
>  		}
>  	} else {
> +		if (BDEV_DEFERRED_FLUSH(&dc->sb))
> +			atomic_set(&dc->need_flush, 1);
> +
>  		s->iop.bio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
>  					     &dc->disk.bio_split);
>  		/* I/O request sent to backing device */
> @@ -1066,14 +1081,27 @@ static CLOSURE_CALLBACK(cached_dev_nodata)
>  {
>  	closure_type(s, struct search, cl);
>  	struct bio *bio = &s->bio.bio;
> +	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
>  
> -	if (s->iop.flush_journal)
> +	if (s->iop.flush_journal) {
>  		bch_journal_meta(s->iop.c, cl);
>  
> +		/*
> +		 * When deferred flush is enabled, it is necessary to determine
> +		 * whether the flush request can be sent to the backing device.
> +		 */
> +		if (BDEV_DEFERRED_FLUSH(&dc->sb) &&
> +				 (!atomic_cmpxchg(&dc->need_flush, 1, 0))) {
> +			s->iop.status = BLK_STS_OK;
> +			goto end;
> +		}
> +	}
> +
>  	/* If it's a flush, we send the flush to the backing device too */
>  	bio->bi_end_io = backing_request_endio;
>  	closure_bio_submit(s->iop.c, bio, cl);
>  
> +end:
>  	continue_at(cl, cached_dev_bio_complete, NULL);
>  }
>  
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index e8f696cb58c0..3f343fba2f96 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -28,6 +28,25 @@ static const char * const bch_cache_modes[] = {
>  	NULL
>  };
>  
> +/*
> + * Deferred flush: In writeback mode, reduce unnecessary PREFLUSH
> + * passed to the backend disk to speed up the performance of dsync
> + * requests.Enhance data writeback security through FUA when dirty
> + * data is written back
> + *
> + * Default is 0 ("none")
> + * none: Do nothing
> + * normal: Sequential I/O bypasses the cache disk
> + * force: Sequential I/O cannot bypass the cache disk
> + */
> +static const char * const bch_deferred_flush[] = {
> +	"none",
> +	"normal",
> +	"force",
> +	NULL
> +};
> +
> +
>  static const char * const bch_reada_cache_policies[] = {
>  	"all",
>  	"meta-only",
> @@ -151,6 +170,7 @@ rw_attribute(copy_gc_enabled);
>  rw_attribute(idle_max_writeback_rate);
>  rw_attribute(gc_after_writeback);
>  rw_attribute(size);
> +rw_attribute(deferred_flush);
>  
>  static ssize_t bch_snprint_string_list(char *buf,
>  				       size_t size,
> @@ -283,6 +303,11 @@ SHOW(__bch_cached_dev)
>  		return strlen(buf);
>  	}
>  
> +	if (attr == &sysfs_deferred_flush)
> +		return bch_snprint_string_list(buf, PAGE_SIZE,
> +					       bch_deferred_flush,
> +					       BDEV_DEFERRED_FLUSH(&dc->sb));
> +
>  #undef var
>  	return 0;
>  }
> @@ -295,6 +320,7 @@ STORE(__cached_dev)
>  	ssize_t v;
>  	struct cache_set *c;
>  	struct kobj_uevent_env *env;
> +	struct bio flush;
>  
>  	/* no user space access if system is rebooting */
>  	if (bcache_is_reboot)
> @@ -383,6 +409,12 @@ STORE(__cached_dev)
>  			SET_BDEV_CACHE_MODE(&dc->sb, v);
>  			bch_write_bdev_super(dc, NULL);
>  		}
> +
> +		/* It's not the writeback mode that can't enable deferred_flush */
> +		if (BDEV_DEFERRED_FLUSH(&dc->sb) && ((unsigned int) v != CACHE_MODE_WRITEBACK)) {
> +			SET_BDEV_DEFERRED_FLUSH(&dc->sb, 0);
> +			bch_write_bdev_super(dc, NULL);
> +		}
>  	}
>  
>  	if (attr == &sysfs_readahead_cache_policy) {
> @@ -451,6 +483,27 @@ STORE(__cached_dev)
>  	if (attr == &sysfs_stop)
>  		bcache_device_stop(&dc->disk);
>  
> +	if (attr == &sysfs_deferred_flush) {
> +		v = __sysfs_match_string(bch_deferred_flush, -1, buf);
> +		if (v < 0)
> +			return v;
> +
> +		if ((unsigned int) v != BDEV_DEFERRED_FLUSH(&dc->sb)) {
> +			if (v && (BDEV_CACHE_MODE(&dc->sb) != CACHE_MODE_WRITEBACK)) {
> +				pr_err("It's not the writeback mode that can't enable deferred_flush.\n");
> +				return -EINVAL;
> +			}
> +
> +			SET_BDEV_DEFERRED_FLUSH(&dc->sb, v);
> +			bch_write_bdev_super(dc, NULL);
> +			if (v) {
> +				bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
> +				/* I/O request sent to backing device */
> +				submit_bio_wait(&flush);
> +			}
> +		}
> +	}
> +
>  	return size;
>  }
>  
> @@ -541,6 +594,7 @@ static struct attribute *bch_cached_dev_attrs[] = {
>  #endif
>  	&sysfs_backing_dev_name,
>  	&sysfs_backing_dev_uuid,
> +	&sysfs_deferred_flush,
>  	NULL
>  };
>  ATTRIBUTE_GROUPS(bch_cached_dev);
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 453efbbdc8ee..68bf655f3b96 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -435,6 +435,13 @@ static CLOSURE_CALLBACK(write_dirty)
>  	if (KEY_DIRTY(&w->key)) {
>  		dirty_init(w);
>  		io->bio.bi_opf = REQ_OP_WRITE;
> +
> +		/* When DEFERRED_FLUSH is enabled, you need to ensure that
> +		 * data is flushed to disk.
> +		 */
> +		if (BDEV_DEFERRED_FLUSH(&dc->sb))
> +			io->bio.bi_opf |= REQ_FUA | REQ_SYNC | REQ_PREFLUSH;
> +
>  		io->bio.bi_iter.bi_sector = KEY_START(&w->key);
>  		bio_set_dev(&io->bio, io->dc->bdev);
>  		io->bio.bi_end_io	= dirty_endio;
> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> index 31df716951f6..0c92a607a875 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -117,6 +117,10 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
>  				    bio_sectors(bio)))
>  		return true;
>  
> +	/* Prevent IO from bypassing the cache disk */
> +	if (BDEV_DEFERRED_FLUSH(&dc->sb) == DEFERRED_FLUSH_FORCE)
> +		return true;
> +
>  	if (would_skip)
>  		return false;
>  
> -- 
> 2.18.1
> 
> 

-- 
Coly Li

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] bcache: add the deferred_flush IO processing path in the writeback mode
  2025-04-25  5:46 ` Coly Li
@ 2025-04-25  8:18   ` Zhou Jifeng
  2025-04-27  6:47     ` Coly Li
  2025-04-28  7:34   ` [PATCH v2] " Zhou Jifeng
  1 sibling, 1 reply; 9+ messages in thread
From: Zhou Jifeng @ 2025-04-25  8:18 UTC (permalink / raw)
  To: Coly Li
  Cc: kent.overstreet, linux-bcache, linux-kernel, 夏华,
	邓旺波

Hi Coly Li,
Thank you for your reply and your question.

On Fri, 25 Apr 2025 at 13:46, Coly Li <colyli@kernel.org> wrote:
>
> Hi Jifeng,
>
> Thanks for posting the patch.
>
> On Fri, Apr 25, 2025 at 11:50:21AM +0800, Zhou Jifeng wrote:
> > In some scenarios with high requirements for both data reliability and
> > write performance, the various cache modes of the current bcache cannot
>
> Could you provide the detail workload or circumstance which requires both
> data reliability and write performance that current bcache cannot serve?

For example, in some database application scenarios, the requirements for data
security are relatively high. When writing frequently, flush is called more often,
and the performance of write dsync is of great concern. The operational performance
of several cache modes of bcache in such scenarios at present:
none: The cache does not work and is of no help to performance. The performance is
the same as that of the backing device and cannot meet the performance requirements.
writeround and writethrough: They are not helpful for write performance. The write
performance is the same as that of the backing device and cannot meet the write
performance requirements.
writeback: Since when it writes back dirty data, it only marks bio as REQ_OP_WRITE,
there is a risk of data loss due to power failure. In addition, since it needs to send a
flush request to the backing device when handling requests with the flush mark, it will
affect the write performance.

> > fully match the requirements. deferred_flush aims to increase the
> > reliability of writeback write-back. And reduce the sending of PREFLUSH
> > requests to the backing device to enhance data security and dsync write
> > performance in wrieback mode.
>
> I'd like to see the detailed description on how deferred flush is defined,
> and how it works. And why deferred flush may provide the data reliability
> and performance better than current bcache code.

deferred flush: When data is processed through the writeback path, it will determine
whether a PREFLUSH needs to be sent to the backing device. The judgment criterion
is whether a write request has been sent through bypass or writethrough before. If not,
it is not necessary. Put the PREFLUSH semantics into the dirty data write-back stage
to ensure the reliability of the dirty data write-back. Here, by reducing the sending of
PRELUSH to the backing device, the delay for the backing device to process PRELUSH
is decreased, thereby improving the performance of dsync write requests when the
cache space is abundant. During the dirty data write-back stage, the FUA method is
adopted to ensure that the dirty data will not be lost due to factors such as power failure.

> I don't look into the patch yet, just with my intuition the overall
> performance won't be quite optimized by setting FUA on writeback I/Os.

Using the FUA method to write back dirty data does indeed have an impact on the speed
of writing back dirty data. In a test where I/O is written randomly at 4K, the speed of
writing back dirty data is approximately half that of the non-FUA method. However,
considering that the data is not written at a high intensity continuously, this provides some
buffer time for writing back dirty data. In extreme cases, when the effective space of the
cache is tight, its write efficiency is not lower than the performance of the backing device.
Therefore, enabling deferred_flush is effective in low-cost deployment solutions that require
the use of SSD to accelerate the performance of dsync.

> And the cache mode can swtich arbitarily in run time, if cache mode was none
> or writethough, then switch to writeback, I don't see your patch handles
> such situation.

When switching from other cache modes to writeback and simultaneously enabling 
deferred_flush, a REQ_PREFLUSH request will be sent to the backing device.
Code location in the patch:
+   if (attr == &sysfs_deferred_flush) {
+       v = __sysfs_match_string(bch_deferred_flush, -1, buf);
+       if (v < 0)
+           return v;
+
+       if ((unsigned int) v != BDEV_DEFERRED_FLUSH(&dc->sb)) {
+           if (v && (BDEV_CACHE_MODE(&dc->sb) != CACHE_MODE_WRITEBACK)) {
+               pr_err("It's not the writeback mode that can't enable deferred_flush.\n");
+               return -EINVAL;
+           }
+
+           SET_BDEV_DEFERRED_FLUSH(&dc->sb, v);
+           bch_write_bdev_super(dc, NULL);
+           if (v) {
+               bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
+               /* I/O request sent to backing device */
+               submit_bio_wait(&flush);
+           }
+       }
+   }

Thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] bcache: add the deferred_flush IO processing path in the writeback mode
  2025-04-25  8:18   ` Zhou Jifeng
@ 2025-04-27  6:47     ` Coly Li
  0 siblings, 0 replies; 9+ messages in thread
From: Coly Li @ 2025-04-27  6:47 UTC (permalink / raw)
  To: Zhou Jifeng
  Cc: Coly Li, kent.overstreet, linux-bcache, linux-kernel,
	夏华, 邓旺波



> 2025年4月25日 16:18,Zhou Jifeng <zhoujifeng@kylinos.com.cn> 写道:
> 
> Hi Coly Li,
> Thank you for your reply and your question.
> 
> On Fri, 25 Apr 2025 at 13:46, Coly Li <colyli@kernel.org> wrote:
>> 
>> Hi Jifeng,
>> 
>> Thanks for posting the patch.
>> 
>> On Fri, Apr 25, 2025 at 11:50:21AM +0800, Zhou Jifeng wrote:
>>> In some scenarios with high requirements for both data reliability and
>>> write performance, the various cache modes of the current bcache cannot
>> 
>> Could you provide the detail workload or circumstance which requires both
>> data reliability and write performance that current bcache cannot serve?
> 
> For example, in some database application scenarios, the requirements for data
> security are relatively high. When writing frequently, flush is called more often,
> and the performance of write dsync is of great concern. The operational performance
> of several cache modes of bcache in such scenarios at present:
> none: The cache does not work and is of no help to performance. The performance is
> the same as that of the backing device and cannot meet the performance requirements.
> writeround and writethrough: They are not helpful for write performance. The write
> performance is the same as that of the backing device and cannot meet the write
> performance requirements.
> writeback: Since when it writes back dirty data, it only marks bio as REQ_OP_WRITE,
> there is a risk of data loss due to power failure. In addition, since it needs to send a
> flush request to the backing device when handling requests with the flush mark, it will
> affect the write performance.
> 
>>> fully match the requirements. deferred_flush aims to increase the
>>> reliability of writeback write-back. And reduce the sending of PREFLUSH
>>> requests to the backing device to enhance data security and dsync write
>>> performance in wrieback mode.
>> 
>> I'd like to see the detailed description on how deferred flush is defined,
>> and how it works. And why deferred flush may provide the data reliability
>> and performance better than current bcache code.
> 
> deferred flush: When data is processed through the writeback path, it will determine
> whether a PREFLUSH needs to be sent to the backing device. The judgment criterion
> is whether a write request has been sent through bypass or writethrough before. If not,
> it is not necessary. Put the PREFLUSH semantics into the dirty data write-back stage
> to ensure the reliability of the dirty data write-back. Here, by reducing the sending of
> PRELUSH to the backing device, the delay for the backing device to process PRELUSH
> is decreased, thereby improving the performance of dsync write requests when the
> cache space is abundant. During the dirty data write-back stage, the FUA method is
> adopted to ensure that the dirty data will not be lost due to factors such as power failure.
> 
>> I don't look into the patch yet, just with my intuition the overall
>> performance won't be quite optimized by setting FUA on writeback I/Os.
> 
> Using the FUA method to write back dirty data does indeed have an impact on the speed
> of writing back dirty data. In a test where I/O is written randomly at 4K, the speed of
> writing back dirty data is approximately half that of the non-FUA method. However,
> considering that the data is not written at a high intensity continuously, this provides some
> buffer time for writing back dirty data. In extreme cases, when the effective space of the
> cache is tight, its write efficiency is not lower than the performance of the backing device.
> Therefore, enabling deferred_flush is effective in low-cost deployment solutions that require
> the use of SSD to accelerate the performance of dsync.

I am not sure whether the situation you stated is acceptable or not for most of users.

I hope to see more testing data.

> 
>> And the cache mode can swtich arbitarily in run time, if cache mode was none
>> or writethough, then switch to writeback, I don't see your patch handles
>> such situation.
> 
> When switching from other cache modes to writeback and simultaneously enabling 
> deferred_flush, a REQ_PREFLUSH request will be sent to the backing device.
> Code location in the patch:
> +   if (attr == &sysfs_deferred_flush) {
> +       v = __sysfs_match_string(bch_deferred_flush, -1, buf);
> +       if (v < 0)
> +           return v;
> +
> +       if ((unsigned int) v != BDEV_DEFERRED_FLUSH(&dc->sb)) {
> +           if (v && (BDEV_CACHE_MODE(&dc->sb) != CACHE_MODE_WRITEBACK)) {
> +               pr_err("It's not the writeback mode that can't enable deferred_flush.\n");
> +               return -EINVAL;
> +           }
> +
> +           SET_BDEV_DEFERRED_FLUSH(&dc->sb, v);
> +           bch_write_bdev_super(dc, NULL);
> +           if (v) {
> +               bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
> +               /* I/O request sent to backing device */
> +               submit_bio_wait(&flush);
> +           }
> +       }
> +   }

And when read/write congestion happen, part of read/write requests will directly from/to hard drive and bypass cache device.

Anyway, a long-time, high-load testing result is necessary. I assume this patch will decrease general write-back throughput and result in higher pressure for the cache device garbage collection load triggered by the allocator.
Maybe I am wrong, I’d like to learn from your benchmark results.

Thanks.

Coly Li

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v2] bcache: add the deferred_flush IO processing path in the writeback mode
  2025-04-25  5:46 ` Coly Li
  2025-04-25  8:18   ` Zhou Jifeng
@ 2025-04-28  7:34   ` Zhou Jifeng
  2025-05-26  6:41     ` Zhou Jifeng
  1 sibling, 1 reply; 9+ messages in thread
From: Zhou Jifeng @ 2025-04-28  7:34 UTC (permalink / raw)
  To: colyli
  Cc: dengwangbo, kent.overstreet, linux-bcache, linux-kernel, xiahua,
	zhoujifeng

In some scenarios with high requirements for both data reliability and
write performance, the various cache modes of the current bcache cannot
fully match the requirements. deferred_flush aims to increase the
reliability of write-back in the writeback mode, and reduce the sending of
PREFLUSH requests to the backing device to enhance data security and dsync
write performance in writeback mode.

When cache_mode is switched to a non-writeback mode, deferred_flush
will be automatically turned off. When other modes are switched to
writeback+deferred_flush, a PREFLUSH request will be sent to the backing
device. This makes sure that the previously submitted data is not lost.

deferred_flush supports three selectable modes:
none: do nothing (default)
normal: sequential I/O bypasses the cache disk
force: sequential I/O cannot bypass the cache disk

Signed-off-by: Zhou Jifeng <zhoujifeng@kylinos.com.cn>
---

v1->v2: Version v2 mainly addresses the issue of low efficiency in 
writing back dirty data in version v1. When writing back dirty data, 
it no longer uses the FUA method but instead writes back no more than 
500 dirty bkeys and then uniformly sends a PREFLUSH instruction once.
I will supplement more test data later.

 drivers/md/bcache/bcache.h        | 20 +++++++
 drivers/md/bcache/bcache_ondisk.h |  5 ++
 drivers/md/bcache/request.c       | 32 +++++++++-
 drivers/md/bcache/sysfs.c         | 54 +++++++++++++++++
 drivers/md/bcache/writeback.c     | 98 +++++++++++++++++++++++++++----
 drivers/md/bcache/writeback.h     |  4 ++
 6 files changed, 199 insertions(+), 14 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 785b0d9008fa..75110fbe6656 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -247,6 +247,14 @@ struct keybuf {
 	DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
 };
 
+struct keybuf_preflush {
+	spinlock_t	lock;
+#define KEYBUF_NR		500
+	struct keybuf_key data[KEYBUF_NR];
+	unsigned int nr_keys;
+};
+
+
 struct bcache_device {
 	struct closure		cl;
 
@@ -346,6 +354,12 @@ struct cached_dev {
 
 	struct keybuf		writeback_keys;
 
+	/*
+	 * Before performing preflush to the backing device, temporarily
+	 * store the bkey waiting to clean up the dirty mark
+	 */
+	struct keybuf_preflush  preflush_keys;
+
 	struct task_struct	*status_update_thread;
 	/*
 	 * Order the write-half of writeback operations strongly in dispatch
@@ -405,6 +419,12 @@ struct cached_dev {
 	 */
 #define BCH_WBRATE_UPDATE_MAX_SKIPS	15
 	unsigned int		rate_update_retry;
+
+	/*
+	 * In the deferred flush mode, 0 indicates that there is no
+	 * need to send flush to the backing device.
+	 */
+	atomic_t		need_flush;
 };
 
 enum alloc_reserve {
diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h
index 6620a7f8fffc..822dcdc0caaf 100644
--- a/drivers/md/bcache/bcache_ondisk.h
+++ b/drivers/md/bcache/bcache_ondisk.h
@@ -294,6 +294,11 @@ BITMASK(BDEV_CACHE_MODE,		struct cache_sb, flags, 0, 4);
 #define CACHE_MODE_WRITEBACK		1U
 #define CACHE_MODE_WRITEAROUND		2U
 #define CACHE_MODE_NONE			3U
+BITMASK(BDEV_DEFERRED_FLUSH,		struct cache_sb, flags, 4, 3);
+#define DEFERRED_FLUSH_NONE		0U
+#define DEFERRED_FLUSH_NORMAL		1U
+#define DEFERRED_FLUSH_FORCE		2U
+
 BITMASK(BDEV_STATE,			struct cache_sb, flags, 61, 2);
 #define BDEV_STATE_NONE			0U
 #define BDEV_STATE_CLEAN		1U
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index af345dc6fde1..8dc17d9c5f75 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1026,16 +1026,28 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 		bio->bi_end_io = backing_request_endio;
 		closure_bio_submit(s->iop.c, bio, cl);
 
+		if (BDEV_DEFERRED_FLUSH(&dc->sb))
+			atomic_set(&dc->need_flush, 1);
+
 	} else if (s->iop.writeback) {
 		bch_writeback_add(dc);
 		s->iop.bio = bio;
 
 		if (bio->bi_opf & REQ_PREFLUSH) {
+			struct bio *flush;
+
+			/*
+			 * When DEFERRED_FLUSH is enabled, if need_flush is 0,
+			 * there is no need to send a flush to the backing device.
+			 */
+			if (BDEV_DEFERRED_FLUSH(&dc->sb) &&
+				 (!atomic_cmpxchg(&dc->need_flush, 1, 0)))
+				goto insert_data;
+
 			/*
 			 * Also need to send a flush to the backing
 			 * device.
 			 */
-			struct bio *flush;
 
 			flush = bio_alloc_bioset(bio->bi_bdev, 0,
 						 REQ_OP_WRITE | REQ_PREFLUSH,
@@ -1050,6 +1062,9 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 			closure_bio_submit(s->iop.c, flush, cl);
 		}
 	} else {
+		if (BDEV_DEFERRED_FLUSH(&dc->sb))
+			atomic_set(&dc->need_flush, 1);
+
 		s->iop.bio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
 					     &dc->disk.bio_split);
 		/* I/O request sent to backing device */
@@ -1066,14 +1081,27 @@ static CLOSURE_CALLBACK(cached_dev_nodata)
 {
 	closure_type(s, struct search, cl);
 	struct bio *bio = &s->bio.bio;
+	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
-	if (s->iop.flush_journal)
+	if (s->iop.flush_journal) {
 		bch_journal_meta(s->iop.c, cl);
 
+		/*
+		 * When deferred flush is enabled, it is necessary to determine
+		 * whether the flush request can be sent to the backing device.
+		 */
+		if (BDEV_DEFERRED_FLUSH(&dc->sb) &&
+				 (!atomic_cmpxchg(&dc->need_flush, 1, 0))) {
+			s->iop.status = BLK_STS_OK;
+			goto end;
+		}
+	}
+
 	/* If it's a flush, we send the flush to the backing device too */
 	bio->bi_end_io = backing_request_endio;
 	closure_bio_submit(s->iop.c, bio, cl);
 
+end:
 	continue_at(cl, cached_dev_bio_complete, NULL);
 }
 
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e8f696cb58c0..3f343fba2f96 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -28,6 +28,25 @@ static const char * const bch_cache_modes[] = {
 	NULL
 };
 
+/*
+ * Deferred flush: In writeback mode, reduce unnecessary PREFLUSH
+ * passed to the backend disk to speed up the performance of dsync
+ * requests.Enhance data writeback security through FUA when dirty
+ * data is written back
+ *
+ * Default is 0 ("none")
+ * none: Do nothing
+ * normal: Sequential I/O bypasses the cache disk
+ * force: Sequential I/O cannot bypass the cache disk
+ */
+static const char * const bch_deferred_flush[] = {
+	"none",
+	"normal",
+	"force",
+	NULL
+};
+
+
 static const char * const bch_reada_cache_policies[] = {
 	"all",
 	"meta-only",
@@ -151,6 +170,7 @@ rw_attribute(copy_gc_enabled);
 rw_attribute(idle_max_writeback_rate);
 rw_attribute(gc_after_writeback);
 rw_attribute(size);
+rw_attribute(deferred_flush);
 
 static ssize_t bch_snprint_string_list(char *buf,
 				       size_t size,
@@ -283,6 +303,11 @@ SHOW(__bch_cached_dev)
 		return strlen(buf);
 	}
 
+	if (attr == &sysfs_deferred_flush)
+		return bch_snprint_string_list(buf, PAGE_SIZE,
+					       bch_deferred_flush,
+					       BDEV_DEFERRED_FLUSH(&dc->sb));
+
 #undef var
 	return 0;
 }
@@ -295,6 +320,7 @@ STORE(__cached_dev)
 	ssize_t v;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
+	struct bio flush;
 
 	/* no user space access if system is rebooting */
 	if (bcache_is_reboot)
@@ -383,6 +409,12 @@ STORE(__cached_dev)
 			SET_BDEV_CACHE_MODE(&dc->sb, v);
 			bch_write_bdev_super(dc, NULL);
 		}
+
+		/* It's not the writeback mode that can't enable deferred_flush */
+		if (BDEV_DEFERRED_FLUSH(&dc->sb) && ((unsigned int) v != CACHE_MODE_WRITEBACK)) {
+			SET_BDEV_DEFERRED_FLUSH(&dc->sb, 0);
+			bch_write_bdev_super(dc, NULL);
+		}
 	}
 
 	if (attr == &sysfs_readahead_cache_policy) {
@@ -451,6 +483,27 @@ STORE(__cached_dev)
 	if (attr == &sysfs_stop)
 		bcache_device_stop(&dc->disk);
 
+	if (attr == &sysfs_deferred_flush) {
+		v = __sysfs_match_string(bch_deferred_flush, -1, buf);
+		if (v < 0)
+			return v;
+
+		if ((unsigned int) v != BDEV_DEFERRED_FLUSH(&dc->sb)) {
+			if (v && (BDEV_CACHE_MODE(&dc->sb) != CACHE_MODE_WRITEBACK)) {
+				pr_err("It's not the writeback mode that can't enable deferred_flush.\n");
+				return -EINVAL;
+			}
+
+			SET_BDEV_DEFERRED_FLUSH(&dc->sb, v);
+			bch_write_bdev_super(dc, NULL);
+			if (v) {
+				bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
+				/* I/O request sent to backing device */
+				submit_bio_wait(&flush);
+			}
+		}
+	}
+
 	return size;
 }
 
@@ -541,6 +594,7 @@ static struct attribute *bch_cached_dev_attrs[] = {
 #endif
 	&sysfs_backing_dev_name,
 	&sysfs_backing_dev_uuid,
+	&sysfs_deferred_flush,
 	NULL
 };
 ATTRIBUTE_GROUPS(bch_cached_dev);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 453efbbdc8ee..ce31d1535d90 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -362,23 +362,31 @@ static CLOSURE_CALLBACK(write_dirty_finish)
 		unsigned int i;
 		struct keylist keys;
 
-		bch_keylist_init(&keys);
+		if (!BDEV_DEFERRED_FLUSH(&dc->sb)) {
+			bch_keylist_init(&keys);
 
-		bkey_copy(keys.top, &w->key);
-		SET_KEY_DIRTY(keys.top, false);
-		bch_keylist_push(&keys);
+			bkey_copy(keys.top, &w->key);
+			SET_KEY_DIRTY(keys.top, false);
+			bch_keylist_push(&keys);
 
-		for (i = 0; i < KEY_PTRS(&w->key); i++)
-			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
+			for (i = 0; i < KEY_PTRS(&w->key); i++)
+				atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 
-		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
+			ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
 
-		if (ret)
-			trace_bcache_writeback_collision(&w->key);
+			if (ret)
+				trace_bcache_writeback_collision(&w->key);
 
-		atomic_long_inc(ret
-				? &dc->disk.c->writeback_keys_failed
-				: &dc->disk.c->writeback_keys_done);
+			atomic_long_inc(ret
+					? &dc->disk.c->writeback_keys_failed
+					: &dc->disk.c->writeback_keys_done);
+		} else {
+			/* After flushing the backing device, update the btree */
+			spin_lock(&dc->preflush_keys.lock);
+			dc->preflush_keys.data[dc->preflush_keys.nr_keys] = *w;
+			dc->preflush_keys.nr_keys++;
+			spin_unlock(&dc->preflush_keys.lock);
+		}
 	}
 
 	bch_keybuf_del(&dc->writeback_keys, w);
@@ -435,6 +443,7 @@ static CLOSURE_CALLBACK(write_dirty)
 	if (KEY_DIRTY(&w->key)) {
 		dirty_init(w);
 		io->bio.bi_opf = REQ_OP_WRITE;
+
 		io->bio.bi_iter.bi_sector = KEY_START(&w->key);
 		bio_set_dev(&io->bio, io->dc->bdev);
 		io->bio.bi_end_io	= dirty_endio;
@@ -471,6 +480,66 @@ static CLOSURE_CALLBACK(read_dirty_submit)
 	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
 }
 
+static void flush_backing_device(struct cached_dev *dc)
+{
+	int ret;
+	unsigned int i;
+	struct bio flush;
+	struct keybuf_key *p;
+
+	if (dc->preflush_keys.nr_keys == 0)
+		return;
+
+	bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
+	ret = submit_bio_wait(&flush);
+	if (ret) {
+		pr_warn("flush backing device error, ret=%d!\n", ret);
+		dc->preflush_keys.nr_keys = 0;
+		/*
+		 * Due to the flush failure, the dirty marked bkey will wait for
+		 * the next rescheduling to be written back
+		 */
+		return;
+	}
+
+	/*
+	 * The dirty data was successfully written back and confirmed to be written
+	 * to the disk. The status of the bkey in the btree was updated.
+	 */
+	for (i = 0; i < dc->preflush_keys.nr_keys; i++) {
+		int ret;
+		unsigned int j;
+		struct keylist keys;
+
+		bch_keylist_init(&keys);
+
+		p = &dc->preflush_keys.data[i];
+		bkey_copy(keys.top, &p->key);
+		SET_KEY_DIRTY(keys.top, false);
+		bch_keylist_push(&keys);
+
+		for (j = 0; j < KEY_PTRS(&p->key); j++)
+			atomic_inc(&PTR_BUCKET(dc->disk.c, &p->key, j)->pin);
+
+		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &p->key);
+
+		if (ret)
+			trace_bcache_writeback_collision(&p->key);
+
+		atomic_long_inc(ret
+				? &dc->disk.c->writeback_keys_failed
+				: &dc->disk.c->writeback_keys_done);
+
+		/* For those bkeys that failed to be inserted, you can
+		 * ignore them and they will be processed again in the
+		 * next write-back scan.
+		 */
+	}
+
+	dc->preflush_keys.nr_keys = 0;
+
+}
+
 static void read_dirty(struct cached_dev *dc)
 {
 	unsigned int delay = 0;
@@ -819,6 +888,8 @@ static int bch_writeback_thread(void *arg)
 
 		read_dirty(dc);
 
+		flush_backing_device(dc);
+
 		if (searched_full_index) {
 			unsigned int delay = dc->writeback_delay * HZ;
 
@@ -1072,6 +1143,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	/* For dc->writeback_lock contention in update_writeback_rate() */
 	dc->rate_update_retry = 0;
 
+	spin_lock_init(&dc->preflush_keys.lock);
+	dc->preflush_keys.nr_keys = 0;
+
 	WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 31df716951f6..0c92a607a875 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -117,6 +117,10 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    bio_sectors(bio)))
 		return true;
 
+	/* Prevent IO from bypassing the cache disk */
+	if (BDEV_DEFERRED_FLUSH(&dc->sb) == DEFERRED_FLUSH_FORCE)
+		return true;
+
 	if (would_skip)
 		return false;
 
-- 
2.18.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re:[PATCH v2] bcache: add the deferred_flush IO processing path in the writeback mode
  2025-04-28  7:34   ` [PATCH v2] " Zhou Jifeng
@ 2025-05-26  6:41     ` Zhou Jifeng
  2025-05-26  7:31       ` [PATCH " Coly Li
  0 siblings, 1 reply; 9+ messages in thread
From: Zhou Jifeng @ 2025-05-26  6:41 UTC (permalink / raw)
  To: 周继峰, Coly Li
  Cc: 邓旺波, kent.overstreet, linux-bcache,
	linux-kernel, 夏华

On Mon, 28 Apr 2025 at 15:36, Zhou Jifeng <zhoujifeng@kylinos.com.cn> wrote:
.....
> v1->v2: Version v2 mainly addresses the issue of low efficiency in
> writing back dirty data in version v1. When writing back dirty data,
> it no longer uses the FUA method but instead writes back no more than
> 500 dirty bkeys and then uniformly sends a PREFLUSH instruction once.
> I will supplement more test data later.
.....

Comparison test data:
https://github.com/jifengzhou/file/blob/main/bcache-deferred-patch-correlation-data.pdf

Best regards,

Zhou Jifeng

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] bcache: add the deferred_flush IO processing path in the writeback mode
  2025-05-26  6:41     ` Zhou Jifeng
@ 2025-05-26  7:31       ` Coly Li
  2025-05-26  9:42         ` Zhou Jifeng
  0 siblings, 1 reply; 9+ messages in thread
From: Coly Li @ 2025-05-26  7:31 UTC (permalink / raw)
  To: Zhou Jifeng
  Cc: Coly Li, 邓旺波, kent.overstreet, linux-bcache,
	linux-kernel, 夏华

Hi Jifeng,

> 2025年5月26日 14:41,Zhou Jifeng <zhoujifeng@kylinos.com.cn> 写道:
> 
> On Mon, 28 Apr 2025 at 15:36, Zhou Jifeng <zhoujifeng@kylinos.com.cn> wrote:
> .....
>> v1->v2: Version v2 mainly addresses the issue of low efficiency in
>> writing back dirty data in version v1. When writing back dirty data,
>> it no longer uses the FUA method but instead writes back no more than
>> 500 dirty bkeys and then uniformly sends a PREFLUSH instruction once.
>> I will supplement more test data later.
> .....
> 
> Comparison test data::
> https://github.com/jifengzhou/file/blob/main/bcache-deferred-patch-correlation-data.pdf
> 

Can I access the raw data to have a look? 

And the three different testings, which parameters of bcache are modified from default?

Thank.


Coly Li

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] bcache: add the deferred_flush IO processing path in the writeback mode
  2025-05-26  7:31       ` [PATCH " Coly Li
@ 2025-05-26  9:42         ` Zhou Jifeng
  2025-05-26  9:44           ` Coly Li
  0 siblings, 1 reply; 9+ messages in thread
From: Zhou Jifeng @ 2025-05-26  9:42 UTC (permalink / raw)
  To: Coly Li
  Cc: Coly Li, 邓旺波, kent.overstreet, linux-bcache,
	linux-kernel, 夏华

On Mon, 26 May 2025 at 15:42, Coly Li <i@coly.li> wrote:
>
> Hi Jifeng,
>
> > 2025年5月26日 14:41,Zhou Jifeng <zhoujifeng@kylinos.com.cn> 写道:
> >
> > On Mon, 28 Apr 2025 at 15:36, Zhou Jifeng <zhoujifeng@kylinos.com.cn> wrote:
> > .....
> >> v1->v2: Version v2 mainly addresses the issue of low efficiency in
> >> writing back dirty data in version v1. When writing back dirty data,
> >> it no longer uses the FUA method but instead writes back no more than
> >> 500 dirty bkeys and then uniformly sends a PREFLUSH instruction once.
> >> I will supplement more test data later.
> > .....
> >
> > Comparison test data::
> > https://github.com/jifengzhou/file/blob/main/bcache-deferred-patch-correlation-data.pdf
> >
>
> Can I access the raw data to have a look?
>
> And the three different testings, which parameters of bcache are modified from default?
>
> Thank.
>
>
> Coly Li

Test raw data address:
https://github.com/jifengzhou/file/tree/main/deferred%20test%20data

I have not learned the default values of the parameters in the configuration file. Generally, they are
configured according to the required characteristics, such as random IO, sequential IO, block size, etc.
The only difference between the first and second test parameters is openflags. The difference between
the third test and the first two groups is openflags, xfersize, and seekpct. The xfersize parameter is
used to control the access block size, and the seekpct parameter is used to control the random ratio: 0
represents completely sequential and 100 represents 100% random.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] bcache: add the deferred_flush IO processing path in the writeback mode
  2025-05-26  9:42         ` Zhou Jifeng
@ 2025-05-26  9:44           ` Coly Li
  0 siblings, 0 replies; 9+ messages in thread
From: Coly Li @ 2025-05-26  9:44 UTC (permalink / raw)
  To: Zhou Jifeng
  Cc: Coly Li, 邓旺波, kent.overstreet, linux-bcache,
	linux-kernel, 夏华



> 2025年5月26日 17:42,Zhou Jifeng <zhoujifeng@kylinos.com.cn> 写道:
> 
> On Mon, 26 May 2025 at 15:42, Coly Li <i@coly.li> wrote:
>> 
>> Hi Jifeng,
>> 
>>> 2025年5月26日 14:41,Zhou Jifeng <zhoujifeng@kylinos.com.cn> 写道:
>>> 
>>> On Mon, 28 Apr 2025 at 15:36, Zhou Jifeng <zhoujifeng@kylinos.com.cn> wrote:
>>> .....
>>>> v1->v2: Version v2 mainly addresses the issue of low efficiency in
>>>> writing back dirty data in version v1. When writing back dirty data,
>>>> it no longer uses the FUA method but instead writes back no more than
>>>> 500 dirty bkeys and then uniformly sends a PREFLUSH instruction once.
>>>> I will supplement more test data later.
>>> .....
>>> 
>>> Comparison test data::
>>> https://github.com/jifengzhou/file/blob/main/bcache-deferred-patch-correlation-data.pdf
>>> 
>> 
>> Can I access the raw data to have a look?
>> 
>> And the three different testings, which parameters of bcache are modified from default?
>> 
>> Thank.
>> 
>> 
>> Coly Li
> 
> Test raw data address:
> https://github.com/jifengzhou/file/tree/main/deferred%20test%20data
> 
> I have not learned the default values ​​of the parameters in the configuration file. Generally, they are 
> configured according to the required characteristics, such as random IO, sequential IO, block size, etc.
> The only difference between the first and second test parameters is openflags. The difference between 
> the third test and the first two groups is openflags, xfersize, and seekpct. The xfersize parameter is 
> used to control the access block size, and the seekpct parameter is used to control the random ratio. 0 
> represents complete order and 100 represents 100% random.

Copied. Because the data lines in the charts are too close together, I need to read them more closely.

Thanks.

Coly Li

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2025-05-26  9:50 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-25  3:50 [PATCH] bcache: add the deferred_flush IO processing path in the writeback mode Zhou Jifeng
2025-04-25  5:46 ` Coly Li
2025-04-25  8:18   ` Zhou Jifeng
2025-04-27  6:47     ` Coly Li
2025-04-28  7:34   ` [PATCH v2] " Zhou Jifeng
2025-05-26  6:41     ` Zhou Jifeng
2025-05-26  7:31       ` [PATCH " Coly Li
2025-05-26  9:42         ` Zhou Jifeng
2025-05-26  9:44           ` Coly Li

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).