* [for-416 PATCH 1/3] bcache: writeback: collapse contiguous IO better
@ 2017-12-28 0:47 Michael Lyle
2017-12-28 0:47 ` [for-416 PATCH 2/3] bcache: writeback: properly order backing device IO Michael Lyle
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Michael Lyle @ 2017-12-28 0:47 UTC (permalink / raw)
To: linux-bcache, linux-block; +Cc: Michael Lyle
Previously, there was some logic that attempted to immediately issue
writeback of backing-contiguous blocks when the writeback rate was
fast.
The previous logic did not have any limits on the aggregate size it
would issue, nor the number of keys it would combine at once. It
would also discard the chance to do a contiguous write when the
writeback rate was low-- e.g. at "background" writeback of target
rate = 8, it would not combine two adjacent 4k writes and would
instead seek the disk twice.
This patch imposes limits and explicitly understands the size of
contiguous I/O during issue. It also will combine contiguous I/O
in all circumstances, not just when writeback is requested to be
relatively fast.
It is a win on its own, but also lays the groundwork for skip writes to
short keys to make the I/O more sequential/contiguous. It also gets
ready to start using blk_*_plug, and to allow issuing of non-contig
I/O in parallel if requested by the user (to make use of disk
throughput benefits available from higher queue depths).
This patch fixes a previous version where the contiguous information
was not calculated properly.
Signed-off-by: Michael Lyle <mlyle@lyle.org>
---
drivers/md/bcache/bcache.h | 6 --
drivers/md/bcache/writeback.c | 133 ++++++++++++++++++++++++++++++------------
drivers/md/bcache/writeback.h | 3 +
3 files changed, 98 insertions(+), 44 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 843877e017e1..1784e50eb857 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -323,12 +323,6 @@ struct cached_dev {
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
- /*
- * Internal to the writeback code, so read_dirty() can keep track of
- * where it's at.
- */
- sector_t last_read;
-
/* Limit number of writeback bios in flight */
struct semaphore in_flight;
struct task_struct *writeback_thread;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index f3d680c907ae..4e4836c6e7cf 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -248,10 +248,25 @@ static void read_dirty_submit(struct closure *cl)
continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}
+static inline bool keys_contiguous(struct cached_dev *dc,
+ struct keybuf_key *first, struct keybuf_key *second)
+{
+ if (KEY_INODE(&second->key) != KEY_INODE(&first->key))
+ return false;
+
+ if (KEY_OFFSET(&second->key) !=
+ KEY_OFFSET(&first->key) + KEY_SIZE(&first->key))
+ return false;
+
+ return true;
+}
+
static void read_dirty(struct cached_dev *dc)
{
unsigned delay = 0;
- struct keybuf_key *w;
+ struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
+ size_t size;
+ int nk, i;
struct dirty_io *io;
struct closure cl;
@@ -262,45 +277,87 @@ static void read_dirty(struct cached_dev *dc)
* mempools.
*/
- while (!kthread_should_stop()) {
-
- w = bch_keybuf_next(&dc->writeback_keys);
- if (!w)
- break;
-
- BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
-
- if (KEY_START(&w->key) != dc->last_read ||
- jiffies_to_msecs(delay) > 50)
- while (!kthread_should_stop() && delay)
- delay = schedule_timeout_interruptible(delay);
-
- dc->last_read = KEY_OFFSET(&w->key);
-
- io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
- * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
- GFP_KERNEL);
- if (!io)
- goto err;
-
- w->private = io;
- io->dc = dc;
-
- dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
- io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
- bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
- io->bio.bi_end_io = read_dirty_endio;
-
- if (bio_alloc_pages(&io->bio, GFP_KERNEL))
- goto err_free;
-
- trace_bcache_writeback(&w->key);
+ next = bch_keybuf_next(&dc->writeback_keys);
+
+ while (!kthread_should_stop() && next) {
+ size = 0;
+ nk = 0;
+
+ do {
+ BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
+
+ /*
+ * Don't combine too many operations, even if they
+ * are all small.
+ */
+ if (nk >= MAX_WRITEBACKS_IN_PASS)
+ break;
+
+ /*
+ * If the current operation is very large, don't
+ * further combine operations.
+ */
+ if (size >= MAX_WRITESIZE_IN_PASS)
+ break;
+
+ /*
+ * Operations are only eligible to be combined
+ * if they are contiguous.
+ *
+ * TODO: add a heuristic willing to fire a
+ * certain amount of non-contiguous IO per pass,
+ * so that we can benefit from backing device
+ * command queueing.
+ */
+ if (nk != 0 && !keys_contiguous(dc, keys[nk-1], next))
+ break;
+
+ size += KEY_SIZE(&next->key);
+ keys[nk++] = next;
+ } while ((next = bch_keybuf_next(&dc->writeback_keys)));
+
+ /* Now we have gathered a set of 1..5 keys to write back. */
+
+ for (i = 0; i < nk; i++) {
+ w = keys[i];
+
+ io = kzalloc(sizeof(struct dirty_io) +
+ sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ goto err;
+
+ w->private = io;
+ io->dc = dc;
+
+ dirty_init(w);
+ bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+ io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
+ bio_set_dev(&io->bio,
+ PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
+ io->bio.bi_end_io = read_dirty_endio;
+
+ if (bio_alloc_pages(&io->bio, GFP_KERNEL))
+ goto err_free;
+
+ trace_bcache_writeback(&w->key);
+
+ down(&dc->in_flight);
+
+ /* We've acquired a semaphore for the maximum
+ * simultaneous number of writebacks; from here
+ * everything happens asynchronously.
+ */
+ closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ }
- down(&dc->in_flight);
- closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ delay = writeback_delay(dc, size);
- delay = writeback_delay(dc, KEY_SIZE(&w->key));
+ while (!kthread_should_stop() && delay) {
+ schedule_timeout_interruptible(delay);
+ delay = writeback_delay(dc, 0);
+ }
}
if (0) {
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index a9e3ffb4b03c..6d26927267f8 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,6 +5,9 @@
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
+#define MAX_WRITEBACKS_IN_PASS 5
+#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
+
static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
{
uint64_t i, ret = 0;
--
2.14.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [for-416 PATCH 2/3] bcache: writeback: properly order backing device IO
2017-12-28 0:47 [for-416 PATCH 1/3] bcache: writeback: collapse contiguous IO better Michael Lyle
@ 2017-12-28 0:47 ` Michael Lyle
2017-12-28 0:47 ` [for-416 PATCH 3/3] bcache: allow quick writeback when backing idle Michael Lyle
2017-12-28 0:52 ` [for-416 PATCH 1/3] bcache: writeback: collapse contiguous IO better Michael Lyle
2 siblings, 0 replies; 7+ messages in thread
From: Michael Lyle @ 2017-12-28 0:47 UTC (permalink / raw)
To: linux-bcache, linux-block; +Cc: Michael Lyle
Writeback keys are presently iterated and dispatched for writeback in
order of the logical block address on the backing device. Multiple may
be, in parallel, read from the cache device and then written back
(especially when there are contiguous I/O).
However-- there was no guarantee with the existing code that the writes
would be issued in LBA order, as the reads from the cache device are
often re-ordered. In turn, when writing back quickly, the backing disk
often has to seek backwards-- this slows writeback and increases
utilization.
This patch introduces an ordering mechanism that guarantees that the
original order of issue is maintained for the write portion of the I/O.
Performance for writeback is significantly improved when there are
multiple contiguous keys or high writeback rates.
Signed-off-by: Michael Lyle <mlyle@lyle.org>
---
drivers/md/bcache/bcache.h | 8 ++++++++
drivers/md/bcache/writeback.c | 29 +++++++++++++++++++++++++++++
2 files changed, 37 insertions(+)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 1784e50eb857..3be0fcc19b1f 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -330,6 +330,14 @@ struct cached_dev {
struct keybuf writeback_keys;
+ /*
+ * Order the write-half of writeback operations strongly in dispatch
+ * order. (Maintain LBA order; don't allow reads completing out of
+ * order to re-order the writes...)
+ */
+ struct closure_waitlist writeback_ordering_wait;
+ atomic_t writeback_sequence_next;
+
/* For tracking sequential IO */
#define RECENT_IO_BITS 7
#define RECENT_IO (1 << RECENT_IO_BITS)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4e4836c6e7cf..4084586d5991 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -130,6 +130,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
struct dirty_io {
struct closure cl;
struct cached_dev *dc;
+ uint16_t sequence;
struct bio bio;
};
@@ -208,6 +209,27 @@ static void write_dirty(struct closure *cl)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
+ struct cached_dev *dc = io->dc;
+
+ uint16_t next_sequence;
+
+ if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
+ /* Not our turn to write; wait for a write to complete */
+ closure_wait(&dc->writeback_ordering_wait, cl);
+
+ if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
+ /*
+ * Edge case-- it happened in indeterminate order
+ * relative to when we were added to wait list..
+ */
+ closure_wake_up(&dc->writeback_ordering_wait);
+ }
+
+ continue_at(cl, write_dirty, io->dc->writeback_write_wq);
+ return;
+ }
+
+ next_sequence = io->sequence + 1;
/*
* IO errors are signalled using the dirty bit on the key.
@@ -225,6 +247,9 @@ static void write_dirty(struct closure *cl)
closure_bio_submit(&io->bio, cl);
}
+ atomic_set(&dc->writeback_sequence_next, next_sequence);
+ closure_wake_up(&dc->writeback_ordering_wait);
+
continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
@@ -269,7 +294,10 @@ static void read_dirty(struct cached_dev *dc)
int nk, i;
struct dirty_io *io;
struct closure cl;
+ uint16_t sequence = 0;
+ BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
+ atomic_set(&dc->writeback_sequence_next, sequence);
closure_init_stack(&cl);
/*
@@ -330,6 +358,7 @@ static void read_dirty(struct cached_dev *dc)
w->private = io;
io->dc = dc;
+ io->sequence = sequence++;
dirty_init(w);
bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
--
2.14.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [for-416 PATCH 3/3] bcache: allow quick writeback when backing idle
2017-12-28 0:47 [for-416 PATCH 1/3] bcache: writeback: collapse contiguous IO better Michael Lyle
2017-12-28 0:47 ` [for-416 PATCH 2/3] bcache: writeback: properly order backing device IO Michael Lyle
@ 2017-12-28 0:47 ` Michael Lyle
2017-12-28 0:52 ` [for-416 PATCH 1/3] bcache: writeback: collapse contiguous IO better Michael Lyle
2 siblings, 0 replies; 7+ messages in thread
From: Michael Lyle @ 2017-12-28 0:47 UTC (permalink / raw)
To: linux-bcache, linux-block; +Cc: Michael Lyle
If the control system would wait for at least half a second, and there's
been no reqs hitting the backing disk for awhile: use an alternate mode
where we have at most one contiguous set of writebacks in flight at a
time. (But don't otherwise delay). If front-end IO appears, it will
still be quick, as it will only have to contend with one real operation
in flight. But otherwise, we'll be sending data to the backing disk as
quickly as it can accept it (with one op at a time).
Signed-off-by: Michael Lyle <mlyle@lyle.org>
---
drivers/md/bcache/bcache.h | 7 +++++++
drivers/md/bcache/request.c | 1 +
drivers/md/bcache/writeback.c | 21 +++++++++++++++++++++
3 files changed, 29 insertions(+)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 3be0fcc19b1f..5f7b0b2513cc 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -320,6 +320,13 @@ struct cached_dev {
*/
atomic_t has_dirty;
+ /*
+ * Set to zero by things that touch the backing volume-- except
+ * writeback. Incremented by writeback. Used to determine when to
+ * accelerate idle writeback.
+ */
+ atomic_t backing_idle;
+
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index d1faaba6b93f..3b4defbdcbbd 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -996,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
int rw = bio_data_dir(bio);
+ atomic_set(&dc->backing_idle, 0);
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
bio_set_dev(bio, dc->bdev);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4084586d5991..bf309a480335 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -383,6 +383,27 @@ static void read_dirty(struct cached_dev *dc)
delay = writeback_delay(dc, size);
+ /* If the control system would wait for at least half a
+ * second, and there's been no reqs hitting the backing disk
+ * for awhile: use an alternate mode where we have at most
+ * one contiguous set of writebacks in flight at a time. If
+ * someone wants to do IO it will be quick, as it will only
+ * have to contend with one operation in flight, and we'll
+ * be round-tripping data to the backing disk as quickly as
+ * it can accept it.
+ */
+ if (delay >= HZ / 2) {
+ /* 3 means at least 1.5 seconds, up to 7.5 if we
+ * have slowed way down.
+ */
+ if (atomic_inc_return(&dc->backing_idle) >= 3) {
+ /* Wait for current I/Os to finish */
+ closure_sync(&cl);
+ /* And immediately launch a new set. */
+ delay = 0;
+ }
+ }
+
while (!kthread_should_stop() && delay) {
schedule_timeout_interruptible(delay);
delay = writeback_delay(dc, 0);
--
2.14.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [for-416 PATCH 1/3] bcache: writeback: collapse contiguous IO better
2017-12-28 0:47 [for-416 PATCH 1/3] bcache: writeback: collapse contiguous IO better Michael Lyle
2017-12-28 0:47 ` [for-416 PATCH 2/3] bcache: writeback: properly order backing device IO Michael Lyle
2017-12-28 0:47 ` [for-416 PATCH 3/3] bcache: allow quick writeback when backing idle Michael Lyle
@ 2017-12-28 0:52 ` Michael Lyle
2 siblings, 0 replies; 7+ messages in thread
From: Michael Lyle @ 2017-12-28 0:52 UTC (permalink / raw)
To: linux-bcache, linux-block
Hey everyone,
These were previously sent and there was an extensive discussion about
performance, finding that there was some slight regression on very large
extent workloads and significant performance improvement on smaller
extents. This is a *huge* performance increase on my workload,
especially the behavior enabled in the #3 patch that allows dirty data
to be trimmed quickly during idle periods.
Reviewed-by tags would be appreciated so that these can be staged for 4.16.
I have additional changes coming in the next couple of days-- changes to
the closure code picked from kmo's bcachefs tree that both improve
performance and enhance correctness. Just need a little more test and
to work on the commit comments some.
Thanks,
Mike
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [for-416 PATCH 2/3] bcache: writeback: properly order backing device IO
@ 2017-12-28 1:27 tang.junhui
0 siblings, 0 replies; 7+ messages in thread
From: tang.junhui @ 2017-12-28 1:27 UTC (permalink / raw)
To: mlyle; +Cc: linux-bcache, linux-block, tang.junhui
From: Tang Junhui <tang.junhui@zte.com.cn>
LGTM, and I tested it; it improves the write-back performance.
Reviewed-by: Tang Junhui <tang.junhui@zte.com.cn>
Tested-by: Tang Junhui <tang.junhui@zte.com.cn>
> From: Tang Junhui <tang.junhui@zte.com.cn>
>
> In such scenario that there are some flash only volumes
> , and some cached devices, when many tasks request these devices in
> writeback mode, the write IOs may fall to the same bucket as bellow:
> | cached data | flash data | cached data | cached data| flash data|
> then after writeback of these cached devices, the bucket would
> be like bellow bucket:
> | free | flash data | free | free | flash data |
>
> So, there are many free space in this bucket, but since data of flash
> only volumes still exists, so this bucket cannot be reclaimable,
> which would cause waste of bucket space.
>
> In this patch, we segregate flash only volume write streams from
> cached devices, so data from flash only volumes and cached devices
> can store in different buckets.
>
> Compare to v1 patch, this patch do not add a additionally open bucket
> list, and it is try best to segregate flash only volume write streams
> from cached devices, sectors of flash only volumes may still be mixed
> with dirty sectors of cached device, but the number is very small.
>
> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> ---
> drivers/md/bcache/alloc.c | 19 ++++++++++++++-----
> 1 file changed, 14 insertions(+), 5 deletions(-)
> mode change 100644 => 100755 drivers/md/bcache/alloc.c
>
> diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
> old mode 100644
> new mode 100755
> index 0803563..4302922
> --- a/drivers/md/bcache/alloc.c
> +++ b/drivers/md/bcache/alloc.c
> @@ -514,15 +514,21 @@ struct open_bucket {
>
> /*
> * We keep multiple buckets open for writes, and try to segregate different
> - * write streams for better cache utilization: first we look for a bucket where
> - * the last write to it was sequential with the current write, and failing that
> - * we look for a bucket that was last used by the same task.
> + * write streams for better cache utilization: first we try to segregate flash
> + * only volume write streams from cached devices, secondly we look for a bucket
> + * where the last write to it was sequential with the current write, and
> + * failing that we look for a bucket that was last used by the same task.
> *
> * The ideas is if you've got multiple tasks pulling data into the cache at the
> * same time, you'll get better cache utilization if you try to segregate their
> * data and preserve locality.
> *
> - * For example, say you've starting Firefox at the same time you're copying a
> + * For example, dirty sectors of flash only volume is not reclaimable, if their
> + * dirty sectors mixed with dirty sectors of cached device, such buckets will
> + * be marked as dirty and won't be reclaimed, though the dirty data of cached
> + * device have been written back to backend device.
> + *
> + * And say you've starting Firefox at the same time you're copying a
> * bunch of files. Firefox will likely end up being fairly hot and stay in the
> * cache awhile, but the data you copied might not be; if you wrote all that
> * data to the same buckets it'd get invalidated at the same time.
> @@ -539,7 +545,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
> struct open_bucket *ret, *ret_task = NULL;
>
> list_for_each_entry_reverse(ret, &c->data_buckets, list)
> - if (!bkey_cmp(&ret->key, search))
> + if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
> + UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
> + continue;
> + else if (!bkey_cmp(&ret->key, search))
> goto found;
> else if (ret->last_write_point == write_point)
> ret_task = ret;
> --
> 1.8.3.1
Thanks,
Tang
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [for-416 PATCH 2/3] bcache: writeback: properly order backing device IO
@ 2017-12-28 1:30 tang.junhui
2017-12-28 2:38 ` Michael Lyle
0 siblings, 1 reply; 7+ messages in thread
From: tang.junhui @ 2017-12-28 1:30 UTC (permalink / raw)
To: mlyle; +Cc: linux-bcache, linux-block, tang.junhui
From: Tang Junhui <tang.junhui@zte.com.cn>
LGTM, and I tested it; it improves the write-back performance.
[Sorry for the wrong content in the previous email]
Reviewed-by: Tang Junhui <tang.junhui@zte.com.cn>
Tested-by: Tang Junhui <tang.junhui@zte.com.cn>
> Writeback keys are presently iterated and dispatched for writeback in
> order of the logical block address on the backing device. Multiple may
> be, in parallel, read from the cache device and then written back
> (especially when there are contiguous I/O).
>
> However-- there was no guarantee with the existing code that the writes
> would be issued in LBA order, as the reads from the cache device are
> often re-ordered. In turn, when writing back quickly, the backing disk
> often has to seek backwards-- this slows writeback and increases
> utilization.
>
> This patch introduces an ordering mechanism that guarantees that the
> original order of issue is maintained for the write portion of the I/O.
> Performance for writeback is significantly improved when there are
> multiple contiguous keys or high writeback rates.
>
> Signed-off-by: Michael Lyle <mlyle@lyle.org>
> ---
> drivers/md/bcache/bcache.h | 8 ++++++++
> drivers/md/bcache/writeback.c | 29 +++++++++++++++++++++++++++++
> 2 files changed, 37 insertions(+)
>
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 1784e50eb857..3be0fcc19b1f 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -330,6 +330,14 @@ struct cached_dev {
>
> struct keybuf writeback_keys;
>
> + /*
> + * Order the write-half of writeback operations strongly in dispatch
> + * order. (Maintain LBA order; don't allow reads completing out of
> + * order to re-order the writes...)
> + */
> + struct closure_waitlist writeback_ordering_wait;
> + atomic_t writeback_sequence_next;
> +
> /* For tracking sequential IO */
> #define RECENT_IO_BITS 7
> #define RECENT_IO (1 << RECENT_IO_BITS)
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 4e4836c6e7cf..4084586d5991 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -130,6 +130,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
> struct dirty_io {
> struct closure cl;
> struct cached_dev *dc;
> + uint16_t sequence;
> struct bio bio;
> };
>
> @@ -208,6 +209,27 @@ static void write_dirty(struct closure *cl)
> {
> struct dirty_io *io = container_of(cl, struct dirty_io, cl);
> struct keybuf_key *w = io->bio.bi_private;
> + struct cached_dev *dc = io->dc;
> +
> + uint16_t next_sequence;
> +
> + if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
> + /* Not our turn to write; wait for a write to complete */
> + closure_wait(&dc->writeback_ordering_wait, cl);
> +
> + if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
> + /*
> + * Edge case-- it happened in indeterminate order
> + * relative to when we were added to wait list..
> + */
> + closure_wake_up(&dc->writeback_ordering_wait);
> + }
> +
> + continue_at(cl, write_dirty, io->dc->writeback_write_wq);
> + return;
> + }
> +
> + next_sequence = io->sequence + 1;
>
> /*
> * IO errors are signalled using the dirty bit on the key.
> @@ -225,6 +247,9 @@ static void write_dirty(struct closure *cl)
> closure_bio_submit(&io->bio, cl);
> }
>
> + atomic_set(&dc->writeback_sequence_next, next_sequence);
> + closure_wake_up(&dc->writeback_ordering_wait);
> +
> continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
> }
>
> @@ -269,7 +294,10 @@ static void read_dirty(struct cached_dev *dc)
> int nk, i;
> struct dirty_io *io;
> struct closure cl;
> + uint16_t sequence = 0;
>
> + BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
> + atomic_set(&dc->writeback_sequence_next, sequence);
> closure_init_stack(&cl);
>
> /*
> @@ -330,6 +358,7 @@ static void read_dirty(struct cached_dev *dc)
>
> w->private = io;
> io->dc = dc;
> + io->sequence = sequence++;
>
> dirty_init(w);
> bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
> --
> 2.14.1
Thanks,
Tang
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [for-416 PATCH 2/3] bcache: writeback: properly order backing device IO
2017-12-28 1:30 tang.junhui
@ 2017-12-28 2:38 ` Michael Lyle
0 siblings, 0 replies; 7+ messages in thread
From: Michael Lyle @ 2017-12-28 2:38 UTC (permalink / raw)
To: tang.junhui; +Cc: linux-bcache, linux-block
Hi Tang Junhui--
On 12/27/2017 05:30 PM, tang.junhui@zte.com.cn wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
>
> LGTM, and I tested it, it promotes the write-back performance.
> [Sorry for the wrong content in the previous email]
>
> Reviewed-by: Tang Junhui <tang.junhui@zte.com.cn>
> Tested-by: Tang Junhui <tang.junhui@zte.com.cn>
Thank you very much for the review. Does this also apply to 1/3 (which
is needed for this patch to apply)?
Mike
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2017-12-28 2:38 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-12-28 0:47 [for-416 PATCH 1/3] bcache: writeback: collapse contiguous IO better Michael Lyle
2017-12-28 0:47 ` [for-416 PATCH 2/3] bcache: writeback: properly order backing device IO Michael Lyle
2017-12-28 0:47 ` [for-416 PATCH 3/3] bcache: allow quick writeback when backing idle Michael Lyle
2017-12-28 0:52 ` [for-416 PATCH 1/3] bcache: writeback: collapse contiguous IO better Michael Lyle
-- strict thread matches above, loose matches on Subject: below --
2017-12-28 1:27 [for-416 PATCH 2/3] bcache: writeback: properly order backing device IO tang.junhui
2017-12-28 1:30 tang.junhui
2017-12-28 2:38 ` Michael Lyle
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox