linux-btrfs.vger.kernel.org archive mirror
* [PATCH v2] Btrfs: enhance raid1/10 balance heuristic
@ 2017-12-29  2:09 Timofey Titovets
  2017-12-29 18:44 ` Dmitrii Tcvetkov
  0 siblings, 1 reply; 6+ messages in thread
From: Timofey Titovets @ 2017-12-29  2:09 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Timofey Titovets

Currently the btrfs raid1/10 balancer balances requests to mirrors
based on pid % num of mirrors.

Make the logic aware of:
 - whether one of the underlying devices is non-rotational
 - the queue length of the underlying devices

By default, use the pid % num_mirrors guess, but:
 - if one of the mirrors is non-rotational, repick it as optimal
 - if another mirror has a shorter queue length than the optimal one,
   repick that mirror

To avoid round-robin request balancing,
round the queue length down:
 - by 8 for rotational devs
 - by 2 when all devs are non-rotational
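
A minimal sketch of the effect (illustrative values only; ALIGN_DOWN()
is the kernel's power-of-two round-down helper):

  ALIGN_DOWN(3, 8);  /* -> 0 */
  ALIGN_DOWN(9, 8);  /* -> 8 */
  ALIGN_DOWN(12, 8); /* -> 8 */

Queue lengths 9 and 12 compare as equal after rounding, so small load
differences keep the pid-based guess instead of flipping the mirror
on every request.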

Changes:
  v1 -> v2:
    - Use the helper part_in_flight() from genhd.c
      to get the queue length
    - Move the guess code to guess_optimal()
    - Change the balancer logic: use pid % num_mirrors by default, and
      balance on spinning rust if one of the underlying devices is
      overloaded

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
---
 block/genhd.c      |   1 +
 fs/btrfs/volumes.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 115 insertions(+), 2 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 96a66f671720..a7742bbbb6a7 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part,
 				atomic_read(&part->in_flight[1]);
 	}
 }
+EXPORT_SYMBOL_GPL(part_in_flight);
 
 struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9a04245003ab..1c84534df9a5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
 #include <linux/raid/pq.h>
 #include <linux/semaphore.h>
 #include <linux/uuid.h>
+#include <linux/genhd.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
@@ -5216,6 +5217,112 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+/**
+ * bdev_get_queue_len - return the rounded-down in-flight queue length of bdev
+ *
+ * @bdev: target bdev
+ * @round_down: round factor, big for hdd and small for ssd, like 8 and 2
+ */
+static int bdev_get_queue_len(struct block_device *bdev, int round_down)
+{
+	int sum;
+	struct hd_struct *bd_part = bdev->bd_part;
+	struct request_queue *rq = bdev_get_queue(bdev);
+	uint32_t inflight[2] = {0, 0};
+
+	part_in_flight(rq, bd_part, inflight);
+
+	sum = max_t(uint32_t, inflight[0], inflight[1]);
+
+	/*
+	 * Try to prevent switching on every sneeze
+	 * by rounding the output down by some value
+	 */
+	return ALIGN_DOWN(sum, round_down);
+}
+
+/**
+ * guess_optimal - return the guessed optimal mirror
+ *
+ * Optimal is expected to be pid % num_stripes
+ *
+ * That's generally ok for spreading load; add some balancing based
+ * on device queue length
+ *
+ * Basic ideas:
+ *  - Sequential reads generate a low number of requests, so if the
+ *    drive loads are equal, use pid % num_stripes balancing
+ *  - For mixed rotational/non-rotational mirrors, pick non-rotational
+ *    as optimal; repick if another dev's queue is "significantly" shorter
+ *  - Repick optimal if the queue length of another mirror is shorter
+ */
+static int guess_optimal(struct map_lookup *map, int optimal)
+{
+	int i;
+	int round_down = 8;
+	int num = map->num_stripes;
+	int qlen[num];
+	bool is_nonrot[num];
+	bool all_bdev_nonrot = true;
+	bool all_bdev_rotate = true;
+	struct block_device *bdev;
+
+	if (num == 1)
+		return optimal;
+
+	/* Check accessible bdevs */
+	for (i = 0; i < num; i++) {
+		/* Init for missing bdevs */
+		is_nonrot[i] = false;
+		qlen[i] = INT_MAX;
+		bdev = map->stripes[i].dev->bdev;
+		if (bdev) {
+			qlen[i] = 0;
+			is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev));
+			if (is_nonrot[i])
+				all_bdev_rotate = false;
+			else
+				all_bdev_nonrot = false;
+		}
+	}
+
+	/*
+	 * Don't bother with computation
+	 * if only one of two bdevs is accessible
+	 */
+	if (num == 2 && qlen[0] != qlen[1]) {
+		if (qlen[0] < qlen[1])
+			return 0;
+		else
+			return 1;
+	}
+
+	if (all_bdev_nonrot)
+		round_down = 2;
+
+	for (i = 0; i < num; i++) {
+		if (qlen[i])
+			continue;
+		bdev = map->stripes[i].dev->bdev;
+		qlen[i] = bdev_get_queue_len(bdev, round_down);
+	}
+
+	/* For the mixed case, pick a non-rotational dev as optimal */
+	if (all_bdev_rotate == all_bdev_nonrot) {
+		for (i = 0; i < num; i++) {
+			if (is_nonrot[i])
+				optimal = i;
+		}
+	}
+
+	for (i = 0; i < num; i++) {
+		if (qlen[optimal] > qlen[i])
+			optimal = i;
+	}
+
+	return optimal;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first, int num,
 			    int optimal, int dev_replace_is_ongoing)
@@ -5664,6 +5771,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 	int i;
 	int ret = 0;
 	int num_stripes;
+	int optimal;
 	int max_errors = 0;
 	int tgtdev_indexes = 0;
 	struct btrfs_bio *bbio = NULL;
@@ -5776,9 +5884,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
 		else {
+			optimal = guess_optimal(map,
+					current->pid % map->num_stripes);
 			stripe_index = find_live_mirror(fs_info, map, 0,
 					    map->num_stripes,
-					    current->pid % map->num_stripes,
+					    optimal,
 					    dev_replace_is_ongoing);
 			mirror_num = stripe_index + 1;
 		}
@@ -5804,10 +5914,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 			stripe_index += mirror_num - 1;
 		else {
 			int old_stripe_index = stripe_index;
+			optimal = guess_optimal(map,
+					current->pid % map->num_stripes);
 			stripe_index = find_live_mirror(fs_info, map,
 					      stripe_index,
 					      map->sub_stripes, stripe_index +
-					      current->pid % map->sub_stripes,
+					      optimal,
 					      dev_replace_is_ongoing);
 			mirror_num = stripe_index - old_stripe_index + 1;
 		}
-- 
2.15.1


* Re: [PATCH v2] Btrfs: enhance raid1/10 balance heuristic
  2017-12-29  2:09 [PATCH v2] Btrfs: enhance raid1/10 balance heuristic Timofey Titovets
@ 2017-12-29 18:44 ` Dmitrii Tcvetkov
  2017-12-29 19:14   ` Dmitrii Tcvetkov
  0 siblings, 1 reply; 6+ messages in thread
From: Dmitrii Tcvetkov @ 2017-12-29 18:44 UTC (permalink / raw)
  To: Timofey Titovets, linux-btrfs

On Fri, 29 Dec 2017 05:09:14 +0300
Timofey Titovets <nefelim4ag@gmail.com> wrote:

> Currently the btrfs raid1/10 balancer balances requests to mirrors
> based on pid % num of mirrors.
> 
> Make the logic aware of:
>  - whether one of the underlying devices is non-rotational
>  - the queue length of the underlying devices
> 
> By default, use the pid % num_mirrors guess, but:
>  - if one of the mirrors is non-rotational, repick it as optimal
>  - if another mirror has a shorter queue length than the optimal one,
>    repick that mirror
> 
> To avoid round-robin request balancing,
> round the queue length down:
>  - by 8 for rotational devs
>  - by 2 when all devs are non-rotational
> 
> Changes:
>   v1 -> v2:
>     - Use the helper part_in_flight() from genhd.c
>       to get the queue length
>     - Move the guess code to guess_optimal()
>     - Change the balancer logic: use pid % num_mirrors by default, and
>       balance on spinning rust if one of the underlying devices is
>       overloaded
> 
> Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
> ---
>  block/genhd.c      |   1 +
>  fs/btrfs/volumes.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 115 insertions(+), 2 deletions(-)
> 
> diff --git a/block/genhd.c b/block/genhd.c
> index 96a66f671720..a7742bbbb6a7 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part,
>  				atomic_read(&part->in_flight[1]);
>  	}
>  }
> +EXPORT_SYMBOL_GPL(part_in_flight);
>  
>  struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
>  {
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 9a04245003ab..1c84534df9a5 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -27,6 +27,7 @@
>  #include <linux/raid/pq.h>
>  #include <linux/semaphore.h>
>  #include <linux/uuid.h>
> +#include <linux/genhd.h>
>  #include <asm/div64.h>
>  #include "ctree.h"
>  #include "extent_map.h"
> @@ -5216,6 +5217,112 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>  	return ret;
>  }
>  
> +/**
> + * bdev_get_queue_len - return the rounded-down in-flight queue length of bdev
> + *
> + * @bdev: target bdev
> + * @round_down: round factor, big for hdd and small for ssd, like 8 and 2
> + */
> +static int bdev_get_queue_len(struct block_device *bdev, int round_down)
> +{
> +	int sum;
> +	struct hd_struct *bd_part = bdev->bd_part;
> +	struct request_queue *rq = bdev_get_queue(bdev);
> +	uint32_t inflight[2] = {0, 0};
> +
> +	part_in_flight(rq, bd_part, inflight);
> +
> +	sum = max_t(uint32_t, inflight[0], inflight[1]);
> +
> +	/*
> +	 * Try to prevent switching on every sneeze
> +	 * by rounding the output down by some value
> +	 */
> +	return ALIGN_DOWN(sum, round_down);
> +}
> +
> +/**
> + * guess_optimal - return the guessed optimal mirror
> + *
> + * Optimal is expected to be pid % num_stripes
> + *
> + * That's generally ok for spreading load; add some balancing based
> + * on device queue length
> + *
> + * Basic ideas:
> + *  - Sequential reads generate a low number of requests, so if the
> + *    drive loads are equal, use pid % num_stripes balancing
> + *  - For mixed rotational/non-rotational mirrors, pick non-rotational
> + *    as optimal; repick if another dev's queue is "significantly" shorter
> + *  - Repick optimal if the queue length of another mirror is shorter
> + */
> +static int guess_optimal(struct map_lookup *map, int optimal)
> +{
> +	int i;
> +	int round_down = 8;
> +	int num = map->num_stripes;

num has to be initialized from map->sub_stripes if we're reading RAID10,
otherwise there will be a NULL pointer dereference

> +	int qlen[num];
> +	bool is_nonrot[num];
> +	bool all_bdev_nonrot = true;
> +	bool all_bdev_rotate = true;
> +	struct block_device *bdev;
> +
> +	if (num == 1)
> +		return optimal;
> +
> +	/* Check accessible bdevs */
> +	for (i = 0; i < num; i++) {
> +		/* Init for missing bdevs */
> +		is_nonrot[i] = false;
> +		qlen[i] = INT_MAX;
> +		bdev = map->stripes[i].dev->bdev;
> +		if (bdev) {
> +			qlen[i] = 0;
> +			is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev));
> +			if (is_nonrot[i])
> +				all_bdev_rotate = false;
> +			else
> +				all_bdev_nonrot = false;
> +		}
> +	}
> +
> +	/*
> +	 * Don't bother with computation
> +	 * if only one of two bdevs is accessible
> +	 */
> +	if (num == 2 && qlen[0] != qlen[1]) {
> +		if (qlen[0] < qlen[1])
> +			return 0;
> +		else
> +			return 1;
> +	}
> +
> +	if (all_bdev_nonrot)
> +		round_down = 2;
> +
> +	for (i = 0; i < num; i++) {
> +		if (qlen[i])
> +			continue;
> +		bdev = map->stripes[i].dev->bdev;
> +		qlen[i] = bdev_get_queue_len(bdev, round_down);
> +	}
> +
> +	/* For the mixed case, pick a non-rotational dev as optimal */
> +	if (all_bdev_rotate == all_bdev_nonrot) {
> +		for (i = 0; i < num; i++) {
> +			if (is_nonrot[i])
> +				optimal = i;

just a nitpick: we might want to "break" here so we get the first nonrot
bdev; the current variant will get the last nonrot bdev. This is not
relevant right now since we don't have N-way mirroring yet and the code
always picks between two bdevs, but when it comes we might want to
change it. See the sketch below.
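
I.e. something like this (untested sketch):

	/* pick the first accessible non-rotational dev as optimal */
	for (i = 0; i < num; i++) {
		if (is_nonrot[i]) {
			optimal = i;
			break;
		}
	}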

Here are my benchmark results with your fio config (except that the
dataset is 8 GB) in a VM with 2 LVs; the LVs are on different physical
devices. The patch was applied on top of current mainline
(2758b3e3e630ba304fc4), all RAID1, on an idle system. Unfortunately I
don't have 2 SSDs right now, so I can only test HDD+SSD, 2 HDD RAID1,
4 HDD RAID10, or 3 HDD + 1 SSD RAID10:

mainline, fio got lucky to read from the first HDD (quite a slow HDD):
Jobs: 1 (f=1): [r(1)][100.0%][r=8456KiB/s,w=0KiB/s][r=264,w=0 IOPS][eta 00m:00s]
test-fio: (groupid=0, jobs=1): err= 0: pid=1961: Fri Dec 29 18:21:57 2017
   read: IOPS=265, BW=8508KiB/s (8712kB/s)(499MiB/60070msec)
    slat (usec): min=9, max=102818, avg=332.68, stdev=3583.09
    clat (msec): min=2, max=825, avg=59.84, stdev=64.95
     lat (msec): min=2, max=825, avg=60.17, stdev=65.06
    clat percentiles (msec):
     |  1.00th=[    6],  5.00th=[    8], 10.00th=[    9], 20.00th=[   14],
     | 30.00th=[   20], 40.00th=[   27], 50.00th=[   36], 60.00th=[   50],
     | 70.00th=[   68], 80.00th=[   95], 90.00th=[  144], 95.00th=[  194],
     | 99.00th=[  300], 99.50th=[  355], 99.90th=[  493], 99.95th=[  531],
     | 99.99th=[  617]
   bw (  KiB/s): min= 4800, max= 9792, per=100.00%, avg=8508.15, stdev=963.02, samples=120
   iops        : min=  150, max=  306, avg=265.83, stdev=30.11, samples=120
  lat (msec)   : 4=0.20%, 10=11.84%, 20=19.76%, 50=28.48%, 100=20.94%
  lat (msec)   : 250=16.66%, 500=2.03%, 750=0.08%, 1000=0.01%
  cpu          : usr=0.15%, sys=0.84%, ctx=15358, majf=0, minf=136
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=99.9%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwt: total=15971,0,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
   READ: bw=8508KiB/s (8712kB/s), 8508KiB/s-8508KiB/s (8712kB/s-8712kB/s), io=499MiB (523MB), run=60070-60070msec
----------------------------------------------------------------------------

mainline, fio got lucky to read from the second HDD (much more modern):
Jobs: 1 (f=1): [r(1)][8.7%][r=11.9MiB/s,w=0KiB/s][r=380,w=0 IOPS][eta 10m:43s]
test-fio: (groupid=0, jobs=1): err= 0: pid=1950: Fri Dec 29 18:20:09 2017
   read: IOPS=378, BW=11.8MiB/s (12.4MB/s)(710MiB/60051msec)
    slat (usec): min=9, max=54852, avg=37.29, stdev=646.17
    clat (usec): min=387, max=644258, avg=42274.85, stdev=48504.35
     lat (usec): min=416, max=644286, avg=42312.74, stdev=48518.56
    clat percentiles (msec):
     |  1.00th=[    4],  5.00th=[    6], 10.00th=[    7], 20.00th=[   10],
     | 30.00th=[   14], 40.00th=[   19], 50.00th=[   26], 60.00th=[   34],
     | 70.00th=[   47], 80.00th=[   66], 90.00th=[  101], 95.00th=[  138],
     | 99.00th=[  232], 99.50th=[  275], 99.90th=[  393], 99.95th=[  430],
     | 99.99th=[  558]
   bw (  KiB/s): min= 7232, max=14016, per=99.99%, avg=12097.14, stdev=1134.12, samples=120
   iops        : min=  226, max=  438, avg=378.01, stdev=35.44, samples=120
  lat (usec)   : 500=0.01%
  lat (msec)   : 4=1.42%, 10=19.57%, 20=22.36%, 50=28.65%, 100=17.96%
  lat (msec)   : 250=9.23%, 500=0.78%, 750=0.02%
  cpu          : usr=0.23%, sys=1.23%, ctx=22717, majf=0, minf=135
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=99.9%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwt: total=22704,0,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
   READ: bw=11.8MiB/s (12.4MB/s), 11.8MiB/s-11.8MiB/s (12.4MB/s-12.4MB/s), io=710MiB (744MB), run=60051-60051msec
----------------------------------------------------------------------------
mainline, fio got lucky to read from an SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=436MiB/s,w=0KiB/s][r=13.9k,w=0 IOPS][eta 00m:00s]
test-fio: (groupid=0, jobs=1): err= 0: pid=1860: Fri Dec 29 18:10:53 2017
   read: IOPS=13.9k, BW=433MiB/s (454MB/s)(25.4GiB/60002msec)
    slat (usec): min=8, max=6811, avg=22.27, stdev=16.00
    clat (usec): min=289, max=16306, avg=1129.79, stdev=243.36
     lat (usec): min=343, max=16319, avg=1152.52, stdev=245.36
    clat percentiles (usec):
     |  1.00th=[  627],  5.00th=[  783], 10.00th=[  881], 20.00th=[  996],
     | 30.00th=[ 1057], 40.00th=[ 1106], 50.00th=[ 1123], 60.00th=[ 1172],
     | 70.00th=[ 1205], 80.00th=[ 1270], 90.00th=[ 1352], 95.00th=[ 1434],
     | 99.00th=[ 1582], 99.50th=[ 1647], 99.90th=[ 2671], 99.95th=[ 3818],
     | 99.99th=[ 9372]
   bw (  KiB/s): min=311552, max=447296, per=99.98%, avg=443356.12, stdev=13068.69, samples=120
   iops        : min= 9736, max=13978, avg=13854.85, stdev=408.40, samples=120
  lat (usec)   : 500=0.08%, 750=3.68%, 1000=17.33%
  lat (msec)   : 2=78.76%, 4=0.12%, 10=0.04%, 20=0.01%
  cpu          : usr=3.00%, sys=37.46%, ctx=810456, majf=0, minf=137
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwt: total=831502,0,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
   READ: bw=433MiB/s (454MB/s), 433MiB/s-433MiB/s (454MB/s-454MB/s), io=25.4GiB (27.2GB), run=60002-60002msec
------------------------------------------------------------------------------
With the patch, 2 HDDs:
Jobs: 1 (f=1): [r(1)][100.0%][r=17.5MiB/s,w=0KiB/s][r=560,w=0 IOPS][eta 00m:00s]
test-fio: (groupid=0, jobs=1): err= 0: pid=1906: Fri Dec 29 17:42:56 2017
   read: IOPS=560, BW=17.5MiB/s (18.4MB/s)(1053MiB/60052msec)
    slat (usec): min=10, max=48614, avg=29.42, stdev=289.59
    clat (usec): min=408, max=341006, avg=28481.65, stdev=29999.58
     lat (usec): min=435, max=341037, avg=28511.64, stdev=30000.14
    clat percentiles (msec):
     |  1.00th=[    4],  5.00th=[    6], 10.00th=[    7], 20.00th=[    9],
     | 30.00th=[   11], 40.00th=[   14], 50.00th=[   18], 60.00th=[   23],
     | 70.00th=[   31], 80.00th=[   43], 90.00th=[   66], 95.00th=[   89],
     | 99.00th=[  146], 99.50th=[  174], 99.90th=[  245], 99.95th=[  268],
     | 99.99th=[  292]
   bw (  KiB/s): min=10304, max=19968, per=100.00%, avg=17955.02, stdev=1408.17, samples=120
   iops        : min=  322, max=  624, avg=561.00, stdev=44.01, samples=120
  lat (usec)   : 500=0.02%, 750=0.01%, 1000=0.01%
  lat (msec)   : 2=0.01%, 4=1.10%, 10=25.46%, 20=28.65%, 50=28.75%
  lat (msec)   : 100=12.38%, 250=3.54%, 500=0.09%
  cpu          : usr=0.36%, sys=1.79%, ctx=32225, majf=1, minf=136
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwt: total=33688,0,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
   READ: bw=17.5MiB/s (18.4MB/s), 17.5MiB/s-17.5MiB/s (18.4MB/s-18.4MB/s), io=1053MiB (1104MB), run=60052-60052msec
--------------------------------------------------------------------------------
With the patch, HDD(old one)+SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=371MiB/s,w=0KiB/s][r=11.9k,w=0 IOPS][eta 00m:00s]
test-fio: (groupid=0, jobs=1): err= 0: pid=1905: Fri Dec 29 17:50:56 2017
   read: IOPS=11.6k, BW=361MiB/s (379MB/s)(21.2GiB/60084msec)
    slat (usec): min=9, max=1587, avg=19.14, stdev= 8.96
    clat (usec): min=280, max=346726, avg=1362.09, stdev=6947.89
     lat (usec): min=363, max=346752, avg=1381.73, stdev=6948.32
    clat percentiles (usec):
     |  1.00th=[   449],  5.00th=[   506], 10.00th=[   537], 20.00th=[   586],
     | 30.00th=[   619], 40.00th=[   644], 50.00th=[   676], 60.00th=[   709],
     | 70.00th=[   734], 80.00th=[   775], 90.00th=[   840], 95.00th=[   922],
     | 99.00th=[ 20841], 99.50th=[ 43779], 99.90th=[106431], 99.95th=[135267],
     | 99.99th=[210764]
   bw (  KiB/s): min=281229, max=384064, per=100.00%, avg=370417.13, stdev=9942.41, samples=120
   iops        : min= 8788, max=12002, avg=11575.50, stdev=310.74, samples=120
  lat (usec)   : 500=4.52%, 750=69.32%, 1000=22.82%
  lat (msec)   : 2=1.16%, 4=0.07%, 10=0.51%, 20=0.58%, 50=0.60%
  lat (msec)   : 100=0.31%, 250=0.11%, 500=0.01%
  cpu          : usr=3.75%, sys=25.94%, ctx=484100, majf=1, minf=138
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwt: total=694625,0,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
   READ: bw=361MiB/s (379MB/s), 361MiB/s-361MiB/s (379MB/s-379MB/s), io=21.2GiB (22.8GB), run=60084-60084msec


* Re: [PATCH v2] Btrfs: enhance raid1/10 balance heuristic
  2017-12-29 18:44 ` Dmitrii Tcvetkov
@ 2017-12-29 19:14   ` Dmitrii Tcvetkov
  2017-12-30  0:15     ` Timofey Titovets
  0 siblings, 1 reply; 6+ messages in thread
From: Dmitrii Tcvetkov @ 2017-12-29 19:14 UTC (permalink / raw)
  To: Timofey Titovets, linux-btrfs

On Fri, 29 Dec 2017 21:44:19 +0300
Dmitrii Tcvetkov <demfloro@demfloro.ru> wrote:
> > +/**
> > + * guess_optimal - return the guessed optimal mirror
> > + *
> > + * Optimal is expected to be pid % num_stripes
> > + *
> > + * That's generally ok for spreading load; add some balancing based
> > + * on device queue length
> > + *
> > + * Basic ideas:
> > + *  - Sequential reads generate a low number of requests, so if the
> > + *    drive loads are equal, use pid % num_stripes balancing
> > + *  - For mixed rotational/non-rotational mirrors, pick non-rotational
> > + *    as optimal; repick if another dev's queue is "significantly" shorter
> > + *  - Repick optimal if the queue length of another mirror is shorter
> > + */
> > +static int guess_optimal(struct map_lookup *map, int optimal)
> > +{
> > +	int i;
> > +	int round_down = 8;
> > +	int num = map->num_stripes;  
> 
> num has to be initialized from map->sub_stripes if we're reading
> RAID10, otherwise there will be a NULL pointer dereference
> 

The check can be something like:
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
	num = map->sub_stripes;

>@@ -5804,10 +5914,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
> 			stripe_index += mirror_num - 1;
> 		else {
> 			int old_stripe_index = stripe_index;
>+			optimal = guess_optimal(map,
>+					current->pid % map->num_stripes);
> 			stripe_index = find_live_mirror(fs_info, map,
> 					      stripe_index,
> 					      map->sub_stripes, stripe_index +
>-					      current->pid % map->sub_stripes,
>+					      optimal,
> 					      dev_replace_is_ongoing);
> 			mirror_num = stripe_index - old_stripe_index + 1;
> 		}
>-- 
>2.15.1

Also, here the calculation should use map->sub_stripes too.


* Re: [PATCH v2] Btrfs: enhance raid1/10 balance heuristic
  2017-12-29 19:14   ` Dmitrii Tcvetkov
@ 2017-12-30  0:15     ` Timofey Titovets
  2017-12-30  8:14       ` Dmitrii Tcvetkov
  0 siblings, 1 reply; 6+ messages in thread
From: Timofey Titovets @ 2017-12-30  0:15 UTC (permalink / raw)
  To: Dmitrii Tcvetkov; +Cc: linux-btrfs

2017-12-29 22:14 GMT+03:00 Dmitrii Tcvetkov <demfloro@demfloro.ru>:
> On Fri, 29 Dec 2017 21:44:19 +0300
> Dmitrii Tcvetkov <demfloro@demfloro.ru> wrote:
>> > +/**
>> > + * guess_optimal - return the guessed optimal mirror
>> > + *
>> > + * Optimal is expected to be pid % num_stripes
>> > + *
>> > + * That's generally ok for spreading load; add some balancing based
>> > + * on device queue length
>> > + *
>> > + * Basic ideas:
>> > + *  - Sequential reads generate a low number of requests, so if the
>> > + *    drive loads are equal, use pid % num_stripes balancing
>> > + *  - For mixed rotational/non-rotational mirrors, pick non-rotational
>> > + *    as optimal; repick if another dev's queue is "significantly" shorter
>> > + *  - Repick optimal if the queue length of another mirror is shorter
>> > + */
>> > +static int guess_optimal(struct map_lookup *map, int optimal)
>> > +{
>> > +   int i;
>> > +   int round_down = 8;
>> > +   int num = map->num_stripes;
>>
>> num has to be initialized from map->sub_stripes if we're reading
>> RAID10, otherwise there will be a NULL pointer dereference
>>
>
> The check can be something like:
> if (map->type & BTRFS_BLOCK_GROUP_RAID10)
>         num = map->sub_stripes;
>
>>@@ -5804,10 +5914,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
>>                       stripe_index += mirror_num - 1;
>>               else {
>>                       int old_stripe_index = stripe_index;
>>+                      optimal = guess_optimal(map,
>>+                                      current->pid % map->num_stripes);
>>                       stripe_index = find_live_mirror(fs_info, map,
>>                                             stripe_index,
>>                                             map->sub_stripes, stripe_index +
>>-                                            current->pid % map->sub_stripes,
>>+                                            optimal,
>>                                             dev_replace_is_ongoing);
>>                       mirror_num = stripe_index - old_stripe_index + 1;
>>               }
>>--
>>2.15.1
>
> Also here calculation should be with map->sub_stripes too.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Why do you think we need such a check?
guess_optimal() is always called together with find_live_mirror(),
both in the same context, like this:

if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
  u32 factor = map->num_stripes / map->sub_stripes;

  stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
  stripe_index *= map->sub_stripes;

  if (need_full_stripe(op))
    num_stripes = map->sub_stripes;
  else if (mirror_num)
    stripe_index += mirror_num - 1;
  else {
    int old_stripe_index = stripe_index;
    stripe_index = find_live_mirror(fs_info, map,
      stripe_index,
      map->sub_stripes, stripe_index +
      current->pid % map->sub_stripes,
      dev_replace_is_ongoing);
    mirror_num = stripe_index - old_stripe_index + 1;
}

So it's useless to check that internally.

---
Also, fio results for all-HDD RAID1, from waxhead:

Original:

Disk-4k-randread-depth-32: (g=0): rw=randread, bs=(R) 4096B-512KiB,
(W) 4096B-512KiB, (T) 4096B-512KiB, ioengine=libaio, iodepth=32
Disk-4k-read-depth-8: (g=0): rw=read, bs=(R) 4096B-512KiB, (W)
4096B-512KiB, (T) 4096B-512KiB, ioengine=libaio, iodepth=8
Disk-4k-randwrite-depth-8: (g=0): rw=randwrite, bs=(R) 4096B-512KiB,
(W) 4096B-512KiB, (T) 4096B-512KiB, ioengine=libaio, iodepth=8
fio-3.1
Starting 3 processes
Disk-4k-randread-depth-32: Laying out IO file (1 file / 65536MiB)
Jobs: 3 (f=3): [r(1),R(1),w(1)][100.0%][r=120MiB/s,w=9.88MiB/s][r=998,w=96
IOPS][eta 00m:00s]
Disk-4k-randread-depth-32: (groupid=0, jobs=1): err= 0: pid=3132: Fri
Dec 29 16:16:33 2017
   read: IOPS=375, BW=41.3MiB/s (43.3MB/s)(24.2GiB/600128msec)
    slat (usec): min=15, max=206039, avg=88.71, stdev=990.35
    clat (usec): min=357, max=3487.1k, avg=85022.93, stdev=141872.25
     lat (usec): min=399, max=3487.2k, avg=85112.58, stdev=141880.31
    clat percentiles (msec):
     |  1.00th=[    5],  5.00th=[    7], 10.00th=[    9], 20.00th=[   13],
     | 30.00th=[   19], 40.00th=[   27], 50.00th=[   39], 60.00th=[   56],
     | 70.00th=[   83], 80.00th=[  127], 90.00th=[  209], 95.00th=[  300],
     | 99.00th=[  600], 99.50th=[  852], 99.90th=[ 1703], 99.95th=[ 2165],
     | 99.99th=[ 2937]
   bw (  KiB/s): min=  392, max=75824, per=30.46%, avg=42736.09,
stdev=12019.09, samples=1186
   iops        : min=    3, max=  500, avg=380.24, stdev=99.50, samples=1186
  lat (usec)   : 500=0.01%, 750=0.01%, 1000=0.01%
  lat (msec)   : 2=0.01%, 4=0.29%, 10=12.33%, 20=19.67%, 50=24.92%
  lat (msec)   : 100=17.51%, 250=18.05%, 500=5.72%, 750=0.85%, 1000=0.28%
  lat (msec)   : 2000=0.29%, >=2000=0.07%
  cpu          : usr=0.67%, sys=4.62%, ctx=215716, majf=0, minf=526
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued rwt: total=225609,0,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=32
Disk-4k-read-depth-8: (groupid=0, jobs=1): err= 0: pid=3133: Fri Dec
29 16:16:33 2017
   read: IOPS=694, BW=95.8MiB/s (100MB/s)(56.1GiB/600017msec)
    slat (usec): min=8, max=617652, avg=88.88, stdev=1996.00
    clat (usec): min=95, max=1127.4k, avg=11424.86, stdev=10606.45
     lat (usec): min=138, max=1127.5k, avg=11514.53, stdev=10796.64
    clat percentiles (usec):
     |  1.00th=[  1270],  5.00th=[  2507], 10.00th=[  3261], 20.00th=[  5932],
     | 30.00th=[  6783], 40.00th=[  7701], 50.00th=[  9896], 60.00th=[ 11076],
     | 70.00th=[ 13435], 80.00th=[ 15795], 90.00th=[ 20841], 95.00th=[ 25822],
     | 99.00th=[ 36963], 99.50th=[ 45351], 99.90th=[108528], 99.95th=[137364],
     | 99.99th=[387974]
   bw (  KiB/s): min=10720, max=131855, per=69.93%, avg=98104.00,
stdev=14476.04, samples=1200
   iops        : min=   78, max= 1082, avg=694.71, stdev=111.69, samples=1200
  lat (usec)   : 100=0.01%, 250=0.04%, 500=0.12%, 750=0.25%, 1000=0.26%
  lat (msec)   : 2=2.02%, 4=11.65%, 10=36.16%, 20=38.45%, 50=10.67%
  lat (msec)   : 100=0.27%, 250=0.11%, 500=0.01%, 750=0.01%, 2000=0.01%
  cpu          : usr=0.78%, sys=7.69%, ctx=264209, majf=0, minf=521
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=100.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.1%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwt: total=416698,0,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=8
Disk-4k-randwrite-depth-8: (groupid=0, jobs=1): err= 0: pid=3134: Fri
Dec 29 16:16:33 2017
  write: IOPS=81, BW=10.6MiB/s (11.1MB/s)(6362MiB/600133msec)
    slat (usec): min=16, max=429897, avg=98.81, stdev=3109.35
    clat (usec): min=240, max=2206.9k, avg=98358.53, stdev=309465.43
     lat (usec): min=305, max=2206.0k, avg=98458.24, stdev=309483.50
    clat percentiles (usec):
     |  1.00th=[   1237],  5.00th=[   3326], 10.00th=[   5080],
     | 20.00th=[   7635], 30.00th=[  10159], 40.00th=[  12911],
     | 50.00th=[  16319], 60.00th=[  21890], 70.00th=[  36439],
     | 80.00th=[  91751], 90.00th=[ 166724], 95.00th=[ 287310],
     | 99.00th=[2021655], 99.50th=[2038432], 99.90th=[2088764],
     | 99.95th=[2105541], 99.99th=[2164261]
   bw (  KiB/s): min=    8, max=91796, per=100.00%, avg=16619.21,
stdev=18128.65, samples=797
   iops        : min=    2, max=  640, avg=123.92, stdev=132.83, samples=797
  lat (usec)   : 250=0.01%, 500=0.17%, 750=0.19%, 1000=0.33%
  lat (msec)   : 2=1.34%, 4=5.01%, 10=22.34%, 20=28.22%, 50=15.76%
  lat (msec)   : 100=8.23%, 250=12.71%, 500=2.37%, 750=0.51%, 1000=0.15%
  lat (msec)   : 2000=1.05%, >=2000=1.62%
  cpu          : usr=0.20%, sys=0.72%, ctx=41618, majf=0, minf=7
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=100.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.1%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwt: total=0,48759,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=8

Run status group 0 (all jobs):
   READ: bw=137MiB/s (144MB/s), 41.3MiB/s-95.8MiB/s
(43.3MB/s-100MB/s), io=80.3GiB (86.2GB), run=600017-600128msec
  WRITE: bw=10.6MiB/s (11.1MB/s), 10.6MiB/s-10.6MiB/s
(11.1MB/s-11.1MB/s), io=6362MiB (6671MB), run=600133-600133msec

Patched:
Disk-4k-randread-depth-32: (g=0): rw=randread, bs=(R) 4096B-512KiB,
(W) 4096B-512KiB, (T) 4096B-512KiB, ioengine=libaio, iodepth=32
Disk-4k-read-depth-8: (g=0): rw=read, bs=(R) 4096B-512KiB, (W)
4096B-512KiB, (T) 4096B-512KiB, ioengine=libaio, iodepth=8
Disk-4k-randwrite-depth-8: (g=0): rw=randwrite, bs=(R) 4096B-512KiB,
(W) 4096B-512KiB, (T) 4096B-512KiB, ioengine=libaio, iodepth=8
fio-3.1
Starting 3 processes
Jobs: 3 (f=3): [r(1),R(1),w(1)][100.0%][r=67.3MiB/s,w=17.7MiB/s][r=734,w=150
IOPS][eta 00m:00s]
Disk-4k-randread-depth-32: (groupid=0, jobs=1): err= 0: pid=1755: Fri
Dec 29 22:56:57 2017
   read: IOPS=613, BW=60.6MiB/s (63.5MB/s)(35.5GiB/600060msec)
    slat (usec): min=12, max=237473, avg=163.70, stdev=1695.77
    clat (usec): min=220, max=1152.1k, avg=51952.45, stdev=56779.39
     lat (usec): min=263, max=1152.3k, avg=52117.15, stdev=56934.65
    clat percentiles (msec):
     |  1.00th=[    5],  5.00th=[    7], 10.00th=[   10], 20.00th=[   14],
     | 30.00th=[   19], 40.00th=[   25], 50.00th=[   33], 60.00th=[   43],
     | 70.00th=[   57], 80.00th=[   80], 90.00th=[  121], 95.00th=[  165],
     | 99.00th=[  271], 99.50th=[  326], 99.90th=[  456], 99.95th=[  502],
     | 99.99th=[  651]
   bw (  KiB/s): min= 7006, max=106682, per=60.69%, avg=62211.51,
stdev=14166.42, samples=1199
   iops        : min=   72, max=  825, avg=615.08, stdev=106.32, samples=1199
  lat (usec)   : 250=0.01%, 500=0.01%, 750=0.01%, 1000=0.01%
  lat (msec)   : 2=0.01%, 4=0.72%, 10=11.63%, 20=20.86%, 50=32.26%
  lat (msec)   : 100=20.54%, 250=12.60%, 500=1.31%, 750=0.05%, 1000=0.01%
  lat (msec)   : 2000=0.01%
  cpu          : usr=1.14%, sys=7.37%, ctx=333462, majf=0, minf=528
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=0.1%, 32=100.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%, >=64=0.0%
     issued rwt: total=368214,0,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=32
Disk-4k-read-depth-8: (groupid=0, jobs=1): err= 0: pid=1756: Fri Dec
29 22:56:57 2017
   read: IOPS=285, BW=39.5MiB/s (41.4MB/s)(23.2GiB/600056msec)
    slat (usec): min=7, max=523518, avg=115.85, stdev=2072.83
    clat (usec): min=90, max=1880.8k, avg=27860.58, stdev=49717.09
     lat (usec): min=127, max=1880.9k, avg=27977.32, stdev=49780.09
    clat percentiles (usec):
     |  1.00th=[   469],  5.00th=[  1074], 10.00th=[  1762], 20.00th=[  3654],
     | 30.00th=[  5866], 40.00th=[  7767], 50.00th=[ 10159], 60.00th=[ 13829],
     | 70.00th=[ 20841], 80.00th=[ 35390], 90.00th=[ 76022], 95.00th=[124257],
     | 99.00th=[229639], 99.50th=[304088], 99.90th=[484443], 99.95th=[591397],
     | 99.99th=[742392]
   bw (  KiB/s): min=  672, max=100966, per=39.58%, avg=40570.62,
stdev=17431.49, samples=1197
   iops        : min=   17, max=  744, avg=286.45, stdev=126.09, samples=1197
  lat (usec)   : 100=0.01%, 250=0.31%, 500=0.97%, 750=1.79%, 1000=1.49%
  lat (msec)   : 2=7.55%, 4=8.70%, 10=28.45%, 20=19.95%, 50=15.85%
  lat (msec)   : 100=7.67%, 250=6.49%, 500=0.69%, 750=0.08%, 1000=0.01%
  lat (msec)   : 2000=0.01%
  cpu          : usr=0.39%, sys=3.36%, ctx=130493, majf=0, minf=524
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=100.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.1%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwt: total=171546,0,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=8
Disk-4k-randwrite-depth-8: (groupid=0, jobs=1): err= 0: pid=1757: Fri
Dec 29 22:56:57 2017
  write: IOPS=136, BW=17.2MiB/s (18.0MB/s)(10.1GiB/600007msec)
    slat (usec): min=19, max=136510, avg=114.43, stdev=1121.55
    clat (usec): min=258, max=2084.5k, avg=58607.26, stdev=103204.91
     lat (usec): min=334, max=2084.7k, avg=58722.67, stdev=103202.00
    clat percentiles (msec):
     |  1.00th=[    3],  5.00th=[    6], 10.00th=[    8], 20.00th=[   12],
     | 30.00th=[   16], 40.00th=[   21], 50.00th=[   27], 60.00th=[   37],
     | 70.00th=[   53], 80.00th=[   80], 90.00th=[  131], 95.00th=[  205],
     | 99.00th=[  506], 99.50th=[  718], 99.90th=[ 1183], 99.95th=[ 1385],
     | 99.99th=[ 1989]
 bw (  KiB/s): min=    8, max=61572, per=100.00%, avg=18098.74,
stdev=11881.08, samples=1175
   iops        : min=    2, max=  495, avg=139.63, stdev=92.08, samples=1175
  lat (usec)   : 500=0.06%, 750=0.06%, 1000=0.13%
  lat (msec)   : 2=0.55%, 4=2.32%, 10=13.21%, 20=23.07%, 50=29.51%
  lat (msec)   : 100=16.15%, 250=11.65%, 500=2.26%, 750=0.59%, 1000=0.27%
  lat (msec)   : 2000=0.16%, >=2000=0.01%
  cpu          : usr=0.38%, sys=1.35%, ctx=77040, majf=0, minf=9
  IO depths    : 1=0.1%, 2=0.1%, 4=0.1%, 8=100.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.1%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwt: total=0,81727,0, short=0,0,0, dropped=0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=8

Run status group 0 (all jobs):
   READ: bw=100MiB/s (105MB/s), 39.5MiB/s-60.6MiB/s
(41.4MB/s-63.5MB/s), io=58.7GiB (62.0GB), run=600056-600060msec
  WRITE: bw=17.2MiB/s (18.0MB/s), 17.2MiB/s-17.2MiB/s
(18.0MB/s-18.0MB/s), io=10.1GiB (10.8GB), run=600007-600007msec

So, as you can observe, with a mixed load the patch rebalances iops
toward random reads and decreases random read/write latency,
but it makes the sequential thread a little bit hungry.

For systems with less random load, sequential read performance should
not change (i.e. if the load is below the queue-length threshold for
hdd).
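
For example, with round_down = 8 an HDD mirror with up to 7 requests
in flight rounds down to a queue length of 0, so the pid-based mirror
choice is kept.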


Thanks.

-- 
Have a nice day,
Timofey.


* Re: [PATCH v2] Btrfs: enhance raid1/10 balance heuristic
  2017-12-30  0:15     ` Timofey Titovets
@ 2017-12-30  8:14       ` Dmitrii Tcvetkov
  2017-12-30  9:47         ` Timofey Titovets
  0 siblings, 1 reply; 6+ messages in thread
From: Dmitrii Tcvetkov @ 2017-12-30  8:14 UTC (permalink / raw)
  To: Timofey Titovets; +Cc: linux-btrfs

On Sat, 30 Dec 2017 03:15:20 +0300
Timofey Titovets <nefelim4ag@gmail.com> wrote:

> 2017-12-29 22:14 GMT+03:00 Dmitrii Tcvetkov <demfloro@demfloro.ru>:
> > On Fri, 29 Dec 2017 21:44:19 +0300
> > Dmitrii Tcvetkov <demfloro@demfloro.ru> wrote:  
> >> > +/**
> >> > + * guess_optimal - return the guessed optimal mirror
> >> > + *
> >> > + * Optimal is expected to be pid % num_stripes
> >> > + *
> >> > + * That's generally ok for spreading load; add some balancing based
> >> > + * on device queue length
> >> > + *
> >> > + * Basic ideas:
> >> > + *  - Sequential reads generate a low number of requests, so if the
> >> > + *    drive loads are equal, use pid % num_stripes balancing
> >> > + *  - For mixed rotational/non-rotational mirrors, pick non-rotational
> >> > + *    as optimal; repick if another dev's queue is "significantly" shorter
> >> > + *  - Repick optimal if the queue length of another mirror is shorter
> >> > + */
> >> > +static int guess_optimal(struct map_lookup *map, int optimal)
> >> > +{
> >> > +   int i;
> >> > +   int round_down = 8;
> >> > +   int num = map->num_stripes;  
> >>
> >> num has to be initialized from map->sub_stripes if we're reading
> >> RAID10, otherwise there will be a NULL pointer dereference
> >>  
> >
> > The check can be something like:
> > if (map->type & BTRFS_BLOCK_GROUP_RAID10)
> >         num = map->sub_stripes;
> >  
> >>@@ -5804,10 +5914,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
> >>                       stripe_index += mirror_num - 1;
> >>               else {
> >>                       int old_stripe_index = stripe_index;
> >>+                      optimal = guess_optimal(map,
> >>+                                      current->pid % map->num_stripes);
> >>                       stripe_index = find_live_mirror(fs_info, map,
> >>                                             stripe_index,
> >>                                             map->sub_stripes, stripe_index +
> >>-                                            current->pid % map->sub_stripes,
> >>+                                            optimal,
> >>                                             dev_replace_is_ongoing);
> >>                       mirror_num = stripe_index - old_stripe_index + 1;
> >>               }
> >>--
> >>2.15.1
> >
> > Also, here the calculation should use map->sub_stripes too.
> > --
> > To unsubscribe from this list: send the line "unsubscribe
> > linux-btrfs" in the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html  
> 
> Why do you think we need such a check?
> guess_optimal() is always called together with find_live_mirror(),
> both in the same context, like this:
> 
> if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
>   u32 factor = map->num_stripes / map->sub_stripes;
> 
>   stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
>   stripe_index *= map->sub_stripes;
> 
>   if (need_full_stripe(op))
>     num_stripes = map->sub_stripes;
>   else if (mirror_num)
>     stripe_index += mirror_num - 1;
>   else {
>     int old_stripe_index = stripe_index;
>     stripe_index = find_live_mirror(fs_info, map,
>       stripe_index,
>       map->sub_stripes, stripe_index +
>       current->pid % map->sub_stripes,
>       dev_replace_is_ongoing);
>     mirror_num = stripe_index - old_stripe_index + 1;
> }
> 
> So it's useless to check that internally.

My bad, so we only need to call
guess_optimal(map, current->pid % map->sub_stripes)
in the RAID10 branch.
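
I.e. something like this in the RAID10 branch (untested sketch):

	int old_stripe_index = stripe_index;
	optimal = guess_optimal(map, current->pid % map->sub_stripes);
	stripe_index = find_live_mirror(fs_info, map, stripe_index,
			      map->sub_stripes, stripe_index + optimal,
			      dev_replace_is_ongoing);
	mirror_num = stripe_index - old_stripe_index + 1;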


* Re: [PATCH v2] Btrfs: enhance raid1/10 balance heuristic
  2017-12-30  8:14       ` Dmitrii Tcvetkov
@ 2017-12-30  9:47         ` Timofey Titovets
  0 siblings, 0 replies; 6+ messages in thread
From: Timofey Titovets @ 2017-12-30  9:47 UTC (permalink / raw)
  To: Dmitrii Tcvetkov; +Cc: linux-btrfs

2017-12-30 11:14 GMT+03:00 Dmitrii Tcvetkov <demfloro@demfloro.ru>:
> On Sat, 30 Dec 2017 03:15:20 +0300
> Timofey Titovets <nefelim4ag@gmail.com> wrote:
>
>> 2017-12-29 22:14 GMT+03:00 Dmitrii Tcvetkov <demfloro@demfloro.ru>:
>> > On Fri, 29 Dec 2017 21:44:19 +0300
>> > Dmitrii Tcvetkov <demfloro@demfloro.ru> wrote:
>> >> > +/**
>> >> > + * guess_optimal - return the guessed optimal mirror
>> >> > + *
>> >> > + * Optimal is expected to be pid % num_stripes
>> >> > + *
>> >> > + * That's generally ok for spreading load; add some balancing based
>> >> > + * on device queue length
>> >> > + *
>> >> > + * Basic ideas:
>> >> > + *  - Sequential reads generate a low number of requests, so if the
>> >> > + *    drive loads are equal, use pid % num_stripes balancing
>> >> > + *  - For mixed rotational/non-rotational mirrors, pick non-rotational
>> >> > + *    as optimal; repick if another dev's queue is "significantly" shorter
>> >> > + *  - Repick optimal if the queue length of another mirror is shorter
>> >> > + */
>> >> > +static int guess_optimal(struct map_lookup *map, int optimal)
>> >> > +{
>> >> > +   int i;
>> >> > +   int round_down = 8;
>> >> > +   int num = map->num_stripes;
>> >>
>> >> num has to be initialized from map->sub_stripes if we're reading
>> >> RAID10, otherwise there will be a NULL pointer dereference
>> >>
>> >
>> > The check can be something like:
>> > if (map->type & BTRFS_BLOCK_GROUP_RAID10)
>> >         num = map->sub_stripes;
>> >
>> >>@@ -5804,10 +5914,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
>> >>                       stripe_index += mirror_num - 1;
>> >>               else {
>> >>                       int old_stripe_index = stripe_index;
>> >>+                      optimal = guess_optimal(map,
>> >>+                                      current->pid % map->num_stripes);
>> >>                       stripe_index = find_live_mirror(fs_info, map,
>> >>                                             stripe_index,
>> >>                                             map->sub_stripes, stripe_index +
>> >>-                                            current->pid % map->sub_stripes,
>> >>+                                            optimal,
>> >>                                             dev_replace_is_ongoing);
>> >>                       mirror_num = stripe_index - old_stripe_index + 1;
>> >>               }
>> >>--
>> >>2.15.1
>> >
>> > Also, here the calculation should use map->sub_stripes too.
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe
>> > linux-btrfs" in the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>> Why do you think we need such a check?
>> guess_optimal() is always called together with find_live_mirror(),
>> both in the same context, like this:
>>
>> if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
>>   u32 factor = map->num_stripes / map->sub_stripes;
>>
>>   stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
>>   stripe_index *= map->sub_stripes;
>>
>>   if (need_full_stripe(op))
>>     num_stripes = map->sub_stripes;
>>   else if (mirror_num)
>>     stripe_index += mirror_num - 1;
>>   else {
>>     int old_stripe_index = stripe_index;
>>     stripe_index = find_live_mirror(fs_info, map,
>>       stripe_index,
>>       map->sub_stripes, stripe_index +
>>       current->pid % map->sub_stripes,
>>       dev_replace_is_ongoing);
>>     mirror_num = stripe_index - old_stripe_index + 1;
>> }
>>
>> So it's useless to check that internally.
>
> My bad, so we only need to call
> guess_optimal(map, current->pid % map->sub_stripes)
> in the RAID10 branch.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Yes, my bad, a copy-paste error; it will be fixed in v3.

Thanks

-- 
Have a nice day,
Timofey.


Thread overview: 6+ messages
2017-12-29  2:09 [PATCH v2] Btrfs: enhance raid1/10 balance heuristic Timofey Titovets
2017-12-29 18:44 ` Dmitrii Tcvetkov
2017-12-29 19:14   ` Dmitrii Tcvetkov
2017-12-30  0:15     ` Timofey Titovets
2017-12-30  8:14       ` Dmitrii Tcvetkov
2017-12-30  9:47         ` Timofey Titovets
