linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Anand Jain <anand.jain@oracle.com>
To: linux-btrfs@vger.kernel.org
Cc: dsterba@suse.com, wqu@suse.com, hrx@bupt.moe, waxhead@dirtcellar.net
Subject: [PATCH v2 1/3] btrfs: introduce RAID1 round-robin read balancing
Date: Fri, 11 Oct 2024 10:49:16 +0800	[thread overview]
Message-ID: <ae88febb2b06eeadeafe97a476b92b66982ab2c7.1728608421.git.anand.jain@oracle.com> (raw)
In-Reply-To: <cover.1728608421.git.anand.jain@oracle.com>

This feature balances read I/O across the mirrored devices, in round-robin
order, when reading from RAID1 blocks.

   echo rotation:[min_contiguous_read] > /sys/fs/btrfs/<uuid>/read_policy

The default value of min_contiguous_read is the sectorsize; a user-supplied value must be a multiple of the sectorsize.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/btrfs/disk-io.c |  3 ++
 fs/btrfs/sysfs.c   | 88 ++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/volumes.c | 53 ++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  9 +++++
 4 files changed, 138 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4ad5db619b00..5b157f407e0a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3320,6 +3320,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 
 	fs_info->nodesize = nodesize;
 	fs_info->sectorsize = sectorsize;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	fs_info->fs_devices->min_contiguous_read = sectorsize;
+#endif
 	fs_info->sectorsize_bits = ilog2(sectorsize);
 	fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b843308e2bc6..bacb2871109b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1305,7 +1305,11 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
 }
 BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static const char * const btrfs_read_policy_name[] = { "pid", "rotation" };
+#else
 static const char * const btrfs_read_policy_name[] = { "pid" };
+#endif
 
 static ssize_t btrfs_read_policy_show(struct kobject *kobj,
 				      struct kobj_attribute *a, char *buf)
@@ -1316,14 +1320,22 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
 	int i;
 
 	for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
-		if (policy == i)
-			ret += sysfs_emit_at(buf, ret, "%s[%s]",
-					 (ret == 0 ? "" : " "),
-					 btrfs_read_policy_name[i]);
-		else
-			ret += sysfs_emit_at(buf, ret, "%s%s",
-					 (ret == 0 ? "" : " "),
-					 btrfs_read_policy_name[i]);
+		if (ret != 0)
+			ret += sysfs_emit_at(buf, ret, " ");
+
+		if (i == policy)
+			ret += sysfs_emit_at(buf, ret, "[");
+
+		ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+		if (i == BTRFS_READ_POLICY_ROTATION)
+			ret += sysfs_emit_at(buf, ret, ":%d",
+					     fs_devices->min_contiguous_read);
+#endif
+
+		if (i == policy)
+			ret += sysfs_emit_at(buf, ret, "]");
 	}
 
 	ret += sysfs_emit_at(buf, ret, "\n");
@@ -1336,21 +1348,67 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
 				       const char *buf, size_t len)
 {
 	struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+	int index = -1;
 	int i;
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	char *value = strchr(buf, ':');
+
+	/* Separate value from input in policy:value format. */
+	if (value) {
+		*value = '\0';
+		value++;
+	}
+#endif
+
 	for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
 		if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
-			if (i != READ_ONCE(fs_devices->read_policy)) {
-				WRITE_ONCE(fs_devices->read_policy, i);
-				btrfs_info(fs_devices->fs_info,
-					   "read policy set to '%s'",
-					   btrfs_read_policy_name[i]);
+			index = i;
+			break;
+		}
+	}
+
+	if (index == -1)
+		return -EINVAL;
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	if (index == BTRFS_READ_POLICY_ROTATION) {
+		int value_rota = fs_devices->fs_info->sectorsize;
+
+		if (value) {
+			if (kstrtoint(value, 10, &value_rota))
+				return -EINVAL;
+
+			if (value_rota % fs_devices->fs_info->sectorsize != 0) {
+				btrfs_err(fs_devices->fs_info,
+"read_policy: min_contiguous_read %d should be multiples of the sectorsize %u",
+					  value_rota,
+					  fs_devices->fs_info->sectorsize);
+				return -EINVAL;
 			}
-			return len;
 		}
+
+		if (index != READ_ONCE(fs_devices->read_policy) ||
+		    value_rota != READ_ONCE(fs_devices->min_contiguous_read)) {
+			WRITE_ONCE(fs_devices->read_policy, index);
+			WRITE_ONCE(fs_devices->min_contiguous_read, value_rota);
+			atomic_set(&fs_devices->total_reads, 0);
+
+			btrfs_info(fs_devices->fs_info, "read policy set to '%s:%d'",
+				   btrfs_read_policy_name[index], value_rota);
+
+		}
+
+		return len;
+	}
+#endif
+	if (index != READ_ONCE(fs_devices->read_policy)) {
+		WRITE_ONCE(fs_devices->read_policy, index);
+		btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
+			   btrfs_read_policy_name[index]);
 	}
 
-	return -EINVAL;
+	return len;
 }
 BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dc9f54849f39..ec5dbe69ba2c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5962,6 +5962,54 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 	return len;
 }
 
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+struct stripe_mirror {
+	u64 devid;	/* devid of the device backing this mirror (sort key) */
+	int num;	/* index of this mirror in map->stripes[] */
+};
+
+/* sort() comparator: order struct stripe_mirror entries by ascending devid. */
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+	const struct stripe_mirror *s1 = a;
+	const struct stripe_mirror *s2 = b;
+
+	if (s1->devid < s2->devid)
+		return -1;
+	/* Compare explicitly: a u64 difference does not fit the int result. */
+	return s1->devid > s2->devid;
+}
+
+/*
+ * Rotate reads over the mirrors in devid order, switching mirror every
+ * (min_contiguous_read / sectorsize) calls; returns a map->stripes[] index.
+ */
+static int btrfs_read_rotation(struct btrfs_chunk_map *map, int first,
+			       int num_stripe)
+{
+	struct stripe_mirror stripes[4] = {0}; /* 4: max possible mirrors */
+	struct btrfs_fs_devices *fs_devices = map->stripes[first].dev->fs_devices;
+	int min_read = READ_ONCE(fs_devices->min_contiguous_read);
+	unsigned int reads_per_dev = 1;
+	unsigned int total_reads;
+	int i;
+
+	/* Keep the divisor at 1 for zero, negative or sub-sector settings. */
+	if (min_read >= (int)fs_devices->fs_info->sectorsize)
+		reads_per_dev = min_read / fs_devices->fs_info->sectorsize;
+
+	for (i = 0; i < num_stripe; i++) {
+		stripes[i].devid = map->stripes[first + i].dev->devid;
+		stripes[i].num = first + i;
+	}
+	sort(stripes, num_stripe, sizeof(struct stripe_mirror),
+	     btrfs_cmp_devid, NULL);
+	/* Unsigned math: a wrapped counter cannot index stripes[] negatively. */
+	total_reads = atomic_inc_return(&fs_devices->total_reads);
+	return stripes[(total_reads / reads_per_dev) % num_stripe].num;
+}
+#endif
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct btrfs_chunk_map *map, int first,
 			    int dev_replace_is_ongoing)
@@ -5991,6 +6039,11 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	case BTRFS_READ_POLICY_PID:
 		preferred_mirror = first + (current->pid % num_stripes);
 		break;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	case BTRFS_READ_POLICY_ROTATION:
+		preferred_mirror = btrfs_read_rotation(map, first, num_stripes);
+		break;
+#endif
 	}
 
 	if (dev_replace_is_ongoing &&
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3a416b1bc24c..0db754a4b13d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -303,6 +303,10 @@ enum btrfs_chunk_allocation_policy {
 enum btrfs_read_policy {
 	/* Use process PID to choose the stripe */
 	BTRFS_READ_POLICY_PID,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* Balancing raid1 reads across all striped devices */
+	BTRFS_READ_POLICY_ROTATION,
+#endif
 	BTRFS_NR_READ_POLICY,
 };
 
@@ -431,6 +435,11 @@ struct btrfs_fs_devices {
 	enum btrfs_read_policy read_policy;
 
 #ifdef CONFIG_BTRFS_EXPERIMENTAL
+	/* IO stat, read counter. */
+	atomic_t total_reads;
+	/* Min contiguous reads before switching to next device. */
+	int min_contiguous_read;
+
 	/* Checksum mode - offload it or do it synchronously. */
 	enum btrfs_offload_csum_mode offload_csum_mode;
 #endif
-- 
2.46.1


  reply	other threads:[~2024-10-11  2:49 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-11  2:49 [PATCH v2 0/3] raid1 balancing methods Anand Jain
2024-10-11  2:49 ` Anand Jain [this message]
2024-10-11  2:49 ` [PATCH v2 2/3] btrfs: use the path with the lowest latency for RAID1 reads Anand Jain
2024-10-11  2:49 ` [PATCH v2 3/3] btrfs: add RAID1 preferred read device Anand Jain
2024-10-11  3:35 ` [PATCH v2 0/3] raid1 balancing methods Anand Jain
2024-10-11  4:59 ` Qu Wenruo
2024-10-11  6:04   ` Anand Jain
2024-10-21 14:05 ` David Sterba
2024-10-21 15:36   ` Anand Jain
2024-10-21 18:42     ` David Sterba
2024-10-22  0:31       ` Anand Jain
2024-10-21 14:32 ` waxhead
2024-10-21 15:44   ` Anand Jain
2024-10-22  7:07   ` Johannes Thumshirn
2024-10-24  4:39 ` Qu Wenruo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ae88febb2b06eeadeafe97a476b92b66982ab2c7.1728608421.git.anand.jain@oracle.com \
    --to=anand.jain@oracle.com \
    --cc=dsterba@suse.com \
    --cc=hrx@bupt.moe \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=waxhead@dirtcellar.net \
    --cc=wqu@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).