public inbox for linux-bcachefs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] Split brain detection
@ 2023-11-02  2:49 Kent Overstreet
  2023-11-02  2:49 ` [PATCH 1/2] bcachefs: bch_member->seq Kent Overstreet
  2023-11-02  2:49 ` [PATCH 2/2] bcachefs: Split brain detection Kent Overstreet
  0 siblings, 2 replies; 3+ messages in thread
From: Kent Overstreet @ 2023-11-02  2:49 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

Two-patch series for split brain detection: that is, detecting when
different members of a filesystem have been used independently in
degraded mode and have diverged. When that happens, we can no longer use
those members in combination.

The first patch adds superblock facilities: a superblock sequence number
field in bch_member, and a write time field in the superblock to detect
cases where sb->seq is equal on two members but they have actually
diverged.

The second patch does the actual split brain detection, kicking out
devices when necessary.

Kent Overstreet (2):
  bcachefs: bch_member->seq
  bcachefs: Split brain detection

 fs/bcachefs/bcachefs_format.h |  4 ++-
 fs/bcachefs/errcode.h         |  1 +
 fs/bcachefs/sb-members.c      |  5 ++++
 fs/bcachefs/super-io.c        | 10 +++++++
 fs/bcachefs/super.c           | 56 ++++++++++++++++++++++++++++-------
 5 files changed, 64 insertions(+), 12 deletions(-)

-- 
2.42.0


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 1/2] bcachefs: bch_member->seq
  2023-11-02  2:49 [PATCH 0/2] Split brain detection Kent Overstreet
@ 2023-11-02  2:49 ` Kent Overstreet
  2023-11-02  2:49 ` [PATCH 2/2] bcachefs: Split brain detection Kent Overstreet
  1 sibling, 0 replies; 3+ messages in thread
From: Kent Overstreet @ 2023-11-02  2:49 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

Add new fields for split brain detection:

 - bch_member->seq, which tracks the sequence number of the last superblock
   write that happened to each member device

 - bch_sb->write_time, which tracks the time of the last superblock write,
   so that we can detect when two members have diverged despite having had
   the same number of superblock writes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/bcachefs_format.h |  4 +++-
 fs/bcachefs/sb-members.c      |  5 +++++
 fs/bcachefs/super-io.c        | 10 ++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 5b44598b9df9..b22868ca9770 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1294,6 +1294,7 @@ struct bch_member {
 	__le64			errors[BCH_MEMBER_ERROR_NR];
 	__le64			errors_at_reset[BCH_MEMBER_ERROR_NR];
 	__le64			errors_reset_time;
+	__le64			seq;
 };
 
 #define BCH_MEMBER_V1_BYTES	56
@@ -1761,7 +1762,8 @@ struct bch_sb {
 	__le32			time_base_hi;
 	__le32			time_precision;
 
-	__le64			flags[8];
+	__le64			flags[7];
+	__le64			write_time;
 	__le64			features[2];
 	__le64			compat[2];
 
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 6a7e20de971c..4c6908cfca05 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -235,6 +235,11 @@ static void member_to_text(struct printbuf *out,
 		prt_printf(out, "(never)");
 	prt_newline(out);
 
+	prt_printf(out, "Last superblock write:");
+	prt_tab(out);
+	prt_u64(out, le64_to_cpu(m.seq));
+	prt_newline(out);
+
 	prt_printf(out, "State:");
 	prt_tab(out);
 	prt_printf(out, "%s",
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index a93e53d0b37e..3ebe14c806ac 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -890,6 +890,11 @@ int bch2_write_super(struct bch_fs *c)
 
 	le64_add_cpu(&c->disk_sb.sb->seq, 1);
 
+	struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+	for_each_online_member(ca, c, i)
+		__bch2_members_v2_get_mut(mi, i)->seq = c->disk_sb.sb->seq;
+	c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
+
 	if (test_bit(BCH_FS_ERROR, &c->flags))
 		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
 	if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
@@ -1193,6 +1198,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
 	prt_printf(out, "%llu", le64_to_cpu(sb->seq));
 	prt_newline(out);
 
+	prt_printf(out, "Time of last write:");
+	prt_tab(out);
+	bch2_prt_date_seconds(out, le64_to_cpu(sb->write_time));
+	prt_newline(out);
+
 	prt_printf(out, "Superblock size:");
 	prt_tab(out);
 	prt_printf(out, "%zu", vstruct_bytes(sb));
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] bcachefs: Split brain detection
  2023-11-02  2:49 [PATCH 0/2] Split brain detection Kent Overstreet
  2023-11-02  2:49 ` [PATCH 1/2] bcachefs: bch_member->seq Kent Overstreet
@ 2023-11-02  2:49 ` Kent Overstreet
  1 sibling, 0 replies; 3+ messages in thread
From: Kent Overstreet @ 2023-11-02  2:49 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

Use the new bch_member->seq and sb->write_time fields to detect split
brain and kick out devices when necessary.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/errcode.h |  1 +
 fs/bcachefs/super.c   | 56 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 2a11f32cf30a..e1f733eeb7f0 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -169,6 +169,7 @@
 	x(EINVAL,			device_size_too_small)			\
 	x(EINVAL,			device_not_a_member_of_filesystem)	\
 	x(EINVAL,			device_has_been_removed)		\
+	x(EINVAL,			device_splitbrain)			\
 	x(EINVAL,			device_already_online)			\
 	x(EINVAL,			insufficient_devices_to_start)		\
 	x(EINVAL,			invalid)				\
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 24672bb31cbe..e6b72ff06f56 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1012,20 +1012,46 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
 	return 0;
 }
 
-static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+static int bch2_dev_in_fs(struct bch_sb_handle *fs,
+			  struct bch_sb_handle *sb)
 {
-	struct bch_sb *newest =
-		le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+	if (fs == sb)
+		return 0;
 
-	if (!uuid_equal(&fs->uuid, &sb->uuid))
+	if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
 		return -BCH_ERR_device_not_a_member_of_filesystem;
 
-	if (!bch2_dev_exists(newest, sb->dev_idx))
+	if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx))
 		return -BCH_ERR_device_has_been_removed;
 
-	if (fs->block_size != sb->block_size)
+	if (fs->sb->block_size != sb->sb->block_size)
 		return -BCH_ERR_mismatched_block_size;
 
+	if (fs->sb->seq == sb->sb->seq &&
+	    fs->sb->write_time != sb->sb->write_time) {
+		pr_err("Split brain detected between %pg and %pg:\n"
+		       "seq (%llu) equal but write_time does not match\n"
+		       "Not using older sb %pg",
+		       sb->bdev, fs->bdev,
+		       le64_to_cpu(sb->sb->seq), sb->bdev);
+		return -BCH_ERR_device_splitbrain;
+	}
+
+	struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
+	u64 seq_from_fs		= le64_to_cpu(m.seq);
+	u64 seq_from_member	= le64_to_cpu(sb->sb->seq);
+
+	if (seq_from_fs && seq_from_fs < seq_from_member) {
+		pr_err("Split brain detected between %pg and %pg:\n"
+		       "%pg believes seq of %pg to be %llu, but %pg has %llu\n"
+		       "Not using %pg",
+		       sb->bdev, fs->bdev,
+		       fs->bdev, sb->bdev, seq_from_fs,
+		       sb->bdev, seq_from_member,
+		       sb->bdev);
+		return -BCH_ERR_device_splitbrain;
+	}
+
 	return 0;
 }
 
@@ -1734,7 +1760,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
 	dev_idx = sb.sb->dev_idx;
 
-	ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
+	ret = bch2_dev_in_fs(&c->disk_sb, &sb);
 	if (ret) {
 		bch_err_msg(c, ret, "bringing %s online", path);
 		goto err;
@@ -1882,6 +1908,12 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
 
 /* Filesystem open: */
 
+static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
+{
+	return  cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
+		cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
+}
+
 struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
 			    struct bch_opts opts)
 {
@@ -1914,19 +1946,21 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
 	}
 
 	darray_for_each(sbs, sb)
-		if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+		if (!best || sb_cmp(sb->sb, best->sb) > 0)
 			best = sb;
 
 	darray_for_each_reverse(sbs, sb) {
-		if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
-			pr_info("%pg has been removed, skipping", sb->bdev);
+		ret = bch2_dev_in_fs(best, sb);
+
+		if (ret == -BCH_ERR_device_has_been_removed ||
+		    ret == -BCH_ERR_device_splitbrain) {
 			bch2_free_super(sb);
 			darray_remove_item(&sbs, sb);
 			best -= best > sb;
+			ret = 0;
 			continue;
 		}
 
-		ret = bch2_dev_in_fs(best->sb, sb->sb);
 		if (ret)
 			goto err_print;
 	}
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-11-02  2:50 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-02  2:49 [PATCH 0/2] Split brain detection Kent Overstreet
2023-11-02  2:49 ` [PATCH 1/2] bcachefs: bch_member->seq Kent Overstreet
2023-11-02  2:49 ` [PATCH 2/2] bcachefs: Split brain detection Kent Overstreet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox