All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] Split brain detection
@ 2023-11-02  2:49 Kent Overstreet
  2023-11-02  2:49 ` [PATCH 1/2] bcachefs: bch_member->seq Kent Overstreet
  2023-11-02  2:49 ` [PATCH 2/2] bcachefs: Split brain detection Kent Overstreet
  0 siblings, 2 replies; 3+ messages in thread
From: Kent Overstreet @ 2023-11-02  2:49 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

A two-patch series for split brain detection: that is, detecting when
different members of a filesystem have been used independently in
degraded mode and have diverged. When that happens, we can no longer
use those members in combination.

The first patch adds superblock facilities: we add a superblock sequence
number field to bch_member, and a write time field to the superblock to
detect cases where sb->seq is equal on two members but they have
actually diverged.

The second patch does the actual split brain detection, kicking out
devices when necessary.

Kent Overstreet (2):
  bcachefs: bch_member->seq
  bcachefs: Split brain detection

 fs/bcachefs/bcachefs_format.h |  4 ++-
 fs/bcachefs/errcode.h         |  1 +
 fs/bcachefs/sb-members.c      |  5 ++++
 fs/bcachefs/super-io.c        | 10 +++++++
 fs/bcachefs/super.c           | 56 ++++++++++++++++++++++++++++-------
 5 files changed, 64 insertions(+), 12 deletions(-)

-- 
2.42.0


^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 1/2] bcachefs: bch_member->seq
  2023-11-02  2:49 [PATCH 0/2] Split brain detection Kent Overstreet
@ 2023-11-02  2:49 ` Kent Overstreet
  2023-11-02  2:49 ` [PATCH 2/2] bcachefs: Split brain detection Kent Overstreet
  1 sibling, 0 replies; 3+ messages in thread
From: Kent Overstreet @ 2023-11-02  2:49 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

Add new fields for split brain detection:

 - bch_member->seq, which tracks the sequence number of the last superblock
   write that happened to each member device

 - bch_sb->write_time, which tracks the time of the last superblock write,
   to allow detection of when two members have diverged but had the same
   number of superblock writes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/bcachefs_format.h |  4 +++-
 fs/bcachefs/sb-members.c      |  5 +++++
 fs/bcachefs/super-io.c        | 10 ++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 5b44598b9df9..b22868ca9770 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1294,6 +1294,7 @@ struct bch_member {
 	__le64			errors[BCH_MEMBER_ERROR_NR];
 	__le64			errors_at_reset[BCH_MEMBER_ERROR_NR];
 	__le64			errors_reset_time;
+	__le64			seq;
 };
 
 #define BCH_MEMBER_V1_BYTES	56
@@ -1761,7 +1762,8 @@ struct bch_sb {
 	__le32			time_base_hi;
 	__le32			time_precision;
 
-	__le64			flags[8];
+	__le64			flags[7];
+	__le64			write_time;
 	__le64			features[2];
 	__le64			compat[2];
 
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 6a7e20de971c..4c6908cfca05 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -235,6 +235,11 @@ static void member_to_text(struct printbuf *out,
 		prt_printf(out, "(never)");
 	prt_newline(out);
 
+	prt_printf(out, "Last superblock write:");
+	prt_tab(out);
+	prt_u64(out, le64_to_cpu(m.seq));
+	prt_newline(out);
+
 	prt_printf(out, "State:");
 	prt_tab(out);
 	prt_printf(out, "%s",
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index a93e53d0b37e..3ebe14c806ac 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -890,6 +890,11 @@ int bch2_write_super(struct bch_fs *c)
 
 	le64_add_cpu(&c->disk_sb.sb->seq, 1);
 
+	struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+	for_each_online_member(ca, c, i)
+		__bch2_members_v2_get_mut(mi, i)->seq = c->disk_sb.sb->seq;
+	c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
+
 	if (test_bit(BCH_FS_ERROR, &c->flags))
 		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
 	if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
@@ -1193,6 +1198,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
 	prt_printf(out, "%llu", le64_to_cpu(sb->seq));
 	prt_newline(out);
 
+	prt_printf(out, "Time of last write:");
+	prt_tab(out);
+	bch2_prt_date_seconds(out, le64_to_cpu(sb->write_time));
+	prt_newline(out);
+
 	prt_printf(out, "Superblock size:");
 	prt_tab(out);
 	prt_printf(out, "%zu", vstruct_bytes(sb));
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] bcachefs: Split brain detection
  2023-11-02  2:49 [PATCH 0/2] Split brain detection Kent Overstreet
  2023-11-02  2:49 ` [PATCH 1/2] bcachefs: bch_member->seq Kent Overstreet
@ 2023-11-02  2:49 ` Kent Overstreet
  1 sibling, 0 replies; 3+ messages in thread
From: Kent Overstreet @ 2023-11-02  2:49 UTC (permalink / raw)
  To: linux-bcachefs; +Cc: Kent Overstreet

Use the new bch_member->seq, sb->write_time fields to detect split brain
and kick out devices when necessary.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/errcode.h |  1 +
 fs/bcachefs/super.c   | 56 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 2a11f32cf30a..e1f733eeb7f0 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -169,6 +169,7 @@
 	x(EINVAL,			device_size_too_small)			\
 	x(EINVAL,			device_not_a_member_of_filesystem)	\
 	x(EINVAL,			device_has_been_removed)		\
+	x(EINVAL,			device_splitbrain)			\
 	x(EINVAL,			device_already_online)			\
 	x(EINVAL,			insufficient_devices_to_start)		\
 	x(EINVAL,			invalid)				\
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 24672bb31cbe..e6b72ff06f56 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1012,20 +1012,46 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
 	return 0;
 }
 
-static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+static int bch2_dev_in_fs(struct bch_sb_handle *fs,
+			  struct bch_sb_handle *sb)
 {
-	struct bch_sb *newest =
-		le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+	if (fs == sb)
+		return 0;
 
-	if (!uuid_equal(&fs->uuid, &sb->uuid))
+	if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
 		return -BCH_ERR_device_not_a_member_of_filesystem;
 
-	if (!bch2_dev_exists(newest, sb->dev_idx))
+	if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx))
 		return -BCH_ERR_device_has_been_removed;
 
-	if (fs->block_size != sb->block_size)
+	if (fs->sb->block_size != sb->sb->block_size)
 		return -BCH_ERR_mismatched_block_size;
 
+	if (fs->sb->seq == sb->sb->seq &&
+	    fs->sb->write_time != sb->sb->write_time) {
+		pr_err("Split brain detected between %pg and %pg:\n"
+		       "seq (%llu) equal but write_time does not match\n"
+		       "Not using older sb %pg",
+		       sb->bdev, fs->bdev,
+		       le64_to_cpu(sb->sb->seq), sb->bdev);
+		return -BCH_ERR_device_splitbrain;
+	}
+
+	struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
+	u64 seq_from_fs		= le64_to_cpu(m.seq);
+	u64 seq_from_member	= le64_to_cpu(sb->sb->seq);
+
+	if (seq_from_fs && seq_from_fs < seq_from_member) {
+		pr_err("Split brain detected between %pg and %pg:\n"
+		       "%pg believes seq of %pg to be %llu, but %pg has %llu\n"
+		       "Not using %pg",
+		       sb->bdev, fs->bdev,
+		       fs->bdev, sb->bdev, seq_from_fs,
+		       sb->bdev, seq_from_member,
+		       sb->bdev);
+		return -BCH_ERR_device_splitbrain;
+	}
+
 	return 0;
 }
 
@@ -1734,7 +1760,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
 	dev_idx = sb.sb->dev_idx;
 
-	ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
+	ret = bch2_dev_in_fs(&c->disk_sb, &sb);
 	if (ret) {
 		bch_err_msg(c, ret, "bringing %s online", path);
 		goto err;
@@ -1882,6 +1908,12 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
 
 /* Filesystem open: */
 
+static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
+{
+	return  cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
+		cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
+}
+
 struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
 			    struct bch_opts opts)
 {
@@ -1914,19 +1946,21 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
 	}
 
 	darray_for_each(sbs, sb)
-		if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+		if (!best || sb_cmp(sb->sb, best->sb) > 0)
 			best = sb;
 
 	darray_for_each_reverse(sbs, sb) {
-		if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
-			pr_info("%pg has been removed, skipping", sb->bdev);
+		ret = bch2_dev_in_fs(best, sb);
+
+		if (ret == -BCH_ERR_device_has_been_removed ||
+		    ret == -BCH_ERR_device_splitbrain) {
 			bch2_free_super(sb);
 			darray_remove_item(&sbs, sb);
 			best -= best > sb;
+			ret = 0;
 			continue;
 		}
 
-		ret = bch2_dev_in_fs(best->sb, sb->sb);
 		if (ret)
 			goto err_print;
 	}
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-11-02  2:50 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-02  2:49 [PATCH 0/2] Split brain detection Kent Overstreet
2023-11-02  2:49 ` [PATCH 1/2] bcachefs: bch_member->seq Kent Overstreet
2023-11-02  2:49 ` [PATCH 2/2] bcachefs: Split brain detection Kent Overstreet

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.