[PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups

Linux cgroups development
 help / color / mirror / Atom feed

* [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups
@ 2026-06-24  6:46 Yu Kuai
  2026-06-24  6:46 ` [PATCH 2/2] ext4: add unraid mount option for single-disk-per-group mode Yu Kuai
                   ` (6 more replies)
  0 siblings, 7 replies; 15+ messages in thread
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel

From: Yu Kuai <yukuai@fnnas.com>

Add a module parameter 'fault_tolerant' that changes how md-linear
handles disk failures. When enabled:

- Disk failures are isolated instead of failing the entire array
- I/O to failed disks returns -EIO while healthy disks continue
- The array remains operational with reduced capacity
- Failed disk count is tracked and shown in /proc/mdstat

This enables unraid-like functionality where individual disk failures
don't bring down the entire array, allowing continued access to data
on healthy disks.

The fault_tolerant parameter can be set at module load time or
dynamically via /sys/module/md_linear/parameters/fault_tolerant.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 drivers/md/md-linear.c | 63 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 8d7b82c4a723..8afc6665cfde 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -2,6 +2,10 @@
 /*
  * linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc
  * ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr>
+ *
+ * Fault-tolerant mode added for unraid-like setups.
+ * When fault_tolerant=1, disk failures are isolated - I/O to failed disks
+ * returns -EIO while healthy disks continue operating normally.
  */

 #include <linux/blkdev.h>
@@ -21,9 +25,15 @@ struct linear_conf {
 	sector_t                array_sectors;
 	/* a copy of mddev->raid_disks */
 	int                     raid_disks;
+	atomic_t		failed_disks;	/* count of failed disks */
 	struct dev_info         disks[] __counted_by(raid_disks);
 };

+static bool fault_tolerant;
+module_param(fault_tolerant, bool, 0644);
+MODULE_PARM_DESC(fault_tolerant,
+	"Enable fault-tolerant mode: isolate disk failures instead of failing array (default: false)");
+
 /*
  * find which device holds a particular offset
  */
@@ -96,6 +106,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 	if (!conf)
 		return ERR_PTR(-ENOMEM);

+	atomic_set(&conf->failed_disks, 0);
+
 	/*
 	 * conf->raid_disks is copy of mddev->raid_disks. The reason to
 	 * keep a copy of mddev->raid_disks in struct linear_conf is,
@@ -251,7 +263,8 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
 		     bio_sector < start_sector))
 		goto out_of_bounds;

-	if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
+	if (unlikely(is_rdev_broken(tmp_dev->rdev) ||
+		     test_bit(Faulty, &tmp_dev->rdev->flags))) {
 		md_error(mddev, tmp_dev->rdev);
 		bio_io_error(bio);
 		return true;
@@ -296,16 +309,47 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)

 static void linear_status(struct seq_file *seq, struct mddev *mddev)
 {
+	struct linear_conf *conf = mddev->private;
+
 	seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
+	if (fault_tolerant) {
+		int failed = atomic_read(&conf->failed_disks);
+
+		seq_puts(seq, " fault-tolerant");
+		if (failed)
+			seq_printf(seq, " [%d failed]", failed);
+	}
 }

 static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
 {
-	if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
-		char *md_name = mdname(mddev);
-
-		pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
-			md_name, rdev->bdev);
+	char *md_name = mdname(mddev);
+
+	if (fault_tolerant) {
+		/*
+		 * Fault-tolerant mode: isolate the failed disk instead of
+		 * failing the entire array. I/O to this disk will return -EIO
+		 * but other disks continue operating normally.
+		 */
+		if (!test_and_set_bit(Faulty, &rdev->flags)) {
+			struct linear_conf *conf = mddev->private;
+
+			atomic_inc(&conf->failed_disks);
+			pr_warn("md/linear%s: Disk failure on %pg detected, isolating device (fault-tolerant mode).\n",
+				md_name, rdev->bdev);
+			pr_warn("md/linear%s: %d disk(s) now failed, array continues with reduced capacity.\n",
+				md_name, atomic_read(&conf->failed_disks));
+			/* Notify userspace about the state change */
+			sysfs_notify_dirent_safe(rdev->sysfs_state);
+		}
+	} else {
+		/*
+		 * Standard mode: fail the entire array on any disk failure.
+		 */
+		if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
+			pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
+				md_name, rdev->bdev);
+		}
 	}
 }

@@ -344,7 +388,7 @@ static void linear_exit(void)
 module_init(linear_init);
 module_exit(linear_exit);
 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD with optional fault-tolerant mode");
 MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
 MODULE_ALIAS("md-linear");
 MODULE_ALIAS("md-level--1");
--
2.43.0

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/2] ext4: add unraid mount option for single-disk-per-group mode
  2026-06-24  6:46 [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups Yu Kuai
@ 2026-06-24  6:46 ` Yu Kuai
  2026-06-24  6:46 ` [PATCH v2 0/4] blk-cgroup: fix blkg list and policy data races Yu Kuai
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel

From: Yu Kuai <yukuai@fnnas.com>

Add support for an "unraid" mount option that enables a special mode
designed for use with fault-tolerant md-linear arrays. In this mode:

1. Variable block groups: Each block group can have a different size,
   allowing one physical disk per group. Lookup tables are used for
   block-to-group mapping instead of fixed-size calculations.

2. Distributed metadata: Every block group has its own superblock and
   group descriptor table copy, enabling the filesystem to remain
   accessible even if some disks fail.

3. Single-group allocation: Files are allocated entirely within a
   single block group. If a group doesn't have enough space, the
   allocation fails with -ENOSPC instead of trying other groups.
   This ensures each file resides on a single physical disk.

4. Inode locality: Inodes are allocated in the same group as their
   parent directory, keeping files and their metadata on the same disk.

This enables unraid-like functionality where:
- Each disk is independent and can be read separately
- Disk failures only affect files on that specific disk
- The filesystem continues operating with reduced capacity

Usage:
  mount -t ext4 -o unraid /dev/md0 /mnt

Note: This requires a specially formatted filesystem where each block
group corresponds to one physical disk. A future mkfs.ext4 extension
will support creating such filesystems.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 fs/ext4/balloc.c  | 45 ++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/ext4.h    | 15 ++++++++++++++-
 fs/ext4/ialloc.c  | 13 +++++++++++++
 fs/ext4/mballoc.c |  8 ++++++++
 fs/ext4/super.c   | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 143 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8040c731b3e4..bd151dc5480b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -54,17 +54,43 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 		ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
 {
-	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
 	ext4_grpblk_t offset;

 	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+
+	/* Unraid mode: binary search through variable-size groups */
+	if (sbi->s_group_first_block) {
+		ext4_group_t lo = 0, hi = sbi->s_groups_count - 1;
+		ext4_fsblk_t first_data = le32_to_cpu(es->s_first_data_block);
+
+		blocknr += first_data; /* restore original block number */
+
+		while (lo < hi) {
+			ext4_group_t mid = (lo + hi + 1) / 2;
+
+			if (blocknr < sbi->s_group_first_block[mid])
+				hi = mid - 1;
+			else
+				lo = mid;
+		}
+		if (blockgrpp)
+			*blockgrpp = lo;
+		if (offsetp) {
+			offset = (blocknr - sbi->s_group_first_block[lo]) >>
+				 sbi->s_cluster_bits;
+			*offsetp = offset;
+		}
+		return;
+	}
+
 	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
-		EXT4_SB(sb)->s_cluster_bits;
+		sbi->s_cluster_bits;
 	if (offsetp)
 		*offsetp = offset;
 	if (blockgrpp)
 		*blockgrpp = blocknr;
-
 }

 /*
@@ -162,8 +188,13 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb,
 static unsigned int num_clusters_in_group(struct super_block *sb,
 					  ext4_group_t block_group)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	unsigned int blocks;

+	/* Unraid mode: use per-group blocks count */
+	if (sbi->s_group_blocks_count)
+		return EXT4_NUM_B2C(sbi, sbi->s_group_blocks_count[block_group]);
+
 	if (block_group == ext4_get_groups_count(sb) - 1) {
 		/*
 		 * Even though mke2fs always initializes the first and
@@ -171,11 +202,11 @@ static unsigned int num_clusters_in_group(struct super_block *sb,
 		 * we need to make sure we calculate the right free
 		 * blocks.
 		 */
-		blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+		blocks = ext4_blocks_count(sbi->s_es) -
 			ext4_group_first_block_no(sb, block_group);
 	} else
 		blocks = EXT4_BLOCKS_PER_GROUP(sb);
-	return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+	return EXT4_NUM_B2C(sbi, blocks);
 }

 /* Initializes an uninitialized block bitmap */
@@ -855,6 +886,13 @@ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;

+	/*
+	 * Unraid mode: every group has a superblock copy for fault tolerance.
+	 * This allows mounting the filesystem even if some disks fail.
+	 */
+	if (test_opt2(sb, UNRAID))
+		return 1;
+
 	if (group == 0)
 		return 1;
 	if (ext4_has_feature_sparse_super2(sb)) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 56112f201cac..063e37a82654 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1295,6 +1295,9 @@ struct ext4_inode_info {
 						    * scanning in mballoc
 						    */
 #define EXT4_MOUNT2_ABORT		0x00000100 /* Abort filesystem */
+#define EXT4_MOUNT2_UNRAID		0x00000200 /* Unraid mode: one disk per
+						    * group, single-group alloc
+						    */

 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1687,6 +1690,10 @@ struct ext4_sb_info {
 	struct flex_groups * __rcu *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;

+	/* Unraid mode: variable block groups (one disk per group) */
+	ext4_fsblk_t *s_group_first_block;	/* First block of each group */
+	ext4_grpblk_t *s_group_blocks_count;	/* Blocks count per group */
+
 	/* workqueue for reserved extent conversions (buffered io) */
 	struct workqueue_struct *rsv_conversion_wq;

@@ -2627,8 +2634,14 @@ struct dir_private_info {
 static inline ext4_fsblk_t
 ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/* Unraid mode: variable block groups, use lookup table */
+	if (sbi->s_group_first_block)
+		return sbi->s_group_first_block[group_no];
+
 	return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
-		le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+		le32_to_cpu(sbi->s_es->s_first_data_block);
 }

 /*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b20a1bf866ab..98fda602073e 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -438,6 +438,19 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	int flex_size = ext4_flex_bg_size(sbi);
 	struct dx_hash_info hinfo;

+	/*
+	 * Unraid mode: always allocate inode in parent's group.
+	 * This ensures files and their inodes stay on the same disk.
+	 */
+	if (test_opt2(sb, UNRAID)) {
+		desc = ext4_get_group_desc(sb, parent_group, NULL);
+		if (desc && ext4_free_inodes_count(sb, desc) > 0) {
+			*group = parent_group;
+			return 0;
+		}
+		return -1; /* No free inodes in parent's group */
+	}
+
 	ngroups = real_ngroups;
 	if (flex_size > 1) {
 		ngroups = (real_ngroups + flex_size - 1) >>
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 56d50fd3310b..9de674ec2f77 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2997,6 +2997,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	if (err || ac->ac_status == AC_STATUS_FOUND)
 		goto out;

+	/*
+	 * Unraid mode: files must be allocated entirely within a single group.
+	 * If the goal group doesn't have enough space, fail with -ENOSPC
+	 * instead of trying other groups.
+	 */
+	if (test_opt2(sb, UNRAID))
+		goto out;
+
 	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
 		goto out;

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 87205660c5d0..9534a4ffbee7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1255,6 +1255,12 @@ static void ext4_group_desc_free(struct ext4_sb_info *sbi)
 		brelse(group_desc[i]);
 	kvfree(group_desc);
 	rcu_read_unlock();
+
+	/* Free unraid mode arrays */
+	kvfree(sbi->s_group_first_block);
+	kvfree(sbi->s_group_blocks_count);
+	sbi->s_group_first_block = NULL;
+	sbi->s_group_blocks_count = NULL;
 }

 static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
@@ -1677,6 +1683,7 @@ enum {
 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
 	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
 	Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
+	Opt_unraid,
 #ifdef CONFIG_EXT4_DEBUG
 	Opt_fc_debug_max_replay, Opt_fc_debug_force
 #endif
@@ -1819,6 +1826,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
 	fsparam_flag	("reservation",		Opt_removed),	/* mount option from ext2/3 */
 	fsparam_flag	("noreservation",	Opt_removed),	/* mount option from ext2/3 */
 	fsparam_u32	("journal",		Opt_removed),	/* mount option from ext2/3 */
+	fsparam_flag	("unraid",		Opt_unraid),
 	{}
 };

@@ -1912,6 +1920,7 @@ static const struct mount_opts {
 	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
 #endif
 	{Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
+	{Opt_unraid, EXT4_MOUNT2_UNRAID, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
 	{Opt_err, 0, 0}
 };

@@ -4845,6 +4854,65 @@ static int ext4_check_geometry(struct super_block *sb,
 	return 0;
 }

+/*
+ * Initialize unraid mode data structures.
+ * In unraid mode, each block group can have a different size (one disk per group).
+ * This function allocates and populates the lookup tables for variable-size groups.
+ *
+ * For now, this uses the standard fixed-size groups from the superblock.
+ * A future mkfs extension will store per-group sizes in the group descriptors.
+ */
+static int ext4_unraid_init(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t ngroups = sbi->s_groups_count;
+	ext4_fsblk_t first_data_block;
+	ext4_group_t i;
+
+	if (!test_opt2(sb, UNRAID))
+		return 0;
+
+	sbi->s_group_first_block = kvmalloc_array(ngroups,
+						  sizeof(ext4_fsblk_t),
+						  GFP_KERNEL);
+	if (!sbi->s_group_first_block)
+		return -ENOMEM;
+
+	sbi->s_group_blocks_count = kvmalloc_array(ngroups,
+						   sizeof(ext4_grpblk_t),
+						   GFP_KERNEL);
+	if (!sbi->s_group_blocks_count) {
+		kvfree(sbi->s_group_first_block);
+		sbi->s_group_first_block = NULL;
+		return -ENOMEM;
+	}
+
+	/*
+	 * Initialize with standard fixed-size groups for now.
+	 * TODO: Read per-group sizes from extended group descriptors
+	 * when mkfs supports creating variable-size groups.
+	 */
+	first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	for (i = 0; i < ngroups; i++) {
+		sbi->s_group_first_block[i] = first_data_block +
+			(ext4_fsblk_t)i * EXT4_BLOCKS_PER_GROUP(sb);
+
+		if (i == ngroups - 1) {
+			/* Last group may be smaller */
+			sbi->s_group_blocks_count[i] =
+				ext4_blocks_count(sbi->s_es) -
+				sbi->s_group_first_block[i];
+		} else {
+			sbi->s_group_blocks_count[i] = EXT4_BLOCKS_PER_GROUP(sb);
+		}
+	}
+
+	ext4_msg(sb, KERN_INFO, "unraid mode enabled: %u groups",
+		 ngroups);
+
+	return 0;
+}
+
 static int ext4_group_desc_init(struct super_block *sb,
 				struct ext4_super_block *es,
 				ext4_fsblk_t logical_sb_block,
@@ -4904,7 +4972,8 @@ static int ext4_group_desc_init(struct super_block *sb,
 		return -EFSCORRUPTED;
 	}

-	return 0;
+	/* Initialize unraid mode data structures if enabled */
+	return ext4_unraid_init(sb);
 }

 static int ext4_load_and_init_journal(struct super_block *sb,
--
2.43.0

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH v2 0/4] blk-cgroup: fix blkg list and policy data races
  2026-06-24  6:46 [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups Yu Kuai
  2026-06-24  6:46 ` [PATCH 2/2] ext4: add unraid mount option for single-disk-per-group mode Yu Kuai
@ 2026-06-24  6:46 ` Yu Kuai
  2026-06-24  6:46 ` [PATCH v2 1/4] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex Yu Kuai
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel

From: Yu Kuai <yukuai@fygo.io>

Hi,

This series fixes races around q->blkg_list and blkg policy data
lifetime.

Patch 1 protects blkg_destroy_all()'s q->blkg_list walk with
blkcg_mutex.

Patches 2-3 fix races between blkcg_activate_policy() and concurrent
blkg destruction.

Patch 4 factors the policy data teardown loop into a helper after the
race fixes.

Changes since v1:
- Drop the BFQ q->blkg_list patch because the current block tree already
  has a stronger fix in commit 17b2d950a3c0 ("block, bfq: protect async
  queue reset with blkcg locks").
- Add Reviewed-by tags from Tang Yizhou.

Yu Kuai (1):
  blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with
    blkcg_mutex

Zheng Qixing (3):
  blk-cgroup: fix race between policy activation and blkg destruction
  blk-cgroup: skip dying blkg in blkcg_activate_policy()
  blk-cgroup: factor policy pd teardown loop into helper

 block/blk-cgroup.c | 65 +++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

-- 
2.51.0

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH v2 1/4] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex
  2026-06-24  6:46 [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups Yu Kuai
  2026-06-24  6:46 ` [PATCH 2/2] ext4: add unraid mount option for single-disk-per-group mode Yu Kuai
  2026-06-24  6:46 ` [PATCH v2 0/4] blk-cgroup: fix blkg list and policy data races Yu Kuai
@ 2026-06-24  6:46 ` Yu Kuai
  2026-06-24  6:46 ` [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction Yu Kuai
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 15+ messages in thread
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel

From: Yu Kuai <yukuai@fygo.io>

blkg_destroy_all() iterates q->blkg_list without holding blkcg_mutex,
which can race with blkg_free_workfn() that removes blkgs from the list
while holding blkcg_mutex.

Add blkcg_mutex protection around the q->blkg_list iteration to prevent
potential list corruption or use-after-free issues.

Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-cgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ee076ab795d3..7baccfb690fe 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -574,10 +574,11 @@ static void blkg_destroy_all(struct gendisk *disk)
 	struct blkcg_gq *blkg;
 	int count = BLKG_DESTROY_BATCH_SIZE;
 	int i;
 
 restart:
+	mutex_lock(&q->blkcg_mutex);
 	spin_lock_irq(&q->queue_lock);
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
 		struct blkcg *blkcg = blkg->blkcg;
 
 		if (hlist_unhashed(&blkg->blkcg_node))
@@ -592,10 +593,11 @@ static void blkg_destroy_all(struct gendisk *disk)
 		 * it when a batch of blkgs are destroyed.
 		 */
 		if (!(--count)) {
 			count = BLKG_DESTROY_BATCH_SIZE;
 			spin_unlock_irq(&q->queue_lock);
+			mutex_unlock(&q->blkcg_mutex);
 			cond_resched();
 			goto restart;
 		}
 	}
 
@@ -611,10 +613,11 @@ static void blkg_destroy_all(struct gendisk *disk)
 			__clear_bit(pol->plid, q->blkcg_pols);
 	}
 
 	q->root_blkg = NULL;
 	spin_unlock_irq(&q->queue_lock);
+	mutex_unlock(&q->blkcg_mutex);
 
 	wake_up_var(&q->root_blkg);
 }
 
 static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction
  2026-06-24  6:46 [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups Yu Kuai
                   ` (2 preceding siblings ...)
  2026-06-24  6:46 ` [PATCH v2 1/4] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex Yu Kuai
@ 2026-06-24  6:46 ` Yu Kuai
  2026-06-25 15:08   ` Nilay Shroff
  2026-06-24  6:46 ` [PATCH v2 3/4] blk-cgroup: skip dying blkg in blkcg_activate_policy() Yu Kuai
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 15+ messages in thread
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel

From: Zheng Qixing <zhengqixing@huawei.com>

When switching an IO scheduler on a block device, blkcg_activate_policy()
allocates blkg_policy_data (pd) for all blkgs attached to the queue.
However, blkcg_activate_policy() may race with concurrent blkcg deletion,
leading to use-after-free and memory leak issues.

The use-after-free occurs in the following race:

T1 (blkcg_activate_policy):
  - Successfully allocates pd for blkg1 (loop0->queue, blkcgA)
  - Fails to allocate pd for blkg2 (loop0->queue, blkcgB)
  - Enters the enomem rollback path to release blkg1 resources

T2 (blkcg deletion):
  - blkcgA is deleted concurrently
  - blkg1 is freed via blkg_free_workfn()
  - blkg1->pd is freed

T1 (continued):
  - Rollback path accesses blkg1->pd->online after pd is freed
  - Triggers use-after-free

In addition, blkg_free_workfn() frees pd before removing the blkg from
q->blkg_list. This allows blkcg_activate_policy() to allocate a new pd
for a blkg that is being destroyed, leaving the newly allocated pd
unreachable when the blkg is finally freed.

Fix these races by extending blkcg_mutex coverage to serialize
blkcg_activate_policy() rollback and blkg destruction, ensuring pd
lifecycle is synchronized with blkg list visibility.

Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-cgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7baccfb690fe..f7e788a7fe95 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1563,10 +1563,12 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
 		return -EINVAL;
 
 	if (queue_is_mq(q))
 		memflags = blk_mq_freeze_queue(q);
+
+	mutex_lock(&q->blkcg_mutex);
 retry:
 	spin_lock_irq(&q->queue_lock);
 
 	/* blkg_list is pushed at the head, reverse walk to initialize parents first */
 	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
@@ -1625,10 +1627,11 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	__set_bit(pol->plid, q->blkcg_pols);
 	ret = 0;
 
 	spin_unlock_irq(&q->queue_lock);
 out:
+	mutex_unlock(&q->blkcg_mutex);
 	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q, memflags);
 	if (pinned_blkg)
 		blkg_put(pinned_blkg);
 	if (pd_prealloc)
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction
  2026-06-24  6:46 ` [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction Yu Kuai
@ 2026-06-25 15:08   ` Nilay Shroff
  2026-06-26  1:50     ` yu kuai
  0 siblings, 1 reply; 15+ messages in thread
From: Nilay Shroff @ 2026-06-25 15:08 UTC (permalink / raw)
  To: Yu Kuai, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Ming Lei, cgroups,
	linux-block, linux-kernel

On 6/24/26 12:16 PM, Yu Kuai wrote:
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 7baccfb690fe..f7e788a7fe95 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -1563,10 +1563,12 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
>   	if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
>   		return -EINVAL;
>   
>   	if (queue_is_mq(q))
>   		memflags = blk_mq_freeze_queue(q);
> +
> +	mutex_lock(&q->blkcg_mutex);
>   retry:
>   	spin_lock_irq(&q->queue_lock);
>   
>   	/* blkg_list is pushed at the head, reverse walk to initialize parents first */
>   	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
> @@ -1625,10 +1627,11 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
>   	__set_bit(pol->plid, q->blkcg_pols);
>   	ret = 0;
>   
>   	spin_unlock_irq(&q->queue_lock);
>   out:
> +	mutex_unlock(&q->blkcg_mutex);
>   	if (queue_is_mq(q))
>   		blk_mq_unfreeze_queue(q, memflags);
>   	if (pinned_blkg)
>   		blkg_put(pinned_blkg);
>   	if (pd_prealloc)

If the policy allocation fails, we jump to the lable enomem: and teardown pds.
But I see this path still only acquires ->queue_lock. Don't we also need
to protect it with ->blkcg_mutex?

Moreover I still see race between blkg insertion in blkg_create() which
still doesn't use ->blkcg_mutex and so list traversal in bfq_end_wr_async()
may still race with blkg_create(), isn't it? I remember you once told
this will be handled in another series but I couldn't find that yet.

Thanks,
--Nilay




^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction
  2026-06-25 15:08   ` Nilay Shroff
@ 2026-06-26  1:50     ` yu kuai
  2026-06-26  1:52       ` yu kuai
  0 siblings, 1 reply; 15+ messages in thread
From: yu kuai @ 2026-06-26  1:50 UTC (permalink / raw)
  To: Nilay Shroff, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Ming Lei, cgroups,
	linux-block, linux-kernel, yukuai

Hi,

在 2026/6/25 23:08, Nilay Shroff 写道:
> On 6/24/26 12:16 PM, Yu Kuai wrote:
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 7baccfb690fe..f7e788a7fe95 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -1563,10 +1563,12 @@ int blkcg_activate_policy(struct gendisk 
>> *disk, const struct blkcg_policy *pol)
>>       if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
>>           return -EINVAL;
>>         if (queue_is_mq(q))
>>           memflags = blk_mq_freeze_queue(q);
>> +
>> +    mutex_lock(&q->blkcg_mutex);
>>   retry:
>>       spin_lock_irq(&q->queue_lock);
>>         /* blkg_list is pushed at the head, reverse walk to 
>> initialize parents first */
>>       list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
>> @@ -1625,10 +1627,11 @@ int blkcg_activate_policy(struct gendisk 
>> *disk, const struct blkcg_policy *pol)
>>       __set_bit(pol->plid, q->blkcg_pols);
>>       ret = 0;
>>         spin_unlock_irq(&q->queue_lock);
>>   out:
>> +    mutex_unlock(&q->blkcg_mutex);
>>       if (queue_is_mq(q))
>>           blk_mq_unfreeze_queue(q, memflags);
>>       if (pinned_blkg)
>>           blkg_put(pinned_blkg);
>>       if (pd_prealloc)
>
> If the policy allocation fails, we jump to the lable enomem: and 
> teardown pds.
> But I see this path still only acquires ->queue_lock. Don't we also need
> to protect it with ->blkcg_mutex?

Yes, I agree we should protect it as well.

>
> Moreover I still see race between blkg insertion in blkg_create() which
> still doesn't use ->blkcg_mutex and so list traversal in 
> bfq_end_wr_async()
> may still race with blkg_create(), isn't it? I remember you once told
> this will be handled in another series but I couldn't find that yet.

This is the set:

[PATCH 0/8] blk-cgroup: remove queue_lock nesting from blkcg paths - Yu 
Kuai <https://lore.kernel.org/all/cover.1780621988.git.yukuai@fygo.io/>

Noted that set just make sure queue_lock is not nested under other atomic
context, and that set do not acquire blkcg_mutex for blkg_create() yet. Howerver,
with that set it'll be easy to convert all queue_lock to blkcg_mtuex for blkg
protection, and together with lots of blk-cgroup code cleanups.


>
> Thanks,
> --Nilay
>
>
>
>
-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction
  2026-06-26  1:50     ` yu kuai
@ 2026-06-26  1:52       ` yu kuai
  2026-06-26  6:12         ` Nilay Shroff
  0 siblings, 1 reply; 15+ messages in thread
From: yu kuai @ 2026-06-26  1:52 UTC (permalink / raw)
  To: Nilay Shroff, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Ming Lei, cgroups,
	linux-block, linux-kernel, yukuai

Hi,

在 2026/6/26 9:50, yu kuai 写道:
> Hi,
>
> 在 2026/6/25 23:08, Nilay Shroff 写道:
>> On 6/24/26 12:16 PM, Yu Kuai wrote:
>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>> index 7baccfb690fe..f7e788a7fe95 100644
>>> --- a/block/blk-cgroup.c
>>> +++ b/block/blk-cgroup.c
>>> @@ -1563,10 +1563,12 @@ int blkcg_activate_policy(struct gendisk
>>> *disk, const struct blkcg_policy *pol)
>>>        if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
>>>            return -EINVAL;
>>>          if (queue_is_mq(q))
>>>            memflags = blk_mq_freeze_queue(q);
>>> +
>>> +    mutex_lock(&q->blkcg_mutex);
>>>    retry:
>>>        spin_lock_irq(&q->queue_lock);
>>>          /* blkg_list is pushed at the head, reverse walk to
>>> initialize parents first */
>>>        list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
>>> @@ -1625,10 +1627,11 @@ int blkcg_activate_policy(struct gendisk
>>> *disk, const struct blkcg_policy *pol)
>>>        __set_bit(pol->plid, q->blkcg_pols);
>>>        ret = 0;
>>>          spin_unlock_irq(&q->queue_lock);
>>>    out:
>>> +    mutex_unlock(&q->blkcg_mutex);
>>>        if (queue_is_mq(q))
>>>            blk_mq_unfreeze_queue(q, memflags);
>>>        if (pinned_blkg)
>>>            blkg_put(pinned_blkg);
>>>        if (pd_prealloc)
>> If the policy allocation fails, we jump to the lable enomem: and
>> teardown pds.
>> But I see this path still only acquires ->queue_lock. Don't we also need
>> to protect it with ->blkcg_mutex?
> Yes, I agree we should protect it as well.

Just take a closer look at the code, the enomem is already protected by
blkcg_mutex :)

>
>> Moreover I still see race between blkg insertion in blkg_create() which
>> still doesn't use ->blkcg_mutex and so list traversal in
>> bfq_end_wr_async()
>> may still race with blkg_create(), isn't it? I remember you once told
>> this will be handled in another series but I couldn't find that yet.
> This is the set:
>
> [PATCH 0/8] blk-cgroup: remove queue_lock nesting from blkcg paths - Yu
> Kuai <https://lore.kernel.org/all/cover.1780621988.git.yukuai@fygo.io/>
>
> Noted that set just make sure queue_lock is not nested under other atomic
> context, and that set do not acquire blkcg_mutex for blkg_create() yet. Howerver,
> with that set it'll be easy to convert all queue_lock to blkcg_mtuex for blkg
> protection, and together with lots of blk-cgroup code cleanups.
>
>
>> Thanks,
>> --Nilay
>>
>>
>>
>>
-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction
  2026-06-26  1:52       ` yu kuai
@ 2026-06-26  6:12         ` Nilay Shroff
  2026-06-27  4:13           ` yu kuai
  0 siblings, 1 reply; 15+ messages in thread
From: Nilay Shroff @ 2026-06-26  6:12 UTC (permalink / raw)
  To: yukuai, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Ming Lei, cgroups,
	linux-block, linux-kernel

On 6/26/26 7:22 AM, yu kuai wrote:
> Hi,
> 
> 在 2026/6/26 9:50, yu kuai 写道:
>> Hi,
>>
>> 在 2026/6/25 23:08, Nilay Shroff 写道:
>>> On 6/24/26 12:16 PM, Yu Kuai wrote:
>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>>> index 7baccfb690fe..f7e788a7fe95 100644
>>>> --- a/block/blk-cgroup.c
>>>> +++ b/block/blk-cgroup.c
>>>> @@ -1563,10 +1563,12 @@ int blkcg_activate_policy(struct gendisk
>>>> *disk, const struct blkcg_policy *pol)
>>>>         if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
>>>>             return -EINVAL;
>>>>           if (queue_is_mq(q))
>>>>             memflags = blk_mq_freeze_queue(q);
>>>> +
>>>> +    mutex_lock(&q->blkcg_mutex);
>>>>     retry:
>>>>         spin_lock_irq(&q->queue_lock);
>>>>           /* blkg_list is pushed at the head, reverse walk to
>>>> initialize parents first */
>>>>         list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
>>>> @@ -1625,10 +1627,11 @@ int blkcg_activate_policy(struct gendisk
>>>> *disk, const struct blkcg_policy *pol)
>>>>         __set_bit(pol->plid, q->blkcg_pols);
>>>>         ret = 0;
>>>>           spin_unlock_irq(&q->queue_lock);
>>>>     out:
>>>> +    mutex_unlock(&q->blkcg_mutex);
>>>>         if (queue_is_mq(q))
>>>>             blk_mq_unfreeze_queue(q, memflags);
>>>>         if (pinned_blkg)
>>>>             blkg_put(pinned_blkg);
>>>>         if (pd_prealloc)
>>> If the policy allocation fails, we jump to the lable enomem: and
>>> teardown pds.
>>> But I see this path still only acquires ->queue_lock. Don't we also need
>>> to protect it with ->blkcg_mutex?
>> Yes, I agree we should protect it as well.
> 
> Just take a closer look at the code, the enomem is already protected by
> blkcg_mutex :)
> 

Oh yes, but the ->blkcg_mutex is never released if we jump to enomem.
So that may potentially cause deadlock. We need to release ->blkcg_mutex
once blkcg_policy_teardown_pds() returns. Or may be refactor code (or add
comment) so that it's easy to realize or spot the ->blkcg_mutex is acquired
and then released around blkcg_policy_teardown_pds().

>>
>>> Moreover I still see race between blkg insertion in blkg_create() which
>>> still doesn't use ->blkcg_mutex and so list traversal in
>>> bfq_end_wr_async()
>>> may still race with blkg_create(), isn't it? I remember you once told
>>> this will be handled in another series but I couldn't find that yet.
>> This is the set:
>>
>> [PATCH 0/8] blk-cgroup: remove queue_lock nesting from blkcg paths - Yu
>> Kuai <https://lore.kernel.org/all/cover.1780621988.git.yukuai@fygo.io/>
>>
>> Noted that set just make sure queue_lock is not nested under other atomic
>> context, and that set do not acquire blkcg_mutex for blkg_create() yet. Howerver,
>> with that set it'll be easy to convert all queue_lock to blkcg_mtuex for blkg
>> protection, and together with lots of blk-cgroup code cleanups.
>>

Okay, so are you planning to send a follow-up patchset that replaces ->queue_lock
with ->blkcg_mutex for protecting the blkg_list? If so, I'd still prefer acquiring
->blkcg_mutex around blkg_create() in this patchset. That would address the race
between blkg_create() and the blkg_list traversal in bfq_end_wr_async(), while the
subsequent series can focus on cleaning up and removing the remaining ->queue_lock
usage for blkg protection.

Thanks,
--Nilay


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction
  2026-06-26  6:12         ` Nilay Shroff
@ 2026-06-27  4:13           ` yu kuai
  2026-06-29  5:33             ` Nilay Shroff
  0 siblings, 1 reply; 15+ messages in thread
From: yu kuai @ 2026-06-27  4:13 UTC (permalink / raw)
  To: Nilay Shroff, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Ming Lei, cgroups,
	linux-block, linux-kernel, yukuai

Hi,

在 2026/6/26 14:12, Nilay Shroff 写道:
> On 6/26/26 7:22 AM, yu kuai wrote:
>> Hi,
>>
>> 在 2026/6/26 9:50, yu kuai 写道:
>>> Hi,
>>>
>>> 在 2026/6/25 23:08, Nilay Shroff 写道:
>>>> On 6/24/26 12:16 PM, Yu Kuai wrote:
>>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>>>> index 7baccfb690fe..f7e788a7fe95 100644
>>>>> --- a/block/blk-cgroup.c
>>>>> +++ b/block/blk-cgroup.c
>>>>> @@ -1563,10 +1563,12 @@ int blkcg_activate_policy(struct gendisk
>>>>> *disk, const struct blkcg_policy *pol)
>>>>>         if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
>>>>>             return -EINVAL;
>>>>>           if (queue_is_mq(q))
>>>>>             memflags = blk_mq_freeze_queue(q);
>>>>> +
>>>>> +    mutex_lock(&q->blkcg_mutex);
>>>>>     retry:
>>>>>         spin_lock_irq(&q->queue_lock);
>>>>>           /* blkg_list is pushed at the head, reverse walk to
>>>>> initialize parents first */
>>>>>         list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
>>>>> @@ -1625,10 +1627,11 @@ int blkcg_activate_policy(struct gendisk
>>>>> *disk, const struct blkcg_policy *pol)
>>>>>         __set_bit(pol->plid, q->blkcg_pols);
>>>>>         ret = 0;
>>>>>           spin_unlock_irq(&q->queue_lock);
>>>>>     out:
>>>>> +    mutex_unlock(&q->blkcg_mutex);
>>>>>         if (queue_is_mq(q))
>>>>>             blk_mq_unfreeze_queue(q, memflags);
>>>>>         if (pinned_blkg)
>>>>>             blkg_put(pinned_blkg);
>>>>>         if (pd_prealloc)
>>>> If the policy allocation fails, we jump to the lable enomem: and
>>>> teardown pds.
>>>> But I see this path still only acquires ->queue_lock. Don't we also 
>>>> need
>>>> to protect it with ->blkcg_mutex?
>>> Yes, I agree we should protect it as well.
>>
>> Just take a closer look at the code, the enomem is already protected by
>> blkcg_mutex :)
>>
>
> Oh yes, but the ->blkcg_mutex is never released if we jump to enomem.
> So that may potentially cause deadlock. We need to release ->blkcg_mutex
> once blkcg_policy_teardown_pds() returns. Or may be refactor code (or add
> comment) so that it's easy to realize or spot the ->blkcg_mutex is 
> acquired
> and then released around blkcg_policy_teardown_pds().

the enomem will goto out at last, and blkcg_mutex do released. The code is
a bit hacky.

>
>>>
>>>> Moreover I still see race between blkg insertion in blkg_create() 
>>>> which
>>>> still doesn't use ->blkcg_mutex and so list traversal in
>>>> bfq_end_wr_async()
>>>> may still race with blkg_create(), isn't it? I remember you once told
>>>> this will be handled in another series but I couldn't find that yet.
>>> This is the set:
>>>
>>> [PATCH 0/8] blk-cgroup: remove queue_lock nesting from blkcg paths - Yu
>>> Kuai <https://lore.kernel.org/all/cover.1780621988.git.yukuai@fygo.io/>
>>>
>>> Noted that set just make sure queue_lock is not nested under other 
>>> atomic
>>> context, and that set do not acquire blkcg_mutex for blkg_create() 
>>> yet. Howerver,
>>> with that set it'll be easy to convert all queue_lock to blkcg_mtuex 
>>> for blkg
>>> protection, and together with lots of blk-cgroup code cleanups.
>>>
>
> Okay, so are you planning to send a follow-up patchset that replaces 
> ->queue_lock
> with ->blkcg_mutex for protecting the blkg_list? If so, I'd still 
> prefer acquiring
> ->blkcg_mutex around blkg_create() in this patchset. That would 
> address the race
> between blkg_create() and the blkg_list traversal in 
> bfq_end_wr_async(), while the
> subsequent series can focus on cleaning up and removing the remaining 
> ->queue_lock
> usage for blkg protection.

Yes, there is a follow-up patchset. When this set was posted, blkg_create is still
called with queue_lock held, so I can't do that. However, not that the other set
is already applied, I can hold blkcg_mutex for blkg_create() now.

>
> Thanks,
> --Nilay
>
-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction
  2026-06-27  4:13           ` yu kuai
@ 2026-06-29  5:33             ` Nilay Shroff
  2026-06-29  9:03               ` yu kuai
  0 siblings, 1 reply; 15+ messages in thread
From: Nilay Shroff @ 2026-06-29  5:33 UTC (permalink / raw)
  To: yukuai, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Ming Lei, cgroups,
	linux-block, linux-kernel

On 6/27/26 9:43 AM, yu kuai wrote:
> Hi,
> 
> 在 2026/6/26 14:12, Nilay Shroff 写道:
>> On 6/26/26 7:22 AM, yu kuai wrote:
>>> Hi,
>>>
>>> 在 2026/6/26 9:50, yu kuai 写道:
>>>> Hi,
>>>>
>>>> 在 2026/6/25 23:08, Nilay Shroff 写道:
>>>>> On 6/24/26 12:16 PM, Yu Kuai wrote:
>>>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>>>>> index 7baccfb690fe..f7e788a7fe95 100644
>>>>>> --- a/block/blk-cgroup.c
>>>>>> +++ b/block/blk-cgroup.c
>>>>>> @@ -1563,10 +1563,12 @@ int blkcg_activate_policy(struct gendisk
>>>>>> *disk, const struct blkcg_policy *pol)
>>>>>>          if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
>>>>>>              return -EINVAL;
>>>>>>            if (queue_is_mq(q))
>>>>>>              memflags = blk_mq_freeze_queue(q);
>>>>>> +
>>>>>> +    mutex_lock(&q->blkcg_mutex);
>>>>>>      retry:
>>>>>>          spin_lock_irq(&q->queue_lock);
>>>>>>            /* blkg_list is pushed at the head, reverse walk to
>>>>>> initialize parents first */
>>>>>>          list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
>>>>>> @@ -1625,10 +1627,11 @@ int blkcg_activate_policy(struct gendisk
>>>>>> *disk, const struct blkcg_policy *pol)
>>>>>>          __set_bit(pol->plid, q->blkcg_pols);
>>>>>>          ret = 0;
>>>>>>            spin_unlock_irq(&q->queue_lock);
>>>>>>      out:
>>>>>> +    mutex_unlock(&q->blkcg_mutex);
>>>>>>          if (queue_is_mq(q))
>>>>>>              blk_mq_unfreeze_queue(q, memflags);
>>>>>>          if (pinned_blkg)
>>>>>>              blkg_put(pinned_blkg);
>>>>>>          if (pd_prealloc)
>>>>> If the policy allocation fails, we jump to the lable enomem: and
>>>>> teardown pds.
>>>>> But I see this path still only acquires ->queue_lock. Don't we also
>>>>> need
>>>>> to protect it with ->blkcg_mutex?
>>>> Yes, I agree we should protect it as well.
>>>
>>> Just take a closer look at the code, the enomem is already protected by
>>> blkcg_mutex :)
>>>
>>
>> Oh yes, but the ->blkcg_mutex is never released if we jump to enomem.
>> So that may potentially cause deadlock. We need to release ->blkcg_mutex
>> once blkcg_policy_teardown_pds() returns. Or may be refactor code (or add
>> comment) so that it's easy to realize or spot the ->blkcg_mutex is
>> acquired
>> and then released around blkcg_policy_teardown_pds().
> 
> the enomem will goto out at last, and blkcg_mutex do released. The code is
> a bit hacky.
> 
>>
>>>>
>>>>> Moreover I still see race between blkg insertion in blkg_create()
>>>>> which
>>>>> still doesn't use ->blkcg_mutex and so list traversal in
>>>>> bfq_end_wr_async()
>>>>> may still race with blkg_create(), isn't it? I remember you once told
>>>>> this will be handled in another series but I couldn't find that yet.
>>>> This is the set:
>>>>
>>>> [PATCH 0/8] blk-cgroup: remove queue_lock nesting from blkcg paths - Yu
>>>> Kuai <https://lore.kernel.org/all/cover.1780621988.git.yukuai@fygo.io/>
>>>>
>>>> Noted that set just make sure queue_lock is not nested under other
>>>> atomic
>>>> context, and that set do not acquire blkcg_mutex for blkg_create()
>>>> yet. Howerver,
>>>> with that set it'll be easy to convert all queue_lock to blkcg_mtuex
>>>> for blkg
>>>> protection, and together with lots of blk-cgroup code cleanups.
>>>>
>>
>> Okay, so are you planning to send a follow-up patchset that replaces
>> ->queue_lock
>> with ->blkcg_mutex for protecting the blkg_list? If so, I'd still
>> prefer acquiring
>> ->blkcg_mutex around blkg_create() in this patchset. That would
>> address the race
>> between blkg_create() and the blkg_list traversal in
>> bfq_end_wr_async(), while the
>> subsequent series can focus on cleaning up and removing the remaining
>> ->queue_lock
>> usage for blkg protection.
> 
> Yes, there is a follow-up patchset. When this set was posted, blkg_create is still
> called with queue_lock held, so I can't do that. However, not that the other set
> is already applied, I can hold blkcg_mutex for blkg_create() now.
> 

If you already have a follow-up patchset that replaces ->queue_lock with
->blkcg_mutex for protecting blkg_list, then I think it might make sense
to send that out first (if you haven't already) and hold off this series for
now. That way, the blkg_create() race can be addressed first, and this
series can build on top of those changes.

BTW, if that follow-up series is merged first, I suspect the first two patches in
this series may no longer be necessary, leaving only patches 3/4 and 4/4.

Thanks,
--Nilay



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction
  2026-06-29  5:33             ` Nilay Shroff
@ 2026-06-29  9:03               ` yu kuai
  0 siblings, 0 replies; 15+ messages in thread
From: yu kuai @ 2026-06-29  9:03 UTC (permalink / raw)
  To: Nilay Shroff, yukuai, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Ming Lei, cgroups,
	linux-block, linux-kernel

Hi,

在 2026/6/29 13:33, Nilay Shroff 写道:
> On 6/27/26 9:43 AM, yu kuai wrote:
>> Hi,
>>
>> 在 2026/6/26 14:12, Nilay Shroff 写道:
>>> On 6/26/26 7:22 AM, yu kuai wrote:
>>>> Hi,
>>>>
>>>> 在 2026/6/26 9:50, yu kuai 写道:
>>>>> Hi,
>>>>>
>>>>> 在 2026/6/25 23:08, Nilay Shroff 写道:
>>>>>> On 6/24/26 12:16 PM, Yu Kuai wrote:
>>>>>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>>>>>> index 7baccfb690fe..f7e788a7fe95 100644
>>>>>>> --- a/block/blk-cgroup.c
>>>>>>> +++ b/block/blk-cgroup.c
>>>>>>> @@ -1563,10 +1563,12 @@ int blkcg_activate_policy(struct gendisk
>>>>>>> *disk, const struct blkcg_policy *pol)
>>>>>>>          if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
>>>>>>>              return -EINVAL;
>>>>>>>            if (queue_is_mq(q))
>>>>>>>              memflags = blk_mq_freeze_queue(q);
>>>>>>> +
>>>>>>> +    mutex_lock(&q->blkcg_mutex);
>>>>>>>      retry:
>>>>>>>          spin_lock_irq(&q->queue_lock);
>>>>>>>            /* blkg_list is pushed at the head, reverse walk to
>>>>>>> initialize parents first */
>>>>>>>          list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
>>>>>>> @@ -1625,10 +1627,11 @@ int blkcg_activate_policy(struct gendisk
>>>>>>> *disk, const struct blkcg_policy *pol)
>>>>>>>          __set_bit(pol->plid, q->blkcg_pols);
>>>>>>>          ret = 0;
>>>>>>>            spin_unlock_irq(&q->queue_lock);
>>>>>>>      out:
>>>>>>> +    mutex_unlock(&q->blkcg_mutex);
>>>>>>>          if (queue_is_mq(q))
>>>>>>>              blk_mq_unfreeze_queue(q, memflags);
>>>>>>>          if (pinned_blkg)
>>>>>>>              blkg_put(pinned_blkg);
>>>>>>>          if (pd_prealloc)
>>>>>> If the policy allocation fails, we jump to the lable enomem: and
>>>>>> teardown pds.
>>>>>> But I see this path still only acquires ->queue_lock. Don't we also
>>>>>> need
>>>>>> to protect it with ->blkcg_mutex?
>>>>> Yes, I agree we should protect it as well.
>>>>
>>>> Just take a closer look at the code, the enomem is already 
>>>> protected by
>>>> blkcg_mutex :)
>>>>
>>>
>>> Oh yes, but the ->blkcg_mutex is never released if we jump to enomem.
>>> So that may potentially cause deadlock. We need to release 
>>> ->blkcg_mutex
>>> once blkcg_policy_teardown_pds() returns. Or may be refactor code 
>>> (or add
>>> comment) so that it's easy to realize or spot the ->blkcg_mutex is
>>> acquired
>>> and then released around blkcg_policy_teardown_pds().
>>
>> the enomem will goto out at last, and blkcg_mutex do released. The 
>> code is
>> a bit hacky.
>>
>>>
>>>>>
>>>>>> Moreover I still see race between blkg insertion in blkg_create()
>>>>>> which
>>>>>> still doesn't use ->blkcg_mutex and so list traversal in
>>>>>> bfq_end_wr_async()
>>>>>> may still race with blkg_create(), isn't it? I remember you once 
>>>>>> told
>>>>>> this will be handled in another series but I couldn't find that yet.
>>>>> This is the set:
>>>>>
>>>>> [PATCH 0/8] blk-cgroup: remove queue_lock nesting from blkcg paths 
>>>>> - Yu
>>>>> Kuai 
>>>>> <https://lore.kernel.org/all/cover.1780621988.git.yukuai@fygo.io/>
>>>>>
>>>>> Noted that set just make sure queue_lock is not nested under other
>>>>> atomic
>>>>> context, and that set do not acquire blkcg_mutex for blkg_create()
>>>>> yet. Howerver,
>>>>> with that set it'll be easy to convert all queue_lock to blkcg_mtuex
>>>>> for blkg
>>>>> protection, and together with lots of blk-cgroup code cleanups.
>>>>>
>>>
>>> Okay, so are you planning to send a follow-up patchset that replaces
>>> ->queue_lock
>>> with ->blkcg_mutex for protecting the blkg_list? If so, I'd still
>>> prefer acquiring
>>> ->blkcg_mutex around blkg_create() in this patchset. That would
>>> address the race
>>> between blkg_create() and the blkg_list traversal in
>>> bfq_end_wr_async(), while the
>>> subsequent series can focus on cleaning up and removing the remaining
>>> ->queue_lock
>>> usage for blkg protection.
>>
>> Yes, there is a follow-up patchset. When this set was posted, 
>> blkg_create is still
>> called with queue_lock held, so I can't do that. However, not that 
>> the other set
>> is already applied, I can hold blkcg_mutex for blkg_create() now.
>>
>
> If you already have a follow-up patchset that replaces ->queue_lock with
> ->blkcg_mutex for protecting blkg_list, then I think it might make sense
> to send that out first (if you haven't already) and hold off this 
> series for
> now. That way, the blkg_create() race can be addressed first, and this
> series can build on top of those changes.

Yes, I already have a follow-up patchset that is build after this set.
I agree send that first will make sense, I'll rebase and send soon.

>
> BTW, if that follow-up series is merged first, I suspect the first two 
> patches in
> this series may no longer be necessary, leaving only patches 3/4 and 4/4.

Yes, the first two patches will be folded into the patch to replace
queue_lock with blkcg_mutex.

>
> Thanks,
> --Nilay
>
>
>
-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH v2 3/4] blk-cgroup: skip dying blkg in blkcg_activate_policy()
  2026-06-24  6:46 [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups Yu Kuai
                   ` (3 preceding siblings ...)
  2026-06-24  6:46 ` [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction Yu Kuai
@ 2026-06-24  6:46 ` Yu Kuai
  2026-06-24  6:46 ` [PATCH v2 4/4] blk-cgroup: factor policy pd teardown loop into helper Yu Kuai
  2026-06-24  6:55 ` [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups yu kuai
  6 siblings, 0 replies; 15+ messages in thread
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel

From: Zheng Qixing <zhengqixing@huawei.com>

When switching IO schedulers on a block device, blkcg_activate_policy()
can race with concurrent blkcg deletion, leading to a use-after-free in
rcu_accelerate_cbs.

T1:                               T2:
                                  blkg_destroy
                                  kill(&blkg->refcnt) // blkg->refcnt=1->0
                                  blkg_release // call_rcu(__blkg_release)
                                  ...
                                  blkg_free_workfn
                                  ->pd_free_fn(pd)
elv_iosched_store
elevator_switch
...
iterate blkg list
blkg_get(blkg) // blkg->refcnt=0->1
                                  list_del_init(&blkg->q_node)
blkg_put(pinned_blkg) // blkg->refcnt=1->0
blkg_release // call_rcu again
rcu_accelerate_cbs // uaf

Fix this by checking hlist_unhashed(&blkg->blkcg_node) before getting
a reference to the blkg. This is the same check used in blkg_destroy()
to detect if a blkg has already been destroyed. If the blkg is already
unhashed, skip processing it since it's being destroyed.

Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()")
Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f7e788a7fe95..2538d8105e6c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1574,10 +1574,12 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
 		struct blkg_policy_data *pd;
 
 		if (blkg->pd[pol->plid])
 			continue;
+		if (hlist_unhashed(&blkg->blkcg_node))
+			continue;
 
 		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
 		if (blkg == pinned_blkg) {
 			pd = pd_prealloc;
 			pd_prealloc = NULL;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH v2 4/4] blk-cgroup: factor policy pd teardown loop into helper
  2026-06-24  6:46 [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups Yu Kuai
                   ` (4 preceding siblings ...)
  2026-06-24  6:46 ` [PATCH v2 3/4] blk-cgroup: skip dying blkg in blkcg_activate_policy() Yu Kuai
@ 2026-06-24  6:46 ` Yu Kuai
  2026-06-24  6:55 ` [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups yu kuai
  6 siblings, 0 replies; 15+ messages in thread
From: Yu Kuai @ 2026-06-24  6:46 UTC (permalink / raw)
  To: Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel

From: Zheng Qixing <zhengqixing@huawei.com>

Move the teardown sequence which offlines and frees per-policy
blkg_policy_data (pd) into a helper for readability.

No functional change intended.

Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-cgroup.c | 57 ++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2538d8105e6c..e5e95be4fbc0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1526,10 +1526,35 @@ struct cgroup_subsys io_cgrp_subsys = {
 	.depends_on = 1 << memory_cgrp_id,
 #endif
 };
 EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
+/*
+ * Tear down per-blkg policy data for @pol on @q.
+ */
+static void blkcg_policy_teardown_pds(struct request_queue *q,
+				      const struct blkcg_policy *pol)
+{
+	struct blkcg_gq *blkg;
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		struct blkcg *blkcg = blkg->blkcg;
+		struct blkg_policy_data *pd;
+
+		spin_lock(&blkcg->lock);
+		pd = blkg->pd[pol->plid];
+		if (pd) {
+			if (pd->online && pol->pd_offline_fn)
+				pol->pd_offline_fn(pd);
+			pd->online = false;
+			pol->pd_free_fn(pd);
+			blkg->pd[pol->plid] = NULL;
+		}
+		spin_unlock(&blkcg->lock);
+	}
+}
+
 /**
  * blkcg_activate_policy - activate a blkcg policy on a gendisk
  * @disk: gendisk of interest
  * @pol: blkcg policy to activate
  *
@@ -1641,25 +1666,11 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	return ret;
 
 enomem:
 	/* alloc failed, take down everything */
 	spin_lock_irq(&q->queue_lock);
-	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		struct blkcg *blkcg = blkg->blkcg;
-		struct blkg_policy_data *pd;
-
-		spin_lock(&blkcg->lock);
-		pd = blkg->pd[pol->plid];
-		if (pd) {
-			if (pd->online && pol->pd_offline_fn)
-				pol->pd_offline_fn(pd);
-			pd->online = false;
-			pol->pd_free_fn(pd);
-			blkg->pd[pol->plid] = NULL;
-		}
-		spin_unlock(&blkcg->lock);
-	}
+	blkcg_policy_teardown_pds(q, pol);
 	spin_unlock_irq(&q->queue_lock);
 	ret = -ENOMEM;
 	goto out;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1674,11 +1685,10 @@ EXPORT_SYMBOL_GPL(blkcg_activate_policy);
  */
 void blkcg_deactivate_policy(struct gendisk *disk,
 			     const struct blkcg_policy *pol)
 {
 	struct request_queue *q = disk->queue;
-	struct blkcg_gq *blkg;
 	unsigned int memflags;
 
 	if (!blkcg_policy_enabled(q, pol))
 		return;
 
@@ -1687,24 +1697,11 @@ void blkcg_deactivate_policy(struct gendisk *disk,
 
 	mutex_lock(&q->blkcg_mutex);
 	spin_lock_irq(&q->queue_lock);
 
 	__clear_bit(pol->plid, q->blkcg_pols);
-
-	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		struct blkcg *blkcg = blkg->blkcg;
-
-		spin_lock(&blkcg->lock);
-		if (blkg->pd[pol->plid]) {
-			if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
-				pol->pd_offline_fn(blkg->pd[pol->plid]);
-			pol->pd_free_fn(blkg->pd[pol->plid]);
-			blkg->pd[pol->plid] = NULL;
-		}
-		spin_unlock(&blkcg->lock);
-	}
-
+	blkcg_policy_teardown_pds(q, pol);
 	spin_unlock_irq(&q->queue_lock);
 	mutex_unlock(&q->blkcg_mutex);
 
 	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q, memflags);
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups
  2026-06-24  6:46 [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups Yu Kuai
                   ` (5 preceding siblings ...)
  2026-06-24  6:46 ` [PATCH v2 4/4] blk-cgroup: factor policy pd teardown loop into helper Yu Kuai
@ 2026-06-24  6:55 ` yu kuai
  6 siblings, 0 replies; 15+ messages in thread
From: yu kuai @ 2026-06-24  6:55 UTC (permalink / raw)
  To: Yu Kuai, Tejun Heo, Josef Bacik, Jens Axboe
  Cc: Zheng Qixing, Christoph Hellwig, Tang Yizhou, Nilay Shroff,
	Ming Lei, cgroups, linux-block, linux-kernel, yukuai

Hi,

Please ignore this patch, this patch is supposed only used downstream.
Ai somehow generate the cmd to send it together with the patchset:

blk-cgroup: fix blkg list and policy data races

Same for the other ext4 patch.

Sorry for the noise. :(

在 2026/6/24 14:46, Yu Kuai 写道:
> From: Yu Kuai<yukuai@fnnas.com>
>
> Add a module parameter 'fault_tolerant' that changes how md-linear
> handles disk failures. When enabled:
>
> - Disk failures are isolated instead of failing the entire array
> - I/O to failed disks returns -EIO while healthy disks continue
> - The array remains operational with reduced capacity
> - Failed disk count is tracked and shown in /proc/mdstat
>
> This enables unraid-like functionality where individual disk failures
> don't bring down the entire array, allowing continued access to data
> on healthy disks.
>
> The fault_tolerant parameter can be set at module load time or
> dynamically via /sys/module/md_linear/parameters/fault_tolerant.
>
> Signed-off-by: Yu Kuai<yukuai@fnnas.com>
> ---
>   drivers/md/md-linear.c | 63 ++++++++++++++++++++++++++++++++++++------
>   1 file changed, 55 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c

-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2026-06-29  9:03 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-24  6:46 [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups Yu Kuai
2026-06-24  6:46 ` [PATCH 2/2] ext4: add unraid mount option for single-disk-per-group mode Yu Kuai
2026-06-24  6:46 ` [PATCH v2 0/4] blk-cgroup: fix blkg list and policy data races Yu Kuai
2026-06-24  6:46 ` [PATCH v2 1/4] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex Yu Kuai
2026-06-24  6:46 ` [PATCH v2 2/4] blk-cgroup: fix race between policy activation and blkg destruction Yu Kuai
2026-06-25 15:08   ` Nilay Shroff
2026-06-26  1:50     ` yu kuai
2026-06-26  1:52       ` yu kuai
2026-06-26  6:12         ` Nilay Shroff
2026-06-27  4:13           ` yu kuai
2026-06-29  5:33             ` Nilay Shroff
2026-06-29  9:03               ` yu kuai
2026-06-24  6:46 ` [PATCH v2 3/4] blk-cgroup: skip dying blkg in blkcg_activate_policy() Yu Kuai
2026-06-24  6:46 ` [PATCH v2 4/4] blk-cgroup: factor policy pd teardown loop into helper Yu Kuai
2026-06-24  6:55 ` [PATCH 1/2] md/linear: add fault-tolerant mode for unraid-like setups yu kuai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox