* [PATCH v2 md-6.14 0/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer
@ 2024-12-18 12:17 yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 1/5] md/md-bitmap: factor behind write counters out from bitmap_{start/end}write() yukuai
` (4 more replies)
0 siblings, 5 replies; 14+ messages in thread
From: yukuai @ 2024-12-18 12:17 UTC (permalink / raw)
To: song, yukuai3; +Cc: linux-raid, linux-kernel, yi.zhang, yangerkun, Yu Kuai
From: Yu Kuai <yukuai@kernel.org>
Changes in v2:
- add review tag for patch 1 and 3;
- update commit message for patch 5;
Yu Kuai (5):
md/md-bitmap: factor behind write counters out from
bitmap_{start/end}write()
md/md-bitmap: remove the last parameter for bitmap_ops->endwrite()
md: add a new callback pers->bitmap_sector()
md/raid5: implement pers->bitmap_sector()
md/md-bitmap: move bitmap_{start, end}write to md upper layer
drivers/md/md-bitmap.c | 74 ++++++++++++++++++++++++----------------
drivers/md/md-bitmap.h | 7 ++--
drivers/md/md.c | 29 ++++++++++++++++
drivers/md/md.h | 5 +++
drivers/md/raid1.c | 11 +++---
drivers/md/raid10.c | 6 ----
drivers/md/raid5-cache.c | 4 ---
drivers/md/raid5.c | 48 ++++++++++++--------------
8 files changed, 109 insertions(+), 75 deletions(-)
--
2.43.0
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v2 md-6.14 1/5] md/md-bitmap: factor behind write counters out from bitmap_{start/end}write()
2024-12-18 12:17 [PATCH v2 md-6.14 0/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
@ 2024-12-18 12:17 ` yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 2/5] md/md-bitmap: remove the last parameter for bitmap_ops->endwrite() yukuai
` (3 subsequent siblings)
4 siblings, 0 replies; 14+ messages in thread
From: yukuai @ 2024-12-18 12:17 UTC (permalink / raw)
To: song, yukuai3
Cc: linux-raid, linux-kernel, yi.zhang, yangerkun, Xiao Ni, Yu Kuai
From: Yu Kuai <yukuai3@huawei.com>
behind_write is only used in raid1; prepare to refactor
bitmap_{start/end}write(). There are no functional changes.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@kernel.org>
---
drivers/md/md-bitmap.c | 57 +++++++++++++++++++++++++---------------
drivers/md/md-bitmap.h | 7 +++--
drivers/md/raid1.c | 12 +++++----
drivers/md/raid10.c | 6 ++---
drivers/md/raid5-cache.c | 3 +--
drivers/md/raid5.c | 11 ++++----
6 files changed, 56 insertions(+), 40 deletions(-)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index c3a42dd66ce5..84fb4cc67d5e 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1671,24 +1671,13 @@ __acquires(bitmap->lock)
}
static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool behind)
+ unsigned long sectors)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return 0;
- if (behind) {
- int bw;
- atomic_inc(&bitmap->behind_writes);
- bw = atomic_read(&bitmap->behind_writes);
- if (bw > bitmap->behind_writes_used)
- bitmap->behind_writes_used = bw;
-
- pr_debug("inc write-behind count %d/%lu\n",
- bw, bitmap->mddev->bitmap_info.max_write_behind);
- }
-
while (sectors) {
sector_t blocks;
bitmap_counter_t *bmc;
@@ -1737,21 +1726,13 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
}
static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success, bool behind)
+ unsigned long sectors, bool success)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
- if (behind) {
- if (atomic_dec_and_test(&bitmap->behind_writes))
- wake_up(&bitmap->behind_wait);
- pr_debug("dec write-behind count %d/%lu\n",
- atomic_read(&bitmap->behind_writes),
- bitmap->mddev->bitmap_info.max_write_behind);
- }
-
while (sectors) {
sector_t blocks;
unsigned long flags;
@@ -2062,6 +2043,37 @@ static void md_bitmap_free(void *data)
kfree(bitmap);
}
+static void bitmap_start_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ int bw;
+
+ if (!bitmap)
+ return;
+
+ atomic_inc(&bitmap->behind_writes);
+ bw = atomic_read(&bitmap->behind_writes);
+ if (bw > bitmap->behind_writes_used)
+ bitmap->behind_writes_used = bw;
+
+ pr_debug("inc write-behind count %d/%lu\n",
+ bw, bitmap->mddev->bitmap_info.max_write_behind);
+}
+
+static void bitmap_end_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return;
+
+ if (atomic_dec_and_test(&bitmap->behind_writes))
+ wake_up(&bitmap->behind_wait);
+ pr_debug("dec write-behind count %d/%lu\n",
+ atomic_read(&bitmap->behind_writes),
+ bitmap->mddev->bitmap_info.max_write_behind);
+}
+
static void bitmap_wait_behind_writes(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
@@ -2981,6 +2993,9 @@ static struct bitmap_operations bitmap_ops = {
.dirty_bits = bitmap_dirty_bits,
.unplug = bitmap_unplug,
.daemon_work = bitmap_daemon_work,
+
+ .start_behind_write = bitmap_start_behind_write,
+ .end_behind_write = bitmap_end_behind_write,
.wait_behind_writes = bitmap_wait_behind_writes,
.startwrite = bitmap_startwrite,
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 662e6fc141a7..e87a1f493d3c 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -84,12 +84,15 @@ struct bitmap_operations {
unsigned long e);
void (*unplug)(struct mddev *mddev, bool sync);
void (*daemon_work)(struct mddev *mddev);
+
+ void (*start_behind_write)(struct mddev *mddev);
+ void (*end_behind_write)(struct mddev *mddev);
void (*wait_behind_writes)(struct mddev *mddev);
int (*startwrite)(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool behind);
+ unsigned long sectors);
void (*endwrite)(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success, bool behind);
+ unsigned long sectors, bool success);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 519c56f0ee3d..15ba7a001f30 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -420,10 +420,11 @@ static void close_write(struct r1bio *r1_bio)
r1_bio->behind_master_bio = NULL;
}
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->end_behind_write(mddev);
/* clear the bitmap if all writes complete successfully */
mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ !test_bit(R1BIO_Degraded, &r1_bio->state));
md_write_end(mddev);
}
@@ -1645,9 +1646,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
stats.behind_writes < max_write_behind)
alloc_behind_master_bio(r1_bio, bio);
- mddev->bitmap_ops->startwrite(
- mddev, r1_bio->sector, r1_bio->sectors,
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->start_behind_write(mddev);
+ mddev->bitmap_ops->startwrite(mddev, r1_bio->sector,
+ r1_bio->sectors);
first_clone = 0;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7d7a8a2524dc..c3a93b2a26a6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -430,8 +430,7 @@ static void close_write(struct r10bio *r10_bio)
/* clear the bitmap if all writes complete successfully */
mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
- !test_bit(R10BIO_Degraded, &r10_bio->state),
- false);
+ !test_bit(R10BIO_Degraded, &r10_bio->state));
md_write_end(mddev);
}
@@ -1519,8 +1518,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
- mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors,
- false);
+ mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index b4f7b79fd187..4c7ecdd5c1f3 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -315,8 +315,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- false);
+ !test_bit(STRIPE_DEGRADED, &sh->state));
}
}
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f09e7677ee9f..93cc7e252dd4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3564,7 +3564,7 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf), false);
+ RAID5_STRIPE_SECTORS(conf));
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
@@ -3665,7 +3665,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bitmap_end)
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
- false, false);
+ false);
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
@@ -3712,7 +3712,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bitmap_end)
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
- false, false);
+ false);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -4063,8 +4063,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
}
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- false);
+ !test_bit(STRIPE_DEGRADED, &sh->state));
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -5787,7 +5786,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
for (d = 0; d < conf->raid_disks - conf->max_degraded;
d++)
mddev->bitmap_ops->startwrite(mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf), false);
+ RAID5_STRIPE_SECTORS(conf));
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
--
2.43.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v2 md-6.14 2/5] md/md-bitmap: remove the last parameter for bitmap_ops->endwrite()
2024-12-18 12:17 [PATCH v2 md-6.14 0/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 1/5] md/md-bitmap: factor behind write counters out from bitmap_{start/end}write() yukuai
@ 2024-12-18 12:17 ` yukuai
2024-12-23 7:31 ` Xiao Ni
2024-12-18 12:17 ` [PATCH v2 md-6.14 3/5] md: add a new callback pers->bitmap_sector() yukuai
` (2 subsequent siblings)
4 siblings, 1 reply; 14+ messages in thread
From: yukuai @ 2024-12-18 12:17 UTC (permalink / raw)
To: song, yukuai3; +Cc: linux-raid, linux-kernel, yi.zhang, yangerkun, Yu Kuai
From: Yu Kuai <yukuai3@huawei.com>
It is useless, because for the case where IO failed for one rdev:
- If badblocks is set and rdev is not faulty, there is no need to
mark the bit as NEEDED;
- If rdev is faulty, then mddev->degraded will be set, and we can use
it to mark the bit as NEEDED;
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Yu Kuai <yukuai@kernel.org>
---
drivers/md/md-bitmap.c | 19 ++++++++++---------
drivers/md/md-bitmap.h | 2 +-
drivers/md/raid1.c | 3 +--
drivers/md/raid10.c | 3 +--
drivers/md/raid5-cache.c | 3 +--
drivers/md/raid5.c | 9 +++------
6 files changed, 17 insertions(+), 22 deletions(-)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 84fb4cc67d5e..b40a84b01085 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1726,7 +1726,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
}
static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success)
+ unsigned long sectors)
{
struct bitmap *bitmap = mddev->bitmap;
@@ -1745,15 +1745,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
return;
}
- if (success && !bitmap->mddev->degraded &&
- bitmap->events_cleared < bitmap->mddev->events) {
- bitmap->events_cleared = bitmap->mddev->events;
- bitmap->need_sync = 1;
- sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
- }
-
- if (!success && !NEEDED(*bmc))
+ if (!bitmap->mddev->degraded) {
+ if (bitmap->events_cleared < bitmap->mddev->events) {
+ bitmap->events_cleared = bitmap->mddev->events;
+ bitmap->need_sync = 1;
+ sysfs_notify_dirent_safe(
+ bitmap->sysfs_can_clear);
+ }
+ } else if (!NEEDED(*bmc)) {
*bmc |= NEEDED_MASK;
+ }
if (COUNTER(*bmc) == COUNTER_MAX)
wake_up(&bitmap->overflow_wait);
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index e87a1f493d3c..31c93019c76b 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -92,7 +92,7 @@ struct bitmap_operations {
int (*startwrite)(struct mddev *mddev, sector_t offset,
unsigned long sectors);
void (*endwrite)(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success);
+ unsigned long sectors);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 15ba7a001f30..81dff2cea0db 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -423,8 +423,7 @@ static void close_write(struct r1bio *r1_bio)
if (test_bit(R1BIO_BehindIO, &r1_bio->state))
mddev->bitmap_ops->end_behind_write(mddev);
/* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state));
+ mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
md_write_end(mddev);
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c3a93b2a26a6..3dc0170125b2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -429,8 +429,7 @@ static void close_write(struct r10bio *r10_bio)
struct mddev *mddev = r10_bio->mddev;
/* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
- !test_bit(R10BIO_Degraded, &r10_bio->state));
+ mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
md_write_end(mddev);
}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 4c7ecdd5c1f3..ba4f9577c737 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -314,8 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state));
+ sh->sector, RAID5_STRIPE_SECTORS(conf));
}
}
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 93cc7e252dd4..6eb2841ce28c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3664,8 +3664,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
if (bitmap_end)
conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- false);
+ sh->sector, RAID5_STRIPE_SECTORS(conf));
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
@@ -3711,8 +3710,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
if (bitmap_end)
conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- false);
+ sh->sector, RAID5_STRIPE_SECTORS(conf));
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -4062,8 +4060,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
wbi = wbi2;
}
conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state));
+ sh->sector, RAID5_STRIPE_SECTORS(conf));
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
--
2.43.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v2 md-6.14 3/5] md: add a new callback pers->bitmap_sector()
2024-12-18 12:17 [PATCH v2 md-6.14 0/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 1/5] md/md-bitmap: factor behind write counters out from bitmap_{start/end}write() yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 2/5] md/md-bitmap: remove the last parameter for bitmap_ops->endwrite() yukuai
@ 2024-12-18 12:17 ` yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 4/5] md/raid5: implement pers->bitmap_sector() yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 5/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
4 siblings, 0 replies; 14+ messages in thread
From: yukuai @ 2024-12-18 12:17 UTC (permalink / raw)
To: song, yukuai3
Cc: linux-raid, linux-kernel, yi.zhang, yangerkun, Xiao Ni, Yu Kuai
From: Yu Kuai <yukuai3@huawei.com>
This callback will be used in raid5 to convert io ranges from array to
bitmap.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Yu Kuai <yukuai@kernel.org>
---
drivers/md/md.h | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4ba93af36126..de6dadb9a40b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -746,6 +746,9 @@ struct md_personality
void *(*takeover) (struct mddev *mddev);
/* Changes the consistency policy of an active array. */
int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
+ /* convert io ranges from array to bitmap */
+ void (*bitmap_sector)(struct mddev *mddev, sector_t *offset,
+ unsigned long *sectors);
};
struct md_sysfs_entry {
--
2.43.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v2 md-6.14 4/5] md/raid5: implement pers->bitmap_sector()
2024-12-18 12:17 [PATCH v2 md-6.14 0/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
` (2 preceding siblings ...)
2024-12-18 12:17 ` [PATCH v2 md-6.14 3/5] md: add a new callback pers->bitmap_sector() yukuai
@ 2024-12-18 12:17 ` yukuai
2025-01-01 5:36 ` Xiao Ni
2024-12-18 12:17 ` [PATCH v2 md-6.14 5/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
4 siblings, 1 reply; 14+ messages in thread
From: yukuai @ 2024-12-18 12:17 UTC (permalink / raw)
To: song, yukuai3; +Cc: linux-raid, linux-kernel, yi.zhang, yangerkun, Yu Kuai
From: Yu Kuai <yukuai3@huawei.com>
Bitmap is used for the whole array for raid1/raid10, hence IO for the
array can be used directly for bitmap. However, bitmap is used for
underlying disks for raid5, hence IO for the array can't be used
directly for bitmap.
Implement pers->bitmap_sector() for raid5 to convert IO ranges from the
array to the underlying disks.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Yu Kuai <yukuai@kernel.org>
---
drivers/md/raid5.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6eb2841ce28c..b2fe201b599d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2950,6 +2950,23 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
r5c_update_on_rdev_error(mddev, rdev);
}
+static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset,
+ unsigned long *sectors)
+{
+ struct r5conf *conf = mddev->private;
+ int sectors_per_chunk = conf->chunk_sectors * (conf->raid_disks -
+ conf->max_degraded);
+ sector_t start = *offset;
+ sector_t end = start + *sectors;
+ int dd_idx;
+
+ start = round_down(start, sectors_per_chunk);
+ end = round_up(end, sectors_per_chunk);
+
+ *offset = raid5_compute_sector(conf, start, 0, &dd_idx, NULL);
+ *sectors = raid5_compute_sector(conf, end, 0, &dd_idx, NULL) - *offset;
+}
+
/*
* Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them.
@@ -8972,6 +8989,7 @@ static struct md_personality raid6_personality =
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid5_personality =
{
@@ -8997,6 +9015,7 @@ static struct md_personality raid5_personality =
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid4_personality =
@@ -9023,6 +9042,7 @@ static struct md_personality raid4_personality =
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static int __init raid5_init(void)
--
2.43.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v2 md-6.14 5/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer
2024-12-18 12:17 [PATCH v2 md-6.14 0/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
` (3 preceding siblings ...)
2024-12-18 12:17 ` [PATCH v2 md-6.14 4/5] md/raid5: implement pers->bitmap_sector() yukuai
@ 2024-12-18 12:17 ` yukuai
2025-01-02 4:36 ` Xiao Ni
4 siblings, 1 reply; 14+ messages in thread
From: yukuai @ 2024-12-18 12:17 UTC (permalink / raw)
To: song, yukuai3; +Cc: linux-raid, linux-kernel, yi.zhang, yangerkun, Yu Kuai
From: Yu Kuai <yukuai3@huawei.com>
There are two BUG reports that raid5 will hang at
bitmap_startwrite() ([1],[2]); the root cause is that bitmap start write
and end write are unbalanced. For example, handle_stripe_clean_event()
doesn't check if stripe->dev[].towrite is NULL after the tag 'returnbi',
and an extra bitmap_endwrite() will be called.
While reviewing the raid5 code, it was found that bitmap operations can
be optimized. For example, for a raid5 with 4 disks and chunksize=8k, if
the user issues an IO (0 + 48k) to the array:
┌────────────────────────────────────────────────────────────┐
│chunk 0 │
│ ┌────────────┬─────────────┬─────────────┬────────────┼
│ sh0 │A0: 0 + 4k │A1: 8k + 4k │A2: 16k + 4k │A3: P │
│ ┼────────────┼─────────────┼─────────────┼────────────┼
│ sh1 │B0: 4k + 4k │B1: 12k + 4k │B2: 20k + 4k │B3: P │
┼──────┴────────────┴─────────────┴─────────────┴────────────┼
│chunk 1 │
│ ┌────────────┬─────────────┬─────────────┬────────────┤
│ sh2 │C0: 24k + 4k│C1: 32k + 4k │C2: P │C3: 40k + 4k│
│ ┼────────────┼─────────────┼─────────────┼────────────┼
│ sh3 │D0: 28k + 4k│D1: 36k + 4k │D2: P │D3: 44k + 4k│
└──────┴────────────┴─────────────┴─────────────┴────────────┘
Before this patch, 4 stripe heads will be used, and each sh will attach
a bio for 3 disks, and each attached bio will trigger
bitmap_startwrite() once, which means 12 times in total.
- 3 times (0 + 4k), for (A0, A1 and A2)
- 3 times (4 + 4k), for (B0, B1 and B2)
- 3 times (8 + 4k), for (C0, C1 and C3)
- 3 times (12 + 4k), for (D0, D1 and D3)
After this patch, md upper layer will calculate that IO range (0 + 48k)
is corresponding to the bitmap (0 + 16k), and call bitmap_startwrite()
just once.
Note that this patch will align bitmap ranges to the chunks; for example,
if the user issues an IO (0 + 4k) to the array:
- Before this patch, 1 time (0 + 4k), for A0;
- After this patch, 1 time (0 + 8k) for chunk 0;
Usually, one bitmap bit will represent more than one disk chunk, and this
doesn't make any difference. And even if the user really created an array
where one chunk contains multiple bits, the overhead is that more data
will be recovered after a power failure.
[1] https://lore.kernel.org/all/CAJpMwyjmHQLvm6zg1cmQErttNNQPDAAXPKM3xgTjMhbfts986Q@mail.gmail.com/
[2] https://lore.kernel.org/all/ADF7D720-5764-4AF3-B68E-1845988737AA@flyingcircus.io/
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Yu Kuai <yukuai@kernel.org>
---
drivers/md/md.c | 29 +++++++++++++++++++++++++++++
drivers/md/md.h | 2 ++
drivers/md/raid1.c | 4 ----
drivers/md/raid10.c | 3 ---
drivers/md/raid5-cache.c | 2 --
drivers/md/raid5.c | 24 +-----------------------
6 files changed, 32 insertions(+), 32 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index aebe12b0ee27..c60ae2c70102 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8745,12 +8745,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
+static void md_bitmap_start(struct mddev *mddev,
+ struct md_io_clone *md_io_clone)
+{
+ if (mddev->pers->bitmap_sector)
+ mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
+ &md_io_clone->sectors);
+
+ mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
+}
+
+static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
+{
+ mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
+}
+
static void md_end_clone_io(struct bio *bio)
{
struct md_io_clone *md_io_clone = bio->bi_private;
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
@@ -8775,6 +8795,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);
+ if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
+ md_io_clone->offset = (*bio)->bi_iter.bi_sector;
+ md_io_clone->sectors = bio_sectors(*bio);
+ md_bitmap_start(mddev, md_io_clone);
+ }
+
clone->bi_end_io = md_end_clone_io;
clone->bi_private = md_io_clone;
*bio = clone;
@@ -8793,6 +8819,9 @@ void md_free_cloned_bio(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index de6dadb9a40b..def808064ad8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -831,6 +831,8 @@ struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
+ sector_t offset;
+ unsigned long sectors;
struct bio bio_clone;
};
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 81dff2cea0db..b5a5766cccf7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -422,8 +422,6 @@ static void close_write(struct r1bio *r1_bio)
if (test_bit(R1BIO_BehindIO, &r1_bio->state))
mddev->bitmap_ops->end_behind_write(mddev);
- /* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
md_write_end(mddev);
}
@@ -1647,8 +1645,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (test_bit(R1BIO_BehindIO, &r1_bio->state))
mddev->bitmap_ops->start_behind_write(mddev);
- mddev->bitmap_ops->startwrite(mddev, r1_bio->sector,
- r1_bio->sectors);
first_clone = 0;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3dc0170125b2..2fe8e6f96057 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -428,8 +428,6 @@ static void close_write(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
- /* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
md_write_end(mddev);
}
@@ -1517,7 +1515,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
- mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index ba4f9577c737..011246e16a99 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -313,8 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf));
}
}
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b2fe201b599d..017439e2af03 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3578,12 +3578,6 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
* is added to a batch, STRIPE_BIT_DELAY cannot be changed
* any more.
*/
- set_bit(STRIPE_BITMAP_PENDING, &sh->state);
- spin_unlock_irq(&sh->stripe_lock);
- conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf));
- spin_lock_irq(&sh->stripe_lock);
- clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
@@ -3638,7 +3632,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
- int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
struct md_rdev *rdev = conf->disks[i].rdev;
@@ -3663,8 +3656,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].towrite = NULL;
sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
- if (bi)
- bitmap_end = 1;
log_stripe_write_finished(sh);
@@ -3679,10 +3670,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bio_io_error(bi);
bi = nextbi;
}
- if (bitmap_end)
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf));
- bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
@@ -3691,7 +3678,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].page = sh->dev[i].orig_page;
}
- if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
@@ -3725,9 +3711,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
}
- if (bitmap_end)
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf));
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -4076,8 +4059,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
bio_endio(wbi);
wbi = wbi2;
}
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf));
+
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -5797,10 +5779,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
- for (d = 0; d < conf->raid_disks - conf->max_degraded;
- d++)
- mddev->bitmap_ops->startwrite(mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf));
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
--
2.43.0
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v2 md-6.14 2/5] md/md-bitmap: remove the last parameter for bitmap_ops->endwrite()
2024-12-18 12:17 ` [PATCH v2 md-6.14 2/5] md/md-bitmap: remove the last parameter for bitmap_ops->endwrite() yukuai
@ 2024-12-23 7:31 ` Xiao Ni
2024-12-23 7:50 ` Yu Kuai
0 siblings, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2024-12-23 7:31 UTC (permalink / raw)
To: yukuai; +Cc: song, yukuai3, linux-raid, linux-kernel, yi.zhang, yangerkun
On Wed, Dec 18, 2024 at 8:21 PM <yukuai@kernel.org> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> It is useless, because for the case IO failed for one rdev:
>
> - If badblocks is set and rdev is not faulty, there is no need to
> mark the bit as NEEDED;
Hi Kuai
It's better to add some comments here. Before this patch, it was
easy to understand: the bitmap bit needs to be set when a write request
fails. With this patch, there is some hidden logic here. To me, it's
not easy to maintain in the future. And in man mdadm, there is a no-bbl
option, so it looks like an array may not have a bad block. And I
don't know how dmraid maintains badblocks. So this patch needs to be
more careful.
Regards
Xiao
> - If rdev is faulty, then mddev->degraded will be set, and we can use
> it to mark the bit as NEEDED;
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> Signed-off-by: Yu Kuai <yukuai@kernel.org>
> ---
> drivers/md/md-bitmap.c | 19 ++++++++++---------
> drivers/md/md-bitmap.h | 2 +-
> drivers/md/raid1.c | 3 +--
> drivers/md/raid10.c | 3 +--
> drivers/md/raid5-cache.c | 3 +--
> drivers/md/raid5.c | 9 +++------
> 6 files changed, 17 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> index 84fb4cc67d5e..b40a84b01085 100644
> --- a/drivers/md/md-bitmap.c
> +++ b/drivers/md/md-bitmap.c
> @@ -1726,7 +1726,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
> }
>
> static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
> - unsigned long sectors, bool success)
> + unsigned long sectors)
> {
> struct bitmap *bitmap = mddev->bitmap;
>
> @@ -1745,15 +1745,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
> return;
> }
>
> - if (success && !bitmap->mddev->degraded &&
> - bitmap->events_cleared < bitmap->mddev->events) {
> - bitmap->events_cleared = bitmap->mddev->events;
> - bitmap->need_sync = 1;
> - sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
> - }
> -
> - if (!success && !NEEDED(*bmc))
> + if (!bitmap->mddev->degraded) {
> + if (bitmap->events_cleared < bitmap->mddev->events) {
> + bitmap->events_cleared = bitmap->mddev->events;
> + bitmap->need_sync = 1;
> + sysfs_notify_dirent_safe(
> + bitmap->sysfs_can_clear);
> + }
> + } else if (!NEEDED(*bmc)) {
> *bmc |= NEEDED_MASK;
> + }
>
> if (COUNTER(*bmc) == COUNTER_MAX)
> wake_up(&bitmap->overflow_wait);
> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
> index e87a1f493d3c..31c93019c76b 100644
> --- a/drivers/md/md-bitmap.h
> +++ b/drivers/md/md-bitmap.h
> @@ -92,7 +92,7 @@ struct bitmap_operations {
> int (*startwrite)(struct mddev *mddev, sector_t offset,
> unsigned long sectors);
> void (*endwrite)(struct mddev *mddev, sector_t offset,
> - unsigned long sectors, bool success);
> + unsigned long sectors);
> bool (*start_sync)(struct mddev *mddev, sector_t offset,
> sector_t *blocks, bool degraded);
> void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 15ba7a001f30..81dff2cea0db 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -423,8 +423,7 @@ static void close_write(struct r1bio *r1_bio)
> if (test_bit(R1BIO_BehindIO, &r1_bio->state))
> mddev->bitmap_ops->end_behind_write(mddev);
> /* clear the bitmap if all writes complete successfully */
> - mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
> - !test_bit(R1BIO_Degraded, &r1_bio->state));
> + mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
> md_write_end(mddev);
> }
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index c3a93b2a26a6..3dc0170125b2 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -429,8 +429,7 @@ static void close_write(struct r10bio *r10_bio)
> struct mddev *mddev = r10_bio->mddev;
>
> /* clear the bitmap if all writes complete successfully */
> - mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
> - !test_bit(R10BIO_Degraded, &r10_bio->state));
> + mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
> md_write_end(mddev);
> }
>
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 4c7ecdd5c1f3..ba4f9577c737 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -314,8 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
> set_bit(R5_UPTODATE, &sh->dev[i].flags);
> r5c_return_dev_pending_writes(conf, &sh->dev[i]);
> conf->mddev->bitmap_ops->endwrite(conf->mddev,
> - sh->sector, RAID5_STRIPE_SECTORS(conf),
> - !test_bit(STRIPE_DEGRADED, &sh->state));
> + sh->sector, RAID5_STRIPE_SECTORS(conf));
> }
> }
> }
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 93cc7e252dd4..6eb2841ce28c 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -3664,8 +3664,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> }
> if (bitmap_end)
> conf->mddev->bitmap_ops->endwrite(conf->mddev,
> - sh->sector, RAID5_STRIPE_SECTORS(conf),
> - false);
> + sh->sector, RAID5_STRIPE_SECTORS(conf));
> bitmap_end = 0;
> /* and fail all 'written' */
> bi = sh->dev[i].written;
> @@ -3711,8 +3710,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> }
> if (bitmap_end)
> conf->mddev->bitmap_ops->endwrite(conf->mddev,
> - sh->sector, RAID5_STRIPE_SECTORS(conf),
> - false);
> + sh->sector, RAID5_STRIPE_SECTORS(conf));
> /* If we were in the middle of a write the parity block might
> * still be locked - so just clear all R5_LOCKED flags
> */
> @@ -4062,8 +4060,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
> wbi = wbi2;
> }
> conf->mddev->bitmap_ops->endwrite(conf->mddev,
> - sh->sector, RAID5_STRIPE_SECTORS(conf),
> - !test_bit(STRIPE_DEGRADED, &sh->state));
> + sh->sector, RAID5_STRIPE_SECTORS(conf));
> if (head_sh->batch_head) {
> sh = list_first_entry(&sh->batch_list,
> struct stripe_head,
> --
> 2.43.0
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 md-6.14 2/5] md/md-bitmap: remove the last parameter for bimtap_ops->endwrite()
2024-12-23 7:31 ` Xiao Ni
@ 2024-12-23 7:50 ` Yu Kuai
2024-12-30 5:01 ` Xiao Ni
0 siblings, 1 reply; 14+ messages in thread
From: Yu Kuai @ 2024-12-23 7:50 UTC (permalink / raw)
To: Xiao Ni, yukuai
Cc: song, linux-raid, linux-kernel, yi.zhang, yangerkun, yukuai (C)
Hi,
在 2024/12/23 15:31, Xiao Ni 写道:
> On Wed, Dec 18, 2024 at 8:21 PM <yukuai@kernel.org> wrote:
>>
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> It is useless, because for the case IO failed for one rdev:
>>
>> - If badblocks is set and rdev is not faulty, there is no need to
>> mark the bit as NEEDED;
>
>
> Hi Kuai
>
> It's better to add some comments before here. Before this patch, it's
> easy to understand. It needs to set bitmap bit when a write request
> fails.
This is not accurate; it doesn't matter whether the IO fails or succeeds, the bit
must be set if the data is not consistent, either because the IO is not done yet or the
array is degraded.
> With this patch, there are some hidden logics here. To me, it's
> not easy to maintain in the future. And in man mdadm, there is no-bbl
> option, so it looks like an array may not have a bad block. And I
> don't know how dmraid maintain badblock. So this patch needs to be
> more careful.
no-bbl is one of the options of mdadm --update; I think it means remove
the current badblock entries, not disable badblocks entirely.
In kernel, badblocks is always enabled by default, and IO error will
always try to set badblocks first. For example:
- badblocks_init() is called from md_rdev_init(), and if
badblocks_init() fails, the array can't be assembled.
- The only thing stop rdev to record badblocks after IO failure is that
rdev is faulty.
Thanks,
Kuai
>
> Regards
> Xiao
>
>> - If rdev is faulty, then mddev->degraded will be set, and we can use
> >> it to mark the bit as NEEDED;
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> Signed-off-by: Yu Kuai <yukuai@kernel.org>
>> ---
>> drivers/md/md-bitmap.c | 19 ++++++++++---------
>> drivers/md/md-bitmap.h | 2 +-
>> drivers/md/raid1.c | 3 +--
>> drivers/md/raid10.c | 3 +--
>> drivers/md/raid5-cache.c | 3 +--
>> drivers/md/raid5.c | 9 +++------
>> 6 files changed, 17 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
>> index 84fb4cc67d5e..b40a84b01085 100644
>> --- a/drivers/md/md-bitmap.c
>> +++ b/drivers/md/md-bitmap.c
>> @@ -1726,7 +1726,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
>> }
>>
>> static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
>> - unsigned long sectors, bool success)
>> + unsigned long sectors)
>> {
>> struct bitmap *bitmap = mddev->bitmap;
>>
>> @@ -1745,15 +1745,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
>> return;
>> }
>>
>> - if (success && !bitmap->mddev->degraded &&
>> - bitmap->events_cleared < bitmap->mddev->events) {
>> - bitmap->events_cleared = bitmap->mddev->events;
>> - bitmap->need_sync = 1;
>> - sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
>> - }
>> -
>> - if (!success && !NEEDED(*bmc))
>> + if (!bitmap->mddev->degraded) {
>> + if (bitmap->events_cleared < bitmap->mddev->events) {
>> + bitmap->events_cleared = bitmap->mddev->events;
>> + bitmap->need_sync = 1;
>> + sysfs_notify_dirent_safe(
>> + bitmap->sysfs_can_clear);
>> + }
>> + } else if (!NEEDED(*bmc)) {
>> *bmc |= NEEDED_MASK;
>> + }
>>
>> if (COUNTER(*bmc) == COUNTER_MAX)
>> wake_up(&bitmap->overflow_wait);
>> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
>> index e87a1f493d3c..31c93019c76b 100644
>> --- a/drivers/md/md-bitmap.h
>> +++ b/drivers/md/md-bitmap.h
>> @@ -92,7 +92,7 @@ struct bitmap_operations {
>> int (*startwrite)(struct mddev *mddev, sector_t offset,
>> unsigned long sectors);
>> void (*endwrite)(struct mddev *mddev, sector_t offset,
>> - unsigned long sectors, bool success);
>> + unsigned long sectors);
>> bool (*start_sync)(struct mddev *mddev, sector_t offset,
>> sector_t *blocks, bool degraded);
>> void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index 15ba7a001f30..81dff2cea0db 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -423,8 +423,7 @@ static void close_write(struct r1bio *r1_bio)
>> if (test_bit(R1BIO_BehindIO, &r1_bio->state))
>> mddev->bitmap_ops->end_behind_write(mddev);
>> /* clear the bitmap if all writes complete successfully */
>> - mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
>> - !test_bit(R1BIO_Degraded, &r1_bio->state));
>> + mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
>> md_write_end(mddev);
>> }
>>
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index c3a93b2a26a6..3dc0170125b2 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -429,8 +429,7 @@ static void close_write(struct r10bio *r10_bio)
>> struct mddev *mddev = r10_bio->mddev;
>>
>> /* clear the bitmap if all writes complete successfully */
>> - mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
>> - !test_bit(R10BIO_Degraded, &r10_bio->state));
>> + mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
>> md_write_end(mddev);
>> }
>>
>> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
>> index 4c7ecdd5c1f3..ba4f9577c737 100644
>> --- a/drivers/md/raid5-cache.c
>> +++ b/drivers/md/raid5-cache.c
>> @@ -314,8 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
>> set_bit(R5_UPTODATE, &sh->dev[i].flags);
>> r5c_return_dev_pending_writes(conf, &sh->dev[i]);
>> conf->mddev->bitmap_ops->endwrite(conf->mddev,
>> - sh->sector, RAID5_STRIPE_SECTORS(conf),
>> - !test_bit(STRIPE_DEGRADED, &sh->state));
>> + sh->sector, RAID5_STRIPE_SECTORS(conf));
>> }
>> }
>> }
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 93cc7e252dd4..6eb2841ce28c 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -3664,8 +3664,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>> }
>> if (bitmap_end)
>> conf->mddev->bitmap_ops->endwrite(conf->mddev,
>> - sh->sector, RAID5_STRIPE_SECTORS(conf),
>> - false);
>> + sh->sector, RAID5_STRIPE_SECTORS(conf));
>> bitmap_end = 0;
>> /* and fail all 'written' */
>> bi = sh->dev[i].written;
>> @@ -3711,8 +3710,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>> }
>> if (bitmap_end)
>> conf->mddev->bitmap_ops->endwrite(conf->mddev,
>> - sh->sector, RAID5_STRIPE_SECTORS(conf),
>> - false);
>> + sh->sector, RAID5_STRIPE_SECTORS(conf));
>> /* If we were in the middle of a write the parity block might
>> * still be locked - so just clear all R5_LOCKED flags
>> */
>> @@ -4062,8 +4060,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
>> wbi = wbi2;
>> }
>> conf->mddev->bitmap_ops->endwrite(conf->mddev,
>> - sh->sector, RAID5_STRIPE_SECTORS(conf),
>> - !test_bit(STRIPE_DEGRADED, &sh->state));
>> + sh->sector, RAID5_STRIPE_SECTORS(conf));
>> if (head_sh->batch_head) {
>> sh = list_first_entry(&sh->batch_list,
>> struct stripe_head,
>> --
>> 2.43.0
>>
>>
>
>
> .
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 md-6.14 2/5] md/md-bitmap: remove the last parameter for bimtap_ops->endwrite()
2024-12-23 7:50 ` Yu Kuai
@ 2024-12-30 5:01 ` Xiao Ni
2025-01-02 1:17 ` Yu Kuai
0 siblings, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2024-12-30 5:01 UTC (permalink / raw)
To: Yu Kuai
Cc: yukuai, song, linux-raid, linux-kernel, yi.zhang, yangerkun,
yukuai (C)
On Mon, Dec 23, 2024 at 3:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2024/12/23 15:31, Xiao Ni 写道:
> > On Wed, Dec 18, 2024 at 8:21 PM <yukuai@kernel.org> wrote:
> >>
> >> From: Yu Kuai <yukuai3@huawei.com>
> >>
> >> It is useless, because for the case IO failed for one rdev:
> >>
> >> - If badblocks is set and rdev is not faulty, there is no need to
> >> mark the bit as NEEDED;
> >
> >
> > Hi Kuai
> >
> > It's better to add some comments before here. Before this patch, it's
> > easy to understand. It needs to set bitmap bit when a write request
> > fails.
Hi Kuai
>
> This is not accurate, it's doesn't matter if IO fails or succeed, bit
> must be set if data is not consistent, either IO is not done yet, or the
> array is degaraded.
Sorry for the wrong words. I meant to say the bitmap NEEDED bit is set when
a write request fails. After this patch, we can't see that logic
directly, so it's hidden logic. It's better to add more comments
here for future maintenance.
And I read the codes, R1BIO_Degraded, STRIPE_DEGRADED,
R10BIO_Degraded, these three flags are only used to tell bitmap if it
needs to set NEEDED bit. After this patch, it looks like these flags
are not useful anymore.
>
> > With this patch, there are some hidden logics here. To me, it's
> > not easy to maintain in the future. And in man mdadm, there is no-bbl
> > option, so it looks like an array may not have a bad block. And I
> > don't know how dmraid maintain badblock. So this patch needs to be
> > more careful.
>
> no-bbl is one of the option of mdadm --update, I think it means remove
> current badblock entries, not that disable badblocks.
>
> In kernel, badblocks is always enabled by default, and IO error will
> always try to set badblocks first. For example:
>
> - badblocks_init() is called from md_rdev_init(), and if
> badblocks_init() fails, the array can't be assembled.
> - The only thing stop rdev to record badblocks after IO failure is that
> rdev is faulty.
Yes, thanks for pointing out this.
>
> Thanks,
> Kuai
>
> >
> > Regards
> > Xiao
> >
> >> - If rdev is faulty, then mddev->degraded will be set, and we can use
> >> it to mark the bit as NEEDED;
> >>
> >> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> >> Signed-off-by: Yu Kuai <yukuai@kernel.org>
> >> ---
> >> drivers/md/md-bitmap.c | 19 ++++++++++---------
> >> drivers/md/md-bitmap.h | 2 +-
> >> drivers/md/raid1.c | 3 +--
> >> drivers/md/raid10.c | 3 +--
> >> drivers/md/raid5-cache.c | 3 +--
> >> drivers/md/raid5.c | 9 +++------
> >> 6 files changed, 17 insertions(+), 22 deletions(-)
> >>
> >> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> >> index 84fb4cc67d5e..b40a84b01085 100644
> >> --- a/drivers/md/md-bitmap.c
> >> +++ b/drivers/md/md-bitmap.c
> >> @@ -1726,7 +1726,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
> >> }
> >>
> >> static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
> >> - unsigned long sectors, bool success)
> >> + unsigned long sectors)
> >> {
> >> struct bitmap *bitmap = mddev->bitmap;
> >>
> >> @@ -1745,15 +1745,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
> >> return;
> >> }
> >>
> >> - if (success && !bitmap->mddev->degraded &&
> >> - bitmap->events_cleared < bitmap->mddev->events) {
> >> - bitmap->events_cleared = bitmap->mddev->events;
> >> - bitmap->need_sync = 1;
> >> - sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
> >> - }
> >> -
> >> - if (!success && !NEEDED(*bmc))
> >> + if (!bitmap->mddev->degraded) {
> >> + if (bitmap->events_cleared < bitmap->mddev->events) {
> >> + bitmap->events_cleared = bitmap->mddev->events;
> >> + bitmap->need_sync = 1;
> >> + sysfs_notify_dirent_safe(
> >> + bitmap->sysfs_can_clear);
> >> + }
> >> + } else if (!NEEDED(*bmc)) {
> >> *bmc |= NEEDED_MASK;
> >> + }
> >>
> >> if (COUNTER(*bmc) == COUNTER_MAX)
> >> wake_up(&bitmap->overflow_wait);
> >> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
> >> index e87a1f493d3c..31c93019c76b 100644
> >> --- a/drivers/md/md-bitmap.h
> >> +++ b/drivers/md/md-bitmap.h
> >> @@ -92,7 +92,7 @@ struct bitmap_operations {
> >> int (*startwrite)(struct mddev *mddev, sector_t offset,
> >> unsigned long sectors);
> >> void (*endwrite)(struct mddev *mddev, sector_t offset,
> >> - unsigned long sectors, bool success);
> >> + unsigned long sectors);
> >> bool (*start_sync)(struct mddev *mddev, sector_t offset,
> >> sector_t *blocks, bool degraded);
> >> void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
> >> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> >> index 15ba7a001f30..81dff2cea0db 100644
> >> --- a/drivers/md/raid1.c
> >> +++ b/drivers/md/raid1.c
> >> @@ -423,8 +423,7 @@ static void close_write(struct r1bio *r1_bio)
> >> if (test_bit(R1BIO_BehindIO, &r1_bio->state))
> >> mddev->bitmap_ops->end_behind_write(mddev);
> >> /* clear the bitmap if all writes complete successfully */
> >> - mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
> >> - !test_bit(R1BIO_Degraded, &r1_bio->state));
> >> + mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
> >> md_write_end(mddev);
> >> }
> >>
> >> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> >> index c3a93b2a26a6..3dc0170125b2 100644
> >> --- a/drivers/md/raid10.c
> >> +++ b/drivers/md/raid10.c
> >> @@ -429,8 +429,7 @@ static void close_write(struct r10bio *r10_bio)
> >> struct mddev *mddev = r10_bio->mddev;
> >>
> >> /* clear the bitmap if all writes complete successfully */
> >> - mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
> >> - !test_bit(R10BIO_Degraded, &r10_bio->state));
> >> + mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
> >> md_write_end(mddev);
> >> }
> >>
> >> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> >> index 4c7ecdd5c1f3..ba4f9577c737 100644
> >> --- a/drivers/md/raid5-cache.c
> >> +++ b/drivers/md/raid5-cache.c
> >> @@ -314,8 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
> >> set_bit(R5_UPTODATE, &sh->dev[i].flags);
> >> r5c_return_dev_pending_writes(conf, &sh->dev[i]);
> >> conf->mddev->bitmap_ops->endwrite(conf->mddev,
> >> - sh->sector, RAID5_STRIPE_SECTORS(conf),
> >> - !test_bit(STRIPE_DEGRADED, &sh->state));
> >> + sh->sector, RAID5_STRIPE_SECTORS(conf));
> >> }
> >> }
> >> }
> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> >> index 93cc7e252dd4..6eb2841ce28c 100644
> >> --- a/drivers/md/raid5.c
> >> +++ b/drivers/md/raid5.c
> >> @@ -3664,8 +3664,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> >> }
> >> if (bitmap_end)
> >> conf->mddev->bitmap_ops->endwrite(conf->mddev,
> >> - sh->sector, RAID5_STRIPE_SECTORS(conf),
> >> - false);
> >> + sh->sector, RAID5_STRIPE_SECTORS(conf));
> >> bitmap_end = 0;
> >> /* and fail all 'written' */
> >> bi = sh->dev[i].written;
> >> @@ -3711,8 +3710,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> >> }
> >> if (bitmap_end)
> >> conf->mddev->bitmap_ops->endwrite(conf->mddev,
> >> - sh->sector, RAID5_STRIPE_SECTORS(conf),
> >> - false);
> >> + sh->sector, RAID5_STRIPE_SECTORS(conf));
> >> /* If we were in the middle of a write the parity block might
> >> * still be locked - so just clear all R5_LOCKED flags
> >> */
> >> @@ -4062,8 +4060,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
> >> wbi = wbi2;
> >> }
> >> conf->mddev->bitmap_ops->endwrite(conf->mddev,
> >> - sh->sector, RAID5_STRIPE_SECTORS(conf),
> >> - !test_bit(STRIPE_DEGRADED, &sh->state));
> >> + sh->sector, RAID5_STRIPE_SECTORS(conf));
> >> if (head_sh->batch_head) {
> >> sh = list_first_entry(&sh->batch_list,
> >> struct stripe_head,
> >> --
> >> 2.43.0
> >>
> >>
> >
> >
> > .
> >
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 md-6.14 4/5] md/raid5: implement pers->bitmap_sector()
2024-12-18 12:17 ` [PATCH v2 md-6.14 4/5] md/raid5: implement pers->bitmap_sector() yukuai
@ 2025-01-01 5:36 ` Xiao Ni
2025-01-02 1:21 ` Yu Kuai
0 siblings, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2025-01-01 5:36 UTC (permalink / raw)
To: yukuai; +Cc: song, yukuai3, linux-raid, linux-kernel, yi.zhang, yangerkun
On Wed, Dec 18, 2024 at 8:22 PM <yukuai@kernel.org> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Bitmap is used for the whole array for raid1/raid10, hence IO for the
> array can be used directly for bitmap. However, bitmap is used for
> underlying disks for raid5, hence IO for the array can't be used
> directly for bitmap.
>
> Implement pers->bitmap_sector() for raid5 to convert IO ranges from the
> array to the underlying disks.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> Signed-off-by: Yu Kuai <yukuai@kernel.org>
> ---
> drivers/md/raid5.c | 20 ++++++++++++++++++++
> 1 file changed, 20 insertions(+)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 6eb2841ce28c..b2fe201b599d 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -2950,6 +2950,23 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
> r5c_update_on_rdev_error(mddev, rdev);
> }
>
> +static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset,
> + unsigned long *sectors)
> +{
> + struct r5conf *conf = mddev->private;
> + int sectors_per_chunk = conf->chunk_sectors * (conf->raid_disks -
> + conf->max_degraded);
> + sector_t start = *offset;
> + sector_t end = start + *sectors;
> + int dd_idx;
> +
> + start = round_down(start, sectors_per_chunk);
> + end = round_up(end, sectors_per_chunk);
> +
> + *offset = raid5_compute_sector(conf, start, 0, &dd_idx, NULL);
> + *sectors = raid5_compute_sector(conf, end, 0, &dd_idx, NULL) - *offset;
> +}
Hi Kuai
This function needs to think about reshape like make_stripe_request does.
Regards
Xiao
> +
> /*
> * Input: a 'big' sector number,
> * Output: index of the data and parity disk, and the sector # in them.
> @@ -8972,6 +8989,7 @@ static struct md_personality raid6_personality =
> .takeover = raid6_takeover,
> .change_consistency_policy = raid5_change_consistency_policy,
> .prepare_suspend = raid5_prepare_suspend,
> + .bitmap_sector = raid5_bitmap_sector,
> };
> static struct md_personality raid5_personality =
> {
> @@ -8997,6 +9015,7 @@ static struct md_personality raid5_personality =
> .takeover = raid5_takeover,
> .change_consistency_policy = raid5_change_consistency_policy,
> .prepare_suspend = raid5_prepare_suspend,
> + .bitmap_sector = raid5_bitmap_sector,
> };
>
> static struct md_personality raid4_personality =
> @@ -9023,6 +9042,7 @@ static struct md_personality raid4_personality =
> .takeover = raid4_takeover,
> .change_consistency_policy = raid5_change_consistency_policy,
> .prepare_suspend = raid5_prepare_suspend,
> + .bitmap_sector = raid5_bitmap_sector,
> };
>
> static int __init raid5_init(void)
> --
> 2.43.0
>
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 md-6.14 2/5] md/md-bitmap: remove the last parameter for bimtap_ops->endwrite()
2024-12-30 5:01 ` Xiao Ni
@ 2025-01-02 1:17 ` Yu Kuai
0 siblings, 0 replies; 14+ messages in thread
From: Yu Kuai @ 2025-01-02 1:17 UTC (permalink / raw)
To: Xiao Ni, Yu Kuai
Cc: yukuai, song, linux-raid, linux-kernel, yi.zhang, yangerkun,
yukuai (C)
Hi,
在 2024/12/30 13:01, Xiao Ni 写道:
> On Mon, Dec 23, 2024 at 3:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> Hi,
>>
>> 在 2024/12/23 15:31, Xiao Ni 写道:
>>> On Wed, Dec 18, 2024 at 8:21 PM <yukuai@kernel.org> wrote:
>>>>
>>>> From: Yu Kuai <yukuai3@huawei.com>
>>>>
>>>> It is useless, because for the case IO failed for one rdev:
>>>>
>>>> - If badblocks is set and rdev is not faulty, there is no need to
>>>> mark the bit as NEEDED;
>>>
>>>
>>> Hi Kuai
>>>
>>> It's better to add some comments before here. Before this patch, it's
>>> easy to understand. It needs to set bitmap bit when a write request
>>> fails.
>
> Hi Kuai
>
>>
>> This is not accurate, it's doesn't matter if IO fails or succeed, bit
>> must be set if data is not consistent, either IO is not done yet, or the
>> array is degaraded.
>
> Sorry for the wrong words. I want to say bitmap NEEDED bit is set when
> a write request fails. After this patch, we can't see the logic
> directly. So it's a hidden logic. It's better to add more comments
> here for future maintenance.
Ok.
>
> And I read the codes, R1BIO_Degraded, STRIPE_DEGRADED,
> R10BIO_Degraded, these three flags are only used to tell bitmap if it
> needs to set NEEDED bit. After this patch, it looks like these flags
> are not useful anymore.
Yes, the xxx_DEGRADED flag is useless now and can be cleaned up.
Thanks,
Kuai
>
>>
>>> With this patch, there are some hidden logics here. To me, it's
>>> not easy to maintain in the future. And in man mdadm, there is no-bbl
>>> option, so it looks like an array may not have a bad block. And I
>>> don't know how dmraid maintain badblock. So this patch needs to be
>>> more careful.
>>
>> no-bbl is one of the option of mdadm --update, I think it means remove
>> current badblock entries, not that disable badblocks.
>>
>> In kernel, badblocks is always enabled by default, and IO error will
>> always try to set badblocks first. For example:
>>
>> - badblocks_init() is called from md_rdev_init(), and if
>> badblocks_init() fails, the array can't be assembled.
>> - The only thing stop rdev to record badblocks after IO failure is that
>> rdev is faulty.
>
> Yes, thanks for pointing out this.
>>
>> Thanks,
>> Kuai
>>
>>>
>>> Regards
>>> Xiao
>>>
>>>> - If rdev is faulty, then mddev->degraded will be set, and we can use
>>>> it to mark the bit as NEEDED;
>>>>
>>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>>> Signed-off-by: Yu Kuai <yukuai@kernel.org>
>>>> ---
>>>> drivers/md/md-bitmap.c | 19 ++++++++++---------
>>>> drivers/md/md-bitmap.h | 2 +-
>>>> drivers/md/raid1.c | 3 +--
>>>> drivers/md/raid10.c | 3 +--
>>>> drivers/md/raid5-cache.c | 3 +--
>>>> drivers/md/raid5.c | 9 +++------
>>>> 6 files changed, 17 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
>>>> index 84fb4cc67d5e..b40a84b01085 100644
>>>> --- a/drivers/md/md-bitmap.c
>>>> +++ b/drivers/md/md-bitmap.c
>>>> @@ -1726,7 +1726,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
>>>> }
>>>>
>>>> static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
>>>> - unsigned long sectors, bool success)
>>>> + unsigned long sectors)
>>>> {
>>>> struct bitmap *bitmap = mddev->bitmap;
>>>>
>>>> @@ -1745,15 +1745,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
>>>> return;
>>>> }
>>>>
>>>> - if (success && !bitmap->mddev->degraded &&
>>>> - bitmap->events_cleared < bitmap->mddev->events) {
>>>> - bitmap->events_cleared = bitmap->mddev->events;
>>>> - bitmap->need_sync = 1;
>>>> - sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
>>>> - }
>>>> -
>>>> - if (!success && !NEEDED(*bmc))
>>>> + if (!bitmap->mddev->degraded) {
>>>> + if (bitmap->events_cleared < bitmap->mddev->events) {
>>>> + bitmap->events_cleared = bitmap->mddev->events;
>>>> + bitmap->need_sync = 1;
>>>> + sysfs_notify_dirent_safe(
>>>> + bitmap->sysfs_can_clear);
>>>> + }
>>>> + } else if (!NEEDED(*bmc)) {
>>>> *bmc |= NEEDED_MASK;
>>>> + }
>>>>
>>>> if (COUNTER(*bmc) == COUNTER_MAX)
>>>> wake_up(&bitmap->overflow_wait);
>>>> diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
>>>> index e87a1f493d3c..31c93019c76b 100644
>>>> --- a/drivers/md/md-bitmap.h
>>>> +++ b/drivers/md/md-bitmap.h
>>>> @@ -92,7 +92,7 @@ struct bitmap_operations {
>>>> int (*startwrite)(struct mddev *mddev, sector_t offset,
>>>> unsigned long sectors);
>>>> void (*endwrite)(struct mddev *mddev, sector_t offset,
>>>> - unsigned long sectors, bool success);
>>>> + unsigned long sectors);
>>>> bool (*start_sync)(struct mddev *mddev, sector_t offset,
>>>> sector_t *blocks, bool degraded);
>>>> void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
>>>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>>>> index 15ba7a001f30..81dff2cea0db 100644
>>>> --- a/drivers/md/raid1.c
>>>> +++ b/drivers/md/raid1.c
>>>> @@ -423,8 +423,7 @@ static void close_write(struct r1bio *r1_bio)
>>>> if (test_bit(R1BIO_BehindIO, &r1_bio->state))
>>>> mddev->bitmap_ops->end_behind_write(mddev);
>>>> /* clear the bitmap if all writes complete successfully */
>>>> - mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
>>>> - !test_bit(R1BIO_Degraded, &r1_bio->state));
>>>> + mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
>>>> md_write_end(mddev);
>>>> }
>>>>
>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>> index c3a93b2a26a6..3dc0170125b2 100644
>>>> --- a/drivers/md/raid10.c
>>>> +++ b/drivers/md/raid10.c
>>>> @@ -429,8 +429,7 @@ static void close_write(struct r10bio *r10_bio)
>>>> struct mddev *mddev = r10_bio->mddev;
>>>>
>>>> /* clear the bitmap if all writes complete successfully */
>>>> - mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
>>>> - !test_bit(R10BIO_Degraded, &r10_bio->state));
>>>> + mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
>>>> md_write_end(mddev);
>>>> }
>>>>
>>>> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
>>>> index 4c7ecdd5c1f3..ba4f9577c737 100644
>>>> --- a/drivers/md/raid5-cache.c
>>>> +++ b/drivers/md/raid5-cache.c
>>>> @@ -314,8 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
>>>> set_bit(R5_UPTODATE, &sh->dev[i].flags);
>>>> r5c_return_dev_pending_writes(conf, &sh->dev[i]);
>>>> conf->mddev->bitmap_ops->endwrite(conf->mddev,
>>>> - sh->sector, RAID5_STRIPE_SECTORS(conf),
>>>> - !test_bit(STRIPE_DEGRADED, &sh->state));
>>>> + sh->sector, RAID5_STRIPE_SECTORS(conf));
>>>> }
>>>> }
>>>> }
>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>> index 93cc7e252dd4..6eb2841ce28c 100644
>>>> --- a/drivers/md/raid5.c
>>>> +++ b/drivers/md/raid5.c
>>>> @@ -3664,8 +3664,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>>>> }
>>>> if (bitmap_end)
>>>> conf->mddev->bitmap_ops->endwrite(conf->mddev,
>>>> - sh->sector, RAID5_STRIPE_SECTORS(conf),
>>>> - false);
>>>> + sh->sector, RAID5_STRIPE_SECTORS(conf));
>>>> bitmap_end = 0;
>>>> /* and fail all 'written' */
>>>> bi = sh->dev[i].written;
>>>> @@ -3711,8 +3710,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>>>> }
>>>> if (bitmap_end)
>>>> conf->mddev->bitmap_ops->endwrite(conf->mddev,
>>>> - sh->sector, RAID5_STRIPE_SECTORS(conf),
>>>> - false);
>>>> + sh->sector, RAID5_STRIPE_SECTORS(conf));
>>>> /* If we were in the middle of a write the parity block might
>>>> * still be locked - so just clear all R5_LOCKED flags
>>>> */
>>>> @@ -4062,8 +4060,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
>>>> wbi = wbi2;
>>>> }
>>>> conf->mddev->bitmap_ops->endwrite(conf->mddev,
>>>> - sh->sector, RAID5_STRIPE_SECTORS(conf),
>>>> - !test_bit(STRIPE_DEGRADED, &sh->state));
>>>> + sh->sector, RAID5_STRIPE_SECTORS(conf));
>>>> if (head_sh->batch_head) {
>>>> sh = list_first_entry(&sh->batch_list,
>>>> struct stripe_head,
>>>> --
>>>> 2.43.0
>>>>
>>>>
>>>
>>>
>>> .
>>>
>>
>
>
> .
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 md-6.14 4/5] md/raid5: implement pers->bitmap_sector()
2025-01-01 5:36 ` Xiao Ni
@ 2025-01-02 1:21 ` Yu Kuai
0 siblings, 0 replies; 14+ messages in thread
From: Yu Kuai @ 2025-01-02 1:21 UTC (permalink / raw)
To: Xiao Ni, yukuai
Cc: song, linux-raid, linux-kernel, yi.zhang, yangerkun, yukuai (C)
Hi,
在 2025/01/01 13:36, Xiao Ni 写道:
> On Wed, Dec 18, 2024 at 8:22 PM <yukuai@kernel.org> wrote:
>>
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Bitmap is used for the whole array for raid1/raid10, hence IO for the
>> array can be used directly for bitmap. However, bitmap is used for
>> underlying disks for raid5, hence IO for the array can't be used
>> directly for bitmap.
>>
>> Implement pers->bitmap_sector() for raid5 to convert IO ranges from the
>> array to the underlying disks.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> Signed-off-by: Yu Kuai <yukuai@kernel.org>
>> ---
>> drivers/md/raid5.c | 20 ++++++++++++++++++++
>> 1 file changed, 20 insertions(+)
>>
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 6eb2841ce28c..b2fe201b599d 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -2950,6 +2950,23 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
>> r5c_update_on_rdev_error(mddev, rdev);
>> }
>>
>> +static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset,
>> + unsigned long *sectors)
>> +{
>> + struct r5conf *conf = mddev->private;
>> + int sectors_per_chunk = conf->chunk_sectors * (conf->raid_disks -
>> + conf->max_degraded);
>> + sector_t start = *offset;
>> + sector_t end = start + *sectors;
>> + int dd_idx;
>> +
>> + start = round_down(start, sectors_per_chunk);
>> + end = round_up(end, sectors_per_chunk);
>> +
>> + *offset = raid5_compute_sector(conf, start, 0, &dd_idx, NULL);
>> + *sectors = raid5_compute_sector(conf, end, 0, &dd_idx, NULL) - *offset;
>> +}
>
> Hi Kuai
>
> This function needs to think about reshape like make_stripe_request does.
>
Yes, we need to handle reshape. However, we can't retry like
make_stripe_request() does. I guess I'll compute the sector for both
before and after reshape, and set the bits together.
Thanks,
Kuai
> Regards
> Xiao
>> +
>> /*
>> * Input: a 'big' sector number,
>> * Output: index of the data and parity disk, and the sector # in them.
>> @@ -8972,6 +8989,7 @@ static struct md_personality raid6_personality =
>> .takeover = raid6_takeover,
>> .change_consistency_policy = raid5_change_consistency_policy,
>> .prepare_suspend = raid5_prepare_suspend,
>> + .bitmap_sector = raid5_bitmap_sector,
>> };
>> static struct md_personality raid5_personality =
>> {
>> @@ -8997,6 +9015,7 @@ static struct md_personality raid5_personality =
>> .takeover = raid5_takeover,
>> .change_consistency_policy = raid5_change_consistency_policy,
>> .prepare_suspend = raid5_prepare_suspend,
>> + .bitmap_sector = raid5_bitmap_sector,
>> };
>>
>> static struct md_personality raid4_personality =
>> @@ -9023,6 +9042,7 @@ static struct md_personality raid4_personality =
>> .takeover = raid4_takeover,
>> .change_consistency_policy = raid5_change_consistency_policy,
>> .prepare_suspend = raid5_prepare_suspend,
>> + .bitmap_sector = raid5_bitmap_sector,
>> };
>>
>> static int __init raid5_init(void)
>> --
>> 2.43.0
>>
>>
>
> .
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v2 md-6.14 5/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer
2024-12-18 12:17 ` [PATCH v2 md-6.14 5/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
@ 2025-01-02 4:36 ` Xiao Ni
0 siblings, 0 replies; 14+ messages in thread
From: Xiao Ni @ 2025-01-02 4:36 UTC (permalink / raw)
To: yukuai, song, yukuai3; +Cc: linux-raid, linux-kernel, yi.zhang, yangerkun
在 2024/12/18 下午8:17, yukuai@kernel.org 写道:
> From: Yu Kuai <yukuai3@huawei.com>
>
> There are two BUG reports that raid5 will hang at
> bitmap_startwrite([1],[2]), root cause is that bitmap start write and end
> write is unbalanced. For example, handle_stripe_clean_event() doesn't
> check if stripe->dev[].towrite is NULL after tag 'returnbi', and extra
> bitmap_endwrite() will be called.
Hi Kuai
bitmap startwrite is called when a bio is added to dev[].to_write while
to_write is NULL. And a full write is needed when a stripe is added to
the batch list. So in handle_stripe_clean_event, dev[].written must have
a value when it goes to returnbi. Doesn't that mean the unbalanced case
you mentioned doesn't exist? If so, it should not be the root cause of the
two reported problems mentioned below.
>
> While reviewing raid5 code, it's found that bitmap operations can be
> optimized. For example, for a 4 disks raid5, with chunksize=8k, if user
> issue a IO (0 + 48k) to the array:
>
> ┌────────────────────────────────────────────────────────────┐
> │chunk 0 │
> │ ┌────────────┬─────────────┬─────────────┬────────────┼
> │ sh0 │A0: 0 + 4k │A1: 8k + 4k │A2: 16k + 4k │A3: P │
> │ ┼────────────┼─────────────┼─────────────┼────────────┼
> │ sh1 │B0: 4k + 4k │B1: 12k + 4k │B2: 20k + 4k │B3: P │
> ┼──────┴────────────┴─────────────┴─────────────┴────────────┼
> │chunk 1 │
> │ ┌────────────┬─────────────┬─────────────┬────────────┤
> │ sh2 │C0: 24k + 4k│C1: 32k + 4k │C2: P │C3: 40k + 4k│
> │ ┼────────────┼─────────────┼─────────────┼────────────┼
> │ sh3 │D0: 28k + 4k│D1: 36k + 4k │D2: P │D3: 44k + 4k│
> └──────┴────────────┴─────────────┴─────────────┴────────────┘
>
> Before this patch, 4 stripe head will be used, and each sh will attach
> bio for 3 disks, and each attached bio will trigger
> bitmap_startwrite() once, which means total 12 times.
> - 3 times (0 + 4k), for (A0, A1 and A2)
> - 3 times (4 + 4k), for (B0, B1 and B2)
> - 3 times (8 + 4k), for (C0, C1 and C3)
> - 3 times (12 + 4k), for (D0, D1 and D3)
>
> After this patch, md upper layer will calculate that IO range (0 + 48k)
> is corresponding to the bitmap (0 + 16k), and call bitmap_startwrite()
> just once.
>
> Noted that this patch will align bitmap ranges to the chunks, for example,
> if user issue a IO (0 + 4k) to array:
>
> - Before this patch, 1 time (0 + 4k), for A0;
> - After this patch, 1 time (0 + 8k) for chunk 0;
>
> Usually, one bitmap bit will represent more than one disk chunk, and this
> doesn't have any difference. And even if user really created a array
> that one chunk contain multiple bits, the overhead is that more data
> will be recovered after power failure.
>
> [1] https://lore.kernel.org/all/CAJpMwyjmHQLvm6zg1cmQErttNNQPDAAXPKM3xgTjMhbfts986Q@mail.gmail.com/
> [2] https://lore.kernel.org/all/ADF7D720-5764-4AF3-B68E-1845988737AA@flyingcircus.io/
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> Signed-off-by: Yu Kuai <yukuai@kernel.org>
> ---
> drivers/md/md.c | 29 +++++++++++++++++++++++++++++
> drivers/md/md.h | 2 ++
> drivers/md/raid1.c | 4 ----
> drivers/md/raid10.c | 3 ---
> drivers/md/raid5-cache.c | 2 --
> drivers/md/raid5.c | 24 +-----------------------
> 6 files changed, 32 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index aebe12b0ee27..c60ae2c70102 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -8745,12 +8745,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
> }
> EXPORT_SYMBOL_GPL(md_submit_discard_bio);
>
> +static void md_bitmap_start(struct mddev *mddev,
> + struct md_io_clone *md_io_clone)
> +{
> + if (mddev->pers->bitmap_sector)
> + mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
> + &md_io_clone->sectors);
> +
> + mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
> + md_io_clone->sectors);
> +}
> +
> +static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
> +{
> + mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
> + md_io_clone->sectors);
> +}
> +
> static void md_end_clone_io(struct bio *bio)
> {
> struct md_io_clone *md_io_clone = bio->bi_private;
> struct bio *orig_bio = md_io_clone->orig_bio;
> struct mddev *mddev = md_io_clone->mddev;
>
> + if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
> + md_bitmap_end(mddev, md_io_clone);
> +
> if (bio->bi_status && !orig_bio->bi_status)
> orig_bio->bi_status = bio->bi_status;
>
> @@ -8775,6 +8795,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
> if (blk_queue_io_stat(bdev->bd_disk->queue))
> md_io_clone->start_time = bio_start_io_acct(*bio);
>
> + if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
> + md_io_clone->offset = (*bio)->bi_iter.bi_sector;
> + md_io_clone->sectors = bio_sectors(*bio);
> + md_bitmap_start(mddev, md_io_clone);
> + }
> +
> clone->bi_end_io = md_end_clone_io;
> clone->bi_private = md_io_clone;
> *bio = clone;
> @@ -8793,6 +8819,9 @@ void md_free_cloned_bio(struct bio *bio)
> struct bio *orig_bio = md_io_clone->orig_bio;
> struct mddev *mddev = md_io_clone->mddev;
>
> + if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
> + md_bitmap_end(mddev, md_io_clone);
> +
> if (bio->bi_status && !orig_bio->bi_status)
> orig_bio->bi_status = bio->bi_status;
>
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index de6dadb9a40b..def808064ad8 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -831,6 +831,8 @@ struct md_io_clone {
> struct mddev *mddev;
> struct bio *orig_bio;
> unsigned long start_time;
> + sector_t offset;
> + unsigned long sectors;
> struct bio bio_clone;
> };
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 81dff2cea0db..b5a5766cccf7 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -422,8 +422,6 @@ static void close_write(struct r1bio *r1_bio)
>
> if (test_bit(R1BIO_BehindIO, &r1_bio->state))
> mddev->bitmap_ops->end_behind_write(mddev);
> - /* clear the bitmap if all writes complete successfully */
> - mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
> md_write_end(mddev);
> }
>
> @@ -1647,8 +1645,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>
> if (test_bit(R1BIO_BehindIO, &r1_bio->state))
> mddev->bitmap_ops->start_behind_write(mddev);
> - mddev->bitmap_ops->startwrite(mddev, r1_bio->sector,
> - r1_bio->sectors);
> first_clone = 0;
> }
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 3dc0170125b2..2fe8e6f96057 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -428,8 +428,6 @@ static void close_write(struct r10bio *r10_bio)
> {
> struct mddev *mddev = r10_bio->mddev;
>
> - /* clear the bitmap if all writes complete successfully */
> - mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
> md_write_end(mddev);
> }
>
> @@ -1517,7 +1515,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
> md_account_bio(mddev, &bio);
> r10_bio->master_bio = bio;
> atomic_set(&r10_bio->remaining, 1);
> - mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors);
>
> for (i = 0; i < conf->copies; i++) {
> if (r10_bio->devs[i].bio)
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index ba4f9577c737..011246e16a99 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -313,8 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
> if (sh->dev[i].written) {
> set_bit(R5_UPTODATE, &sh->dev[i].flags);
> r5c_return_dev_pending_writes(conf, &sh->dev[i]);
> - conf->mddev->bitmap_ops->endwrite(conf->mddev,
> - sh->sector, RAID5_STRIPE_SECTORS(conf));
> }
> }
> }
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index b2fe201b599d..017439e2af03 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -3578,12 +3578,6 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
> * is added to a batch, STRIPE_BIT_DELAY cannot be changed
> * any more.
> */
> - set_bit(STRIPE_BITMAP_PENDING, &sh->state);
> - spin_unlock_irq(&sh->stripe_lock);
> - conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
> - RAID5_STRIPE_SECTORS(conf));
> - spin_lock_irq(&sh->stripe_lock);
> - clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
> if (!sh->batch_head) {
> sh->bm_seq = conf->seq_flush+1;
> set_bit(STRIPE_BIT_DELAY, &sh->state);
> @@ -3638,7 +3632,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> BUG_ON(sh->batch_head);
> for (i = disks; i--; ) {
> struct bio *bi;
> - int bitmap_end = 0;
>
> if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
> struct md_rdev *rdev = conf->disks[i].rdev;
> @@ -3663,8 +3656,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> sh->dev[i].towrite = NULL;
> sh->overwrite_disks = 0;
> spin_unlock_irq(&sh->stripe_lock);
> - if (bi)
> - bitmap_end = 1;
>
> log_stripe_write_finished(sh);
>
> @@ -3679,10 +3670,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> bio_io_error(bi);
> bi = nextbi;
> }
> - if (bitmap_end)
> - conf->mddev->bitmap_ops->endwrite(conf->mddev,
> - sh->sector, RAID5_STRIPE_SECTORS(conf));
> - bitmap_end = 0;
> /* and fail all 'written' */
> bi = sh->dev[i].written;
> sh->dev[i].written = NULL;
> @@ -3691,7 +3678,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> sh->dev[i].page = sh->dev[i].orig_page;
> }
>
> - if (bi) bitmap_end = 1;
> while (bi && bi->bi_iter.bi_sector <
> sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
> struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
> @@ -3725,9 +3711,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
> bi = nextbi;
> }
> }
> - if (bitmap_end)
> - conf->mddev->bitmap_ops->endwrite(conf->mddev,
> - sh->sector, RAID5_STRIPE_SECTORS(conf));
> /* If we were in the middle of a write the parity block might
> * still be locked - so just clear all R5_LOCKED flags
> */
> @@ -4076,8 +4059,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
> bio_endio(wbi);
> wbi = wbi2;
> }
> - conf->mddev->bitmap_ops->endwrite(conf->mddev,
> - sh->sector, RAID5_STRIPE_SECTORS(conf));
> +
> if (head_sh->batch_head) {
> sh = list_first_entry(&sh->batch_list,
> struct stripe_head,
> @@ -5797,10 +5779,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
> }
> spin_unlock_irq(&sh->stripe_lock);
> if (conf->mddev->bitmap) {
> - for (d = 0; d < conf->raid_disks - conf->max_degraded;
> - d++)
> - mddev->bitmap_ops->startwrite(mddev, sh->sector,
> - RAID5_STRIPE_SECTORS(conf));
> sh->bm_seq = conf->seq_flush + 1;
> set_bit(STRIPE_BIT_DELAY, &sh->state);
> }
For the patch itself, I'm good. I did a sequential write performance
test; it indeed improves the performance by about 20% with nvme devices.
Reviewed-by: Xiao Ni <xni@redhat.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v2 md-6.14 1/5] md/md-bitmap: factor behind write counters out from bitmap_{start/end}write()
2025-01-09 1:51 [PATCH v2 md-6.14 0/5] " Yu Kuai
@ 2025-01-09 1:51 ` Yu Kuai
0 siblings, 0 replies; 14+ messages in thread
From: Yu Kuai @ 2025-01-09 1:51 UTC (permalink / raw)
To: song, xni, yukuai3; +Cc: linux-raid, linux-kernel, yukuai1, yi.zhang, yangerkun
From: Yu Kuai <yukuai3@huawei.com>
behind_write is only used in raid1, prepare to refactor
bitmap_{start/end}write(), there are no functional changes.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
---
drivers/md/md-bitmap.c | 57 +++++++++++++++++++++++++---------------
drivers/md/md-bitmap.h | 7 +++--
drivers/md/raid1.c | 12 +++++----
drivers/md/raid10.c | 6 ++---
drivers/md/raid5-cache.c | 3 +--
drivers/md/raid5.c | 11 ++++----
6 files changed, 56 insertions(+), 40 deletions(-)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index c3a42dd66ce5..84fb4cc67d5e 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1671,24 +1671,13 @@ __acquires(bitmap->lock)
}
static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool behind)
+ unsigned long sectors)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return 0;
- if (behind) {
- int bw;
- atomic_inc(&bitmap->behind_writes);
- bw = atomic_read(&bitmap->behind_writes);
- if (bw > bitmap->behind_writes_used)
- bitmap->behind_writes_used = bw;
-
- pr_debug("inc write-behind count %d/%lu\n",
- bw, bitmap->mddev->bitmap_info.max_write_behind);
- }
-
while (sectors) {
sector_t blocks;
bitmap_counter_t *bmc;
@@ -1737,21 +1726,13 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
}
static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success, bool behind)
+ unsigned long sectors, bool success)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
- if (behind) {
- if (atomic_dec_and_test(&bitmap->behind_writes))
- wake_up(&bitmap->behind_wait);
- pr_debug("dec write-behind count %d/%lu\n",
- atomic_read(&bitmap->behind_writes),
- bitmap->mddev->bitmap_info.max_write_behind);
- }
-
while (sectors) {
sector_t blocks;
unsigned long flags;
@@ -2062,6 +2043,37 @@ static void md_bitmap_free(void *data)
kfree(bitmap);
}
+static void bitmap_start_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ int bw;
+
+ if (!bitmap)
+ return;
+
+ atomic_inc(&bitmap->behind_writes);
+ bw = atomic_read(&bitmap->behind_writes);
+ if (bw > bitmap->behind_writes_used)
+ bitmap->behind_writes_used = bw;
+
+ pr_debug("inc write-behind count %d/%lu\n",
+ bw, bitmap->mddev->bitmap_info.max_write_behind);
+}
+
+static void bitmap_end_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return;
+
+ if (atomic_dec_and_test(&bitmap->behind_writes))
+ wake_up(&bitmap->behind_wait);
+ pr_debug("dec write-behind count %d/%lu\n",
+ atomic_read(&bitmap->behind_writes),
+ bitmap->mddev->bitmap_info.max_write_behind);
+}
+
static void bitmap_wait_behind_writes(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
@@ -2981,6 +2993,9 @@ static struct bitmap_operations bitmap_ops = {
.dirty_bits = bitmap_dirty_bits,
.unplug = bitmap_unplug,
.daemon_work = bitmap_daemon_work,
+
+ .start_behind_write = bitmap_start_behind_write,
+ .end_behind_write = bitmap_end_behind_write,
.wait_behind_writes = bitmap_wait_behind_writes,
.startwrite = bitmap_startwrite,
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 662e6fc141a7..e87a1f493d3c 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -84,12 +84,15 @@ struct bitmap_operations {
unsigned long e);
void (*unplug)(struct mddev *mddev, bool sync);
void (*daemon_work)(struct mddev *mddev);
+
+ void (*start_behind_write)(struct mddev *mddev);
+ void (*end_behind_write)(struct mddev *mddev);
void (*wait_behind_writes)(struct mddev *mddev);
int (*startwrite)(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool behind);
+ unsigned long sectors);
void (*endwrite)(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success, bool behind);
+ unsigned long sectors, bool success);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 519c56f0ee3d..15ba7a001f30 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -420,10 +420,11 @@ static void close_write(struct r1bio *r1_bio)
r1_bio->behind_master_bio = NULL;
}
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->end_behind_write(mddev);
/* clear the bitmap if all writes complete successfully */
mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ !test_bit(R1BIO_Degraded, &r1_bio->state));
md_write_end(mddev);
}
@@ -1645,9 +1646,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
stats.behind_writes < max_write_behind)
alloc_behind_master_bio(r1_bio, bio);
- mddev->bitmap_ops->startwrite(
- mddev, r1_bio->sector, r1_bio->sectors,
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->start_behind_write(mddev);
+ mddev->bitmap_ops->startwrite(mddev, r1_bio->sector,
+ r1_bio->sectors);
first_clone = 0;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7d7a8a2524dc..c3a93b2a26a6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -430,8 +430,7 @@ static void close_write(struct r10bio *r10_bio)
/* clear the bitmap if all writes complete successfully */
mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
- !test_bit(R10BIO_Degraded, &r10_bio->state),
- false);
+ !test_bit(R10BIO_Degraded, &r10_bio->state));
md_write_end(mddev);
}
@@ -1519,8 +1518,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
- mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors,
- false);
+ mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index b4f7b79fd187..4c7ecdd5c1f3 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -315,8 +315,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- false);
+ !test_bit(STRIPE_DEGRADED, &sh->state));
}
}
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f09e7677ee9f..93cc7e252dd4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3564,7 +3564,7 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf), false);
+ RAID5_STRIPE_SECTORS(conf));
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
@@ -3665,7 +3665,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bitmap_end)
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
- false, false);
+ false);
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
@@ -3712,7 +3712,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bitmap_end)
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
- false, false);
+ false);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -4063,8 +4063,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
}
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- false);
+ !test_bit(STRIPE_DEGRADED, &sh->state));
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -5787,7 +5786,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
for (d = 0; d < conf->raid_disks - conf->max_degraded;
d++)
mddev->bitmap_ops->startwrite(mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf), false);
+ RAID5_STRIPE_SECTORS(conf));
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
--
2.39.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
end of thread, other threads:[~2025-01-09 1:56 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-12-18 12:17 [PATCH v2 md-6.14 0/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 1/5] md/md-bitmap: factor behind write counters out from bitmap_{start/end}write() yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 2/5] md/md-bitmap: remove the last parameter for bimtap_ops->endwrite() yukuai
2024-12-23 7:31 ` Xiao Ni
2024-12-23 7:50 ` Yu Kuai
2024-12-30 5:01 ` Xiao Ni
2025-01-02 1:17 ` Yu Kuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 3/5] md: add a new callback pers->bitmap_sector() yukuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 4/5] md/raid5: implement pers->bitmap_sector() yukuai
2025-01-01 5:36 ` Xiao Ni
2025-01-02 1:21 ` Yu Kuai
2024-12-18 12:17 ` [PATCH v2 md-6.14 5/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer yukuai
2025-01-02 4:36 ` Xiao Ni
-- strict thread matches above, loose matches on Subject: below --
2025-01-09 1:51 [PATCH v2 md-6.14 0/5] " Yu Kuai
2025-01-09 1:51 ` [PATCH v2 md-6.14 1/5] md/md-bitmap: factor behind write counters out from bitmap_{start/end}write() Yu Kuai
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox