* [PATCH 1/2] md bitmap bug fixes
@ 2005-03-09 22:18 Paul Clements
2005-03-09 22:19 ` [PATCH 2/2] " Paul Clements
2005-03-14 4:43 ` [PATCH 1/2] " Neil Brown
0 siblings, 2 replies; 71+ messages in thread
From: Paul Clements @ 2005-03-09 22:18 UTC (permalink / raw)
To: neilb; +Cc: linux-raid
[-- Attachment #1: Type: text/plain, Size: 1035 bytes --]
Neil,
here are a couple of patches -- this one for the kernel, the next for
mdadm. They fix a few issues that I found while testing the new bitmap
intent logging code.
Briefly, the issues were:
kernel:
added call to bitmap_daemon_work() from raid1d so that the bitmap would
actually get cleared
fixed the marking of pages with BITMAP_CLEAN so that the bitmap would
get cleared correctly after resync and normal write I/O
pass back errors from write_page() since it now does actual writes itself
sync_size changed to sectors (it was array_size, which was in KB) -- some
divisions by 2 were needed when printing it as KB
mdadm:
avoid setting sb->events_lo = 1 when creating a 0.90 superblock -- it
doesn't seem to be necessary, and it was causing the event counters to
start at 4 billion+ (events_lo is actually the high part of the events
counter, on little endian machines anyway -- see the sketch after this list)
some sync_size changes, as in the kernel
if'ed out super1 definition which is now in the kernel headers
included sys/time.h to avoid compile error
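(As an illustration of the events_lo point above, here is a minimal user-space
sketch -- not the md or mdadm code; the two-field struct and the way the halves
are combined below are assumptions made purely for the demonstration -- showing
how a 32-bit events_lo = 1 can surface as a count of 4 billion+ when the halves
end up feeding the wrong ends of the 64-bit counter:

#include <stdio.h>
#include <stdint.h>

struct sb090 {                  /* hypothetical: two 32-bit halves, as in v0.90 */
	uint32_t events_hi;
	uint32_t events_lo;
};

int main(void)
{
	struct sb090 sb = { .events_hi = 0, .events_lo = 1 };

	/* correct combination: the "hi" field supplies the upper 32 bits */
	uint64_t good = ((uint64_t)sb.events_hi << 32) | sb.events_lo;

	/* the observed symptom looks like the opposite pairing: the field
	 * called "lo" ends up feeding the upper half of the counter */
	uint64_t swapped = ((uint64_t)sb.events_lo << 32) | sb.events_hi;

	printf("expected : %llu\n", (unsigned long long)good);    /* 1 */
	printf("observed : %llu\n", (unsigned long long)swapped); /* 4294967296 */
	return 0;
}

Built with any C compiler, the "swapped" pairing prints 4294967296, which
matches the symptom described above.)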
Thanks,
Paul
[-- Attachment #2: md_bitmap_3_38_bug_fix.diff --]
[-- Type: text/plain, Size: 6800 bytes --]
Signed-Off-By: Paul Clements <paul.clements@steeleye.com>
bitmap.c | 58 +++++++++++++++++++++++++++++++++-------------------------
raid1.c | 1 +
2 files changed, 34 insertions(+), 25 deletions(-)
diff -purN --exclude-from /export/public/clemep/tmp/dontdiff linux-2.6.11-rc3-mm2-patch-all/drivers/md/bitmap.c linux-2.6.11-rc3-mm2-patch-all-bitmap-bug-fix/drivers/md/bitmap.c
--- linux-2.6.11-rc3-mm2-patch-all/drivers/md/bitmap.c Fri Feb 18 15:44:03 2005
+++ linux-2.6.11-rc3-mm2-patch-all-bitmap-bug-fix/drivers/md/bitmap.c Wed Mar 9 14:55:03 2005
@@ -265,6 +265,7 @@ static int write_page(struct page *page,
{
int ret = -ENOMEM;
+ PRINTK("bitmap write page %lu\n", page->index);
lock_page(page);
if (page->mapping == NULL)
@@ -350,8 +351,7 @@ int bitmap_update_sb(struct bitmap *bitm
if (!bitmap->mddev->degraded)
sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
kunmap(bitmap->sb_page);
- write_page(bitmap->sb_page, 0);
- return 0;
+ return write_page(bitmap->sb_page, 0);
}
/* print out the bitmap file superblock */
@@ -363,21 +363,22 @@ void bitmap_print_sb(struct bitmap *bitm
return;
sb = (bitmap_super_t *)kmap(bitmap->sb_page);
printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
- printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
- printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
- printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n",
+ printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
+ printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
+ printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n",
*(__u32 *)(sb->uuid+0),
*(__u32 *)(sb->uuid+4),
*(__u32 *)(sb->uuid+8),
*(__u32 *)(sb->uuid+12));
- printk(KERN_DEBUG " events: %llu\n",
+ printk(KERN_DEBUG " events: %llu\n",
(unsigned long long) le64_to_cpu(sb->events));
- printk(KERN_DEBUG "events_clred: %llu\n",
+ printk(KERN_DEBUG "events cleared: %llu\n",
(unsigned long long) le64_to_cpu(sb->events_cleared));
- printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state));
- printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize));
- printk(KERN_DEBUG "daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
- printk(KERN_DEBUG " sync size: %llu KB\n", le64_to_cpu(sb->sync_size));
+ printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state));
+ printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize));
+ printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
+ printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long)
+ le64_to_cpu(sb->sync_size) / 2);
kunmap(bitmap->sb_page);
}
@@ -734,7 +735,8 @@ int bitmap_unplug(struct bitmap *bitmap)
spin_unlock_irqrestore(&bitmap->lock, flags);
if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE))
- write_page(page, 0);
+ if (write_page(page, 0) != 0)
+ return 1;
}
if (wait) { /* if any writes were performed, we need to wait on them */
spin_lock_irq(&bitmap->write_lock);
@@ -795,7 +797,7 @@ static int bitmap_init_from_disk(struct
bytes + sizeof(bitmap_super_t));
goto out;
}
- num_pages++;
+ // PRC: ???: num_pages++;
bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
if (!bitmap->filemap) {
ret = -ENOMEM;
@@ -953,14 +958,18 @@ int bitmap_daemon_work(struct bitmap *bi
bit = file_page_offset(j);
+
if (page != lastpage) {
+ PRINTK("bitmap clean at page %lu\n", j);
/* grab the new page, sync and release the old */
page_cache_get(page);
if (lastpage != NULL) {
if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
- write_page(lastpage, 0);
+ err = write_page(lastpage, 0);
+ /* we're done cleaning */
+ clear_page_attr(bitmap, lastpage, BITMAP_PAGE_CLEAN);
} else {
set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -969,22 +978,21 @@ int bitmap_daemon_work(struct bitmap *bi
page_cache_release(lastpage);
if (err)
bitmap_file_kick(bitmap);
+ /* we're done cleaning this page */
+ clear_page_attr(bitmap, lastpage, BITMAP_PAGE_CLEAN);
} else
spin_unlock_irqrestore(&bitmap->lock, flags);
lastpage = page;
kmap(page);
-/*
- printk("bitmap clean at page %lu\n", j);
-*/
+
spin_lock_irqsave(&bitmap->lock, flags);
- clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
}
bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
&blocks, 0);
if (bmc) {
-/*
- if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
-*/
+
+PRINTK("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
+
if (*bmc == 2) {
*bmc=1; /* maybe clear the bit next time */
set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
@@ -1005,10 +1013,12 @@ int bitmap_daemon_work(struct bitmap *bi
if (lastpage != NULL) {
kunmap(lastpage);
spin_lock_irqsave(&bitmap->lock, flags);
- if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
+ if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
- write_page(lastpage, 0);
+ err = write_page(lastpage, 0);
+ /* we're done cleaning and we've written the page out */
+ clear_page_attr(bitmap, lastpage, BITMAP_PAGE_CLEAN);
} else {
set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1118,7 +1128,7 @@ static int bitmap_start_daemon(struct bi
md_wakeup_thread(daemon); /* start it running */
PRINTK("%s: %s daemon (pid %d) started...\n",
- bmname(bitmap), name, bitmap->daemon->tsk->pid);
+ bmname(bitmap), name, daemon->tsk->pid);
out_unlock:
spin_unlock_irqrestore(&bitmap->lock, flags);
return 0;
@@ -1381,7 +1404,8 @@ int bitmap_setallbits(struct bitmap *bit
spin_unlock_irqrestore(&bitmap->lock, flags);
memset(kmap(page), 0xff, PAGE_SIZE);
kunmap(page);
- write_page(page, 0);
+ if (write_page(page, 0) != 0)
+ return 1;
}
return 0;
diff -purN --exclude-from /export/public/clemep/tmp/dontdiff linux-2.6.11-rc3-mm2-patch-all/drivers/md/raid1.c linux-2.6.11-rc3-mm2-patch-all-bitmap-bug-fix/drivers/md/raid1.c
--- linux-2.6.11-rc3-mm2-patch-all/drivers/md/raid1.c Fri Feb 18 15:44:03 2005
+++ linux-2.6.11-rc3-mm2-patch-all-bitmap-bug-fix/drivers/md/raid1.c Mon Mar 7 15:25:52 2005
@@ -1001,6 +1003,7 @@ static void raid1d(mddev_t *mddev)
mdk_rdev_t *rdev;
md_check_recovery(mddev);
+ bitmap_daemon_work(mddev->bitmap);
for (;;) {
char b[BDEVNAME_SIZE];
* [PATCH 2/2] md bitmap bug fixes
2005-03-09 22:18 [PATCH 1/2] md bitmap bug fixes Paul Clements
@ 2005-03-09 22:19 ` Paul Clements
2005-03-14 4:43 ` [PATCH 1/2] " Neil Brown
1 sibling, 0 replies; 71+ messages in thread
From: Paul Clements @ 2005-03-09 22:19 UTC (permalink / raw)
To: neilb; +Cc: linux-raid

[-- Attachment #1: Type: text/plain, Size: 1157 bytes --]

Here's the mdadm patch...

Paul Clements wrote:
> Neil,
>
> here are a couple of patches -- this one for the kernel, the next for
> mdadm. They fix a few issues that I found while testing the new bitmap
> intent logging code.
>
> Briefly, the issues were:
>
> kernel:
>
> added call to bitmap_daemon_work() from raid1d so that the bitmap would
> actually get cleared
>
> fixed the marking of pages with BITMAP_CLEAN so that the bitmap would
> get cleared correctly after resync and normal write I/O
>
> pass back errors from write_page() since it now does actual writes itself
>
> sync_size changed to sectors (was array_size which was KB) -- some
> divisions by 2 were needed
>
> mdadm:
>
> avoid setting of sb->events_lo = 1 when creating a 0.90 superblock -- it
> doesn't seem to be necessary and it was causing the event counters to
> start at 4 billion+ (events_lo is actually the high part of the events
> counter, on little endian machines anyway)
>
> some sync_size changes, as in the kernel
>
> if'ed out super1 definition which is now in the kernel headers
>
> included sys/time.h to avoid compile error
>
>
> Thanks,
> Paul

[-- Attachment #2: mdadm_2_0_devel_1_bitmap_bug_fix.diff --]
[-- Type: text/plain, Size: 3960 bytes --]

diff -purN --exclude makepkg --exclude rpm --exclude *.DIST --exclude md_u.h --exclude md_p.h --exclude bitmap.h --exclude mdadm.steeleye.spec --exclude-from /export/public/clemep/tmp/dontdiff mdadm-2.0-devel-1-PRISTINE/bitmap.c mdadm-2.0-devel-1-bitmap-bug-fix/bitmap.c
--- mdadm-2.0-devel-1-PRISTINE/bitmap.c	Sun Feb 13 22:00:00 2005
+++ mdadm-2.0-devel-1-bitmap-bug-fix/bitmap.c	Mon Mar 7 12:15:38 2005
@@ -168,7 +168,8 @@ bitmap_info_t *bitmap_fd_read(int fd, in
 	if (read_bits < total_bits) { /* file truncated...
 					*/
 		fprintf(stderr, Name ": WARNING: bitmap file is not large "
-			"enough for array size %llu!\n\n", info->sb.sync_size);
+			"enough for array size %lluKB (%llu/%llu)!\n\n",
+			info->sb.sync_size / 2, read_bits, total_bits);
 		total_bits = read_bits;
 	}
 out:
@@ -226,13 +227,16 @@ int ExamineBitmap(char *filename, int br
 		*(__u32 *)(sb->uuid+4),
 		*(__u32 *)(sb->uuid+8),
 		*(__u32 *)(sb->uuid+12));
-	printf("          Events : %llu\n", sb->events);
-	printf("  Events Cleared : %llu\n", sb->events_cleared);
+	printf("          Events : %llu (%d.%llu)\n", sb->events,
+		(__u32)sb->events, sb->events >> 32);
+	printf("  Events Cleared : %llu (%d.%llu)\n", sb->events_cleared,
+		(__u32)sb->events_cleared,
+		sb->events_cleared >> 32);
 	printf("           State : %s\n", bitmap_state(sb->state));
 	printf("       Chunksize : %s\n", human_chunksize(sb->chunksize));
 	printf("          Daemon : %ds flush period\n", sb->daemon_sleep);
-	printf("       Sync Size : %llu%s\n", sb->sync_size,
-		human_size(sb->sync_size * 1024));
+	printf("       Sync Size : %lluKB%s\n", sb->sync_size / 2,
+		human_size(sb->sync_size * 512));
 	if (brief)
 		goto free_info;
 	printf("          Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n",
diff -purN --exclude makepkg --exclude rpm --exclude *.DIST --exclude md_u.h --exclude md_p.h --exclude bitmap.h --exclude mdadm.steeleye.spec --exclude-from /export/public/clemep/tmp/dontdiff mdadm-2.0-devel-1-PRISTINE/mdstat.c mdadm-2.0-devel-1-bitmap-bug-fix/mdstat.c
--- mdadm-2.0-devel-1-PRISTINE/mdstat.c	Tue Aug 10 21:28:50 2004
+++ mdadm-2.0-devel-1-bitmap-bug-fix/mdstat.c	Mon Mar 7 11:09:29 2005
@@ -86,6 +86,7 @@
 #include "mdadm.h"
 #include "dlink.h"
 #include <sys/select.h>
+#include <sys/time.h>

 void free_mdstat(struct mdstat_ent *ms)
 {
diff -purN --exclude makepkg --exclude rpm --exclude *.DIST --exclude md_u.h --exclude md_p.h --exclude bitmap.h --exclude mdadm.steeleye.spec --exclude-from /export/public/clemep/tmp/dontdiff mdadm-2.0-devel-1-PRISTINE/super0.c mdadm-2.0-devel-1-bitmap-bug-fix/super0.c
--- mdadm-2.0-devel-1-PRISTINE/super0.c	Sun Feb 13 21:59:45 2005
+++ mdadm-2.0-devel-1-bitmap-bug-fix/super0.c	Mon Mar 7 13:27:38 2005
@@ -364,7 +364,8 @@ static int init_super0(void **sbp, mdu_a
 	sb->failed_disks = info->failed_disks;
 	sb->spare_disks = info->spare_disks;
 	sb->events_hi = 0;
-	sb->events_lo = 1;
+	// PRC: why? sb->events_lo = 1;
+	sb->events_lo = 0;

 	sb->layout = info->layout;
 	sb->chunk_size = info->chunk_size;
diff -purN --exclude makepkg --exclude rpm --exclude *.DIST --exclude md_u.h --exclude md_p.h --exclude bitmap.h --exclude mdadm.steeleye.spec --exclude-from /export/public/clemep/tmp/dontdiff mdadm-2.0-devel-1-PRISTINE/super1.c mdadm-2.0-devel-1-bitmap-bug-fix/super1.c
--- mdadm-2.0-devel-1-PRISTINE/super1.c	Sun Feb 13 22:00:44 2005
+++ mdadm-2.0-devel-1-bitmap-bug-fix/super1.c	Mon Mar 7 11:34:16 2005
@@ -37,6 +37,7 @@
  * total size: 256 bytes plus 2 per device.
  * 1K allows 384 devices.
  */
+#if 0 // already in kernel headers:
 struct mdp_superblock_1 {
 	/* constant array information - 128 bytes */
 	__u32	magic;		/* MD_SB_MAGIC: 0xa92b4efc - little endian */
@@ -82,6 +83,8 @@ struct mdp_superblock_1 {
 	 */
 	__u16	dev_roles[0];	/* role in array, or 0xffff for a spare, or 0xfffe for faulty */
 };
+
+#endif

 #ifndef offsetof
 #define offsetof(t,f) ((int)&(((t*)0)->f))
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-09 22:18 [PATCH 1/2] md bitmap bug fixes Paul Clements
2005-03-09 22:19 ` [PATCH 2/2] " Paul Clements
@ 2005-03-14 4:43 ` Neil Brown
2005-03-14 9:44 ` Lars Marowsky-Bree
` (2 more replies)
1 sibling, 3 replies; 71+ messages in thread
From: Neil Brown @ 2005-03-14 4:43 UTC (permalink / raw)
To: Paul Clements; +Cc: linux-raid

On Wednesday March 9, paul.clements@steeleye.com wrote:
> Neil,
>
> here are a couple of patches -- this one for the kernel, the next for
> mdadm. They fix a few issues that I found while testing the new bitmap
> intent logging code.
>
> Briefly, the issues were:
>
> kernel:
>
> added call to bitmap_daemon_work() from raid1d so that the bitmap would
> actually get cleared

Yes... well... uhmm... how did I miss that??

I would actually rather call it from md_check_recovery (which is called
from raid1d). I should rename md_check_recovery at some stage as it does
more than its name says.

> fixed the marking of pages with BITMAP_CLEAN so that the bitmap would
> get cleared correctly after resync and normal write I/O

I don't agree with this bit. The BITMAP_PAGE_CLEAN bit needs to be
cleared before processing it rather than after as the current code does.
However I see that this causes it to ignore all but the first bit of the
page, so I have fixed that with:

diff ./drivers/md/bitmap.c~current~ ./drivers/md/bitmap.c
--- ./drivers/md/bitmap.c~current~	2005-03-14 11:51:29.000000000 +1100
+++ ./drivers/md/bitmap.c	2005-03-14 12:54:02.000000000 +1100
@@ -940,6 +940,7 @@ int bitmap_daemon_work(struct bitmap *bi
 		page = filemap_get_page(bitmap, j);

 		/* skip this page unless it's marked as needing cleaning */
+		if (page != lastpage)
 		if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
 			if (attr & BITMAP_PAGE_NEEDWRITE) {
 				page_cache_get(page);

> pass back errors from write_page() since it now does actual writes
> itself

Yep, thanks.

> sync_size changed to sectors (was array_size which was KB) -- some
> divisions by 2 were needed

Yes.

> mdadm:
>
> avoid setting of sb->events_lo = 1 when creating a 0.90 superblock -- it
> doesn't seem to be necessary and it was causing the event counters to
> start at 4 billion+ (events_lo is actually the high part of the events
> counter, on little endian machines anyway)

events_lo really should be the low part of the counter and it is for
me.... something funny must be happening for you...

> some sync_size changes, as in the kernel

Yep.

> if'ed out super1 definition which is now in the kernel headers

I don't like this. I don't want mdadm to include the kernel raid
headers. I want it to use its own.

> included sys/time.h to avoid compile error

I wonder why I don't get an error.. What error do you get?

NeilBrown
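(For anyone following the BITMAP_PAGE_CLEAN discussion, here is a small
compilable model of the two-pass lazy clearing that the attribute is there to
support. It is illustrative only -- locking, the per-page bookkeeping that the
"if (page != lastpage)" guard deals with, and the real write-out path are all
omitted, and the names are invented for the sketch:

#include <stdio.h>

#define PAGE_CLEAN 0x01          /* page holds counters that may be cleared soon */
#define PAGE_DIRTY 0x02          /* page needs writing back to the bitmap file   */

enum { CHUNKS = 8 };             /* chunks covered by one bitmap page (toy size) */

static unsigned counter[CHUNKS]; /* 2 = recently written, 1 = candidate, 0 = clear */
static unsigned page_attr;

/* one pass of the cleaning daemon over a single page */
static void daemon_pass(void)
{
	/* clear CLEAN up front: if any counter is still a candidate after
	 * this scan, it re-marks the page, so nothing is forgotten */
	page_attr &= ~PAGE_CLEAN;

	for (int j = 0; j < CHUNKS; j++) {
		if (counter[j] == 2) {
			counter[j] = 1;          /* maybe clear the bit next time */
			page_attr |= PAGE_CLEAN;
		} else if (counter[j] == 1) {
			counter[j] = 0;          /* two quiet passes: the on-disk */
			page_attr |= PAGE_DIRTY; /* bit can go, so write the page */
		}
	}
}

int main(void)
{
	counter[3] = 2;              /* pretend a write just hit chunk 3 */
	for (int pass = 1; pass <= 3; pass++) {
		daemon_pass();
		printf("pass %d: counter[3]=%u attr=0x%x\n",
		       pass, counter[3], page_attr);
	}
	return 0;                    /* write-out of the dirty page is omitted */
}

A recently written chunk sits at 2, drops to 1 on the first quiet pass, and
only on the second quiet pass is its on-disk bit cleared and the page queued
for writing.)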
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-14 4:43 ` [PATCH 1/2] " Neil Brown
@ 2005-03-14 9:44 ` Lars Marowsky-Bree
2005-03-14 10:22 ` Neil Brown
2005-03-15 4:24 ` [PATCH 1/2] md bitmap bug fixes Paul Clements
2005-03-17 20:51 ` [PATCH 0/3] md bitmap-based asynchronous writes Paul Clements
2 siblings, 1 reply; 71+ messages in thread
From: Lars Marowsky-Bree @ 2005-03-14 9:44 UTC (permalink / raw)
To: Neil Brown, Paul Clements; +Cc: linux-raid

On 2005-03-14T15:43:52, Neil Brown <neilb@cse.unsw.edu.au> wrote:

Hi there, just a question about how the bitmap stuff works with
1++-redundancy, say RAID1 with 2 mirrors, or RAID6.

One disk fails and is replaced/reattached, and resync begins. Now
another disk fails and is replaced. Is the bitmap local to each disk?

And in case of RAID1, with 4 disks (and two of them resyncing), could
disk3 be rebuilt from disk1 and disk4 from disk2 (so as to optimize disk
bandwidth)?

Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-14 9:44 ` Lars Marowsky-Bree
@ 2005-03-14 10:22 ` Neil Brown
2005-03-14 11:24 ` Lars Marowsky-Bree
0 siblings, 1 reply; 71+ messages in thread
From: Neil Brown @ 2005-03-14 10:22 UTC (permalink / raw)
To: Lars Marowsky-Bree; +Cc: Paul Clements, linux-raid

On Monday March 14, lmb@suse.de wrote:
> On 2005-03-14T15:43:52, Neil Brown <neilb@cse.unsw.edu.au> wrote:
>
> Hi there, just a question about how the bitmap stuff works with
> 1++-redundancy, say RAID1 with 2 mirrors, or RAID6.

I assume you mean RAID1 with 3 drives (there isn't really one main
drive and all the others are mirrors - all drives are nearly equal).

We haven't put any significant work into bitmap intent logging for
levels other than raid1, so some of the answer may be pure theory.

> One disk fails and is replaced/reattached, and resync begins. Now
> another disk fails and is replaced. Is the bitmap local to each
> disk?

Bitmap is for the whole array, not per-disk.
If there are any failed drives, bits are not cleared from the bitmap.
If a drive fails then any active resync is aborted and restarted
(possibly this is not optimal...).

If a failed disk is re-attached, then only the blocks changed since
the array was known-good are resynced. If a new drive is added, all
blocks are synced.

> And in case of RAID1, with 4 disks (and two of them resyncing), could
> disk3 be rebuilt from disk1 and disk4 from disk2 (so as to optimize disk
> bandwidth)?

If two are resyncing, they will resync in step with each other so
there is no point in reading from disk2 as well as disk1. Just read
from disk 1 and write to disks 3 and 4.

Does that answer your questions?

NeilBrown
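(A rough sketch of that resync decision as a plain C function -- the structure,
field names and the events_cleared comparison are simplifications assumed
purely for illustration, not md's actual data structures:

#include <stdbool.h>
#include <stdint.h>

struct member {
	bool     seen_before;     /* re-attached old member vs. brand-new drive    */
	uint64_t events_at_fail;  /* array event count when the member dropped out */
};

enum resync_kind { RESYNC_FULL, RESYNC_BITMAP };

/* A returning member that the bitmap has covered since it failed only needs
 * the chunks marked dirty; anything else gets a full resync. */
enum resync_kind choose_resync(const struct member *m, uint64_t events_cleared)
{
	if (m->seen_before && m->events_at_fail >= events_cleared)
		return RESYNC_BITMAP;
	return RESYNC_FULL;
}

The real decision also depends on superblock state, but the shape is the same:
a known, bitmap-tracked member gets the cheap partial resync, everything else
gets a full one.)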
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-14 10:22 ` Neil Brown
@ 2005-03-14 11:24 ` Lars Marowsky-Bree
2005-03-14 22:54 ` Neil Brown
0 siblings, 1 reply; 71+ messages in thread
From: Lars Marowsky-Bree @ 2005-03-14 11:24 UTC (permalink / raw)
To: Neil Brown; +Cc: Paul Clements, linux-raid

On 2005-03-14T21:22:57, Neil Brown <neilb@cse.unsw.edu.au> wrote:

> > Hi there, just a question about how the bitmap stuff works with
> > 1++-redundancy, say RAID1 with 2 mirrors, or RAID6.
> I assume you mean RAID1 with 3 drives (there isn't really one main
> drive and all the others are mirrors - all drives are nearly equal).

Yeah, that's what I meant.

(BTW, if they are all equal, how do you figure out where to sync from?
Isn't the "first" one also the first one to receive the writes, so
unless it's somehow identified as bad, it's the one which will have the
"best" data?)

> We haven't put any significant work into bitmap intent logging for
> levels other than raid1, so some of the answer may be pure theory.

OK.

(Though in particular for raid5 with the expensive parity and raid6 with
the even more expensive parity this seems desirable.)

> > One disk fails and is replaced/reattached, and resync begins. Now
> > another disk fails and is replaced. Is the bitmap local to each
> > disk?
> Bitmap is for the whole array, not per-disk.
> If there are any failed drives, bits are not cleared from the bitmap.
> If a drive fails then any active resync is aborted and restarted
> (possibly this is not optimal...).
>
> If a failed disk is re-attached, then only the blocks changed since
> the array was known-good are resynced. If a new drive is added, all
> blocks are synced.

I think each disk needs to have its own bitmap in the long run. On
start, we need to merge them.

I'm thinking about network replication here, where it needs to be
figured out which mirror has the 'good' data, but with a little bit of
construction, the problem also arises for a single node: Consider that
we crash and come back with just one side of the mirror, and then we
crash again and come back with the other side. When both are restored,
the bitmaps need to be merged so that we can create one coherent image
from either/or.

Admittedly, for a single node, this is ... uhm, well, pretty much
constructed and a good time to get out the backup tapes ;-)

> > And in case of RAID1, with 4 disks (and two of them resyncing), could
> > disk3 be rebuilt from disk1 and disk4 from disk2 (so as to optimize disk
> > bandwidth)?
> If two are resyncing, they will resync in step with each other so
> there is no point in reading from disk2 as well as disk1. Just read
> from disk 1 and write to disks 3 and 4.

Yes, I guess that makes perfect sense with a global bitmap.

> Does that answer your questions?

Yes, it helps me to better understand where we are right now...

Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-14 11:24 ` Lars Marowsky-Bree
@ 2005-03-14 22:54 ` Neil Brown
2005-03-18 10:33 ` Lars Marowsky-Bree
0 siblings, 1 reply; 71+ messages in thread
From: Neil Brown @ 2005-03-14 22:54 UTC (permalink / raw)
To: Lars Marowsky-Bree; +Cc: Paul Clements, linux-raid

On Monday March 14, lmb@suse.de wrote:
> On 2005-03-14T21:22:57, Neil Brown <neilb@cse.unsw.edu.au> wrote:
>
> > > Hi there, just a question about how the bitmap stuff works with
> > > 1++-redundancy, say RAID1 with 2 mirrors, or RAID6.
> > I assume you mean RAID1 with 3 drives (there isn't really one main
> > drive and all the others are mirrors - all drives are nearly equal).
>
> Yeah, that's what I meant.
>
> (BTW, if they are all equal, how do you figure out where to sync
> from?

It arbitrarily chooses one. It doesn't matter which. The code currently
happens to choose the first, but this is not a significant choice.

> Isn't the "first" one also the first one to receive the writes, so
> unless it's somehow identified as bad, it's the one which will have the
> "best" data?)

Data is written to all drives in parallel (the request to the first
might be launched slightly before the second, but the difference is
insignificant compared to the time it takes for the write to complete).

There is no such thing as "best" data. Consider the situation where you
want to make a transactional update to a file that requires writing two
blocks. If the system dies while writing the first, the "before" data is
better. If it dies while writing the second, the "after" data is better.

> > We haven't put any significant work into bitmap intent logging for
> > levels other than raid1, so some of the answer may be pure theory.
>
> OK.
>
> (Though in particular for raid5 with the expensive parity and raid6 with
> the even more expensive parity this seems desirable.)

Yes. We will get there. We just aren't there yet so I cannot say with
confidence how it will work.

> I think each disk needs to have its own bitmap in the long run. On
> start, we need to merge them.

I think any scheme that involved multiple bitmaps would be introducing
too much complexity. Certainly your examples sound very far fetched
(as I think you admitted yourself). But I always try to be open to
new ideas.

NeilBrown
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-14 22:54 ` Neil Brown
@ 2005-03-18 10:33 ` Lars Marowsky-Bree
2005-03-18 12:52 ` Peter T. Breuer
0 siblings, 1 reply; 71+ messages in thread
From: Lars Marowsky-Bree @ 2005-03-18 10:33 UTC (permalink / raw)
To: Neil Brown; +Cc: Paul Clements, linux-raid

On 2005-03-15T09:54:52, Neil Brown <neilb@cse.unsw.edu.au> wrote:

> It arbitrarily chooses one. It doesn't matter which. The code
> currently happens to choose the first, but this is not a significant choice.

True enough. I had a typical case of tomatoes. Thanks.

> > I think each disk needs to have its own bitmap in the long run. On
> > start, we need to merge them.
>
> I think any scheme that involved multiple bitmaps would be introducing
> too much complexity. Certainly your examples sound very far fetched
> (as I think you admitted yourself). But I always try to be open to
> new ideas.

For single node operations, yes. But disks appearing and reappearing is
_mostly_ a cluster issue, and there it makes sense, because of the
potentially diverging data in case both sides activate the mirrors.

Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 10:33 ` Lars Marowsky-Bree
@ 2005-03-18 12:52 ` Peter T. Breuer
2005-03-18 13:42 ` Lars Marowsky-Bree
0 siblings, 1 reply; 71+ messages in thread
From: Peter T. Breuer @ 2005-03-18 12:52 UTC (permalink / raw)
To: linux-raid

Lars Marowsky-Bree <lmb@suse.de> wrote:
> On 2005-03-15T09:54:52, Neil Brown <neilb@cse.unsw.edu.au> wrote:
> > I think any scheme that involved multiple bitmaps would be introducing
> > too much complexity. Certainly your examples sound very far fetched
> > (as I think you admitted yourself). But I always try to be open to
> > new ideas.
>
> For single node operations, yes. But disks appearing and reappearing is
> _mostly_ a cluster issue, and there it makes sense, because of the
> potentially diverging data in case both sides activate the mirrors.

Didn't we go through this once before, Lars :-). The bitmap is always
pessimistic, so splitting it would only be aimed at making it less so,
by dint of giving it a sharper focus on what's really what. The question
is how much one gains and how much one loses.

When I originally did FR1 0.1, I had multiple bitmaps, and one of the
first things Neil suggested was combining them - and magically, all the
code simplified away into nice little understandable trivialities. So I
think it's a good idea from the maintenance point of view!

What would one lose? Does it matter if we resync more than needed, when
that is already 100 times less than we would have resynced without a
bitmap?

(proviso - I didn't read the post where you set out the error
situations, but surely, on theoretical grounds, all that can happen is
that the bitmap causes more to be synced than need be synced).

Peter
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 12:52 ` Peter T. Breuer
@ 2005-03-18 13:42 ` Lars Marowsky-Bree
2005-03-18 14:50 ` Peter T. Breuer
2005-03-18 17:16 ` Luca Berra
0 siblings, 2 replies; 71+ messages in thread
From: Lars Marowsky-Bree @ 2005-03-18 13:42 UTC (permalink / raw)
To: Peter T. Breuer, linux-raid

On 2005-03-18T13:52:54, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote:

> (proviso - I didn't read the post where you set out the error
> situations, but surely, on theoretical grounds, all that can happen is
> that the bitmap causes more to be synced than need be synced).

You missed the point.

The problem is for multi-nodes, both sides have their own bitmap. When a
split scenario occurs, and both sides begin modifying the data, that
bitmap needs to be merged before resync, or else we risk 'forgetting'
that one side dirtied a block.

This scenario _could_ occur for single nodes, but less likely so.

That can either happen outside md (if one is careful in the wrappers
around setting up network replication), or it could happen inside
generic md (if each mirror had its own local bitmap).

Which also has the advantage of providing inherent redundancy for the
bitmap itself, BTW.

Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 13:42 ` Lars Marowsky-Bree
@ 2005-03-18 14:50 ` Peter T. Breuer
2005-03-18 17:03 ` Paul Clements
0 siblings, 1 reply; 71+ messages in thread
From: Peter T. Breuer @ 2005-03-18 14:50 UTC (permalink / raw)
To: linux-raid

Lars Marowsky-Bree <lmb@suse.de> wrote:
> On 2005-03-18T13:52:54, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote:
>
> > (proviso - I didn't read the post where you set out the error
> > situations, but surely, on theoretical grounds, all that can happen is
> > that the bitmap causes more to be synced than need be synced).
>
> You missed the point.

Probably!

> The problem is for multi-nodes, both sides have their own bitmap. When a

Wait - I'm unaware of what that should mean exactly. What level is your
clustering done at? The above makes it sound as though it's kernel
level, so that somehow the raid "device" exists on two nodes at once,
and everything written anywhere gets written everywhere.

Well, could it be less than kernel level? ... Yes, if we are talking
about a single (distributed) application, and it writes to "two places
at once on the net" every time it writes anything. Maybe a distributed
database. 'K.

> split scenario occurs,

Here I think you mean that both nodes go their independent ways, due to
somebody tripping over the network cables, or whatever.

> and both sides begin modifying the data, that
> bitmap needs to be merged before resync, or else we risk 'forgetting'
> that one side dirtied a block.

Hmm. I suppose your application is writing two places at once - it won't
presumably get/pass back an ack until both are done. It sounds like it
is implementing raid1 at application level itself.

But I really am struggling to grasp the context! If you have a sort of
networked device that covers several real physical devices on separate
nodes, then how is raid involved? Surely each node won't have a raid
device (and if it does, it's its own business)? Isn't the network device
supposed to fix up what happens so that nobody is any the wiser?

> This scenario _could_ occur for single nodes, but less likely so.
>
> That can either happen outside md (if one is careful in the wrappers
> around setting up network replication), or it could happen inside
> generic md (if each mirror had its own local bitmap).

Could you set out the scenario very exactly, please, for those of us at
the back of the class :-). I simply don't see it. I'm not saying it's
not there to be seen, but that I have been unable to build a mental
image of the situation from the description :(.

> Which also has the advantage of providing inherent redundancy for the
> bitmap itself, BTW.

Now - in very general terms - if you are worrying about getting
different bitmaps on different nodes, I'd have to ask if it matters?
Doesn't each node then get written to at wherever its bitmap indicates
it still has to be written to?

Is the problem in trying to decide where to read the data from? o( - is
that it? That the problem is the problem of knowing who has the latest
information, that we may read from it when doing what the various
bitmaps say has to be done?

Peter
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 14:50 ` Peter T. Breuer
@ 2005-03-18 17:03 ` Paul Clements
2005-03-18 18:43 ` Peter T. Breuer
0 siblings, 1 reply; 71+ messages in thread
From: Paul Clements @ 2005-03-18 17:03 UTC (permalink / raw)
To: linux-raid

Peter T. Breuer wrote:
> Lars Marowsky-Bree <lmb@suse.de> wrote:
>
>>On 2005-03-18T13:52:54, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote:

>>The problem is for multi-nodes, both sides have their own bitmap. When a
>>split scenario occurs,

> Here I think you mean that both nodes go their independent ways, due to
> somebody tripping over the network cables, or whatever.

Yes, or just a failover, where the active system crashes or is taken
down (while the bitmap is dirty).

>>and both sides begin modifying the data, that
>>bitmap needs to be merged before resync, or else we risk 'forgetting'
>>that one side dirtied a block.

Right. And eventually we'd like to have that capability (of recombining
bitmaps).

> Could you set out the scenario very exactly, please, for those of us at
> the back of the class :-). I simply don't see it. I'm not saying it's
> not there to be seen, but that I have been unable to build a mental
> image of the situation from the description :(.

Typically, in a cluster environment, you set up a raid1 with a local
disk and an nbd (or one of its variants) below it:

      [raid1]
      /     \
  [disk]   [nbd] ---------> other system

The situation he's talking about is, as you put it "somebody tripping
over the network cables".

In that case, you'll end up with this:

    system A               system B

     [raid1]                [raid1]
     /     \                /     \
 [disk]   [XXX]         [disk]   [XXX]

Where there's a degraded raid1 writing only to the local disk on each
system (and a dirty bitmap on both sides).

The solution is to combine the bitmaps and resync in one direction or
the other. Otherwise, you've got to do a full resync...

--
Paul
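(A minimal sketch of what combining the bitmaps could look like -- illustrative
only, not md code, and the function and parameter names are invented here. A
chunk must be resynced if either side dirtied it, so the merge is a bitwise OR;
picking the resync direction, i.e. the "winner", is still a separate decision:

#include <stddef.h>
#include <stdint.h>

/* OR system B's bitmap into system A's copy: afterwards "merged" has a bit
 * set for every chunk that either side wrote while they were split. */
void merge_bitmaps(uint8_t *merged, const uint8_t *other, size_t bytes)
{
	for (size_t i = 0; i < bytes; i++)
		merged[i] |= other[i];
}

The resync pass would then walk only the set bits of the merged map.)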
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 17:03 ` Paul Clements
@ 2005-03-18 18:43 ` Peter T. Breuer
2005-03-18 19:01 ` Mario Holbe
2005-03-18 19:43 ` Paul Clements
0 siblings, 2 replies; 71+ messages in thread
From: Peter T. Breuer @ 2005-03-18 18:43 UTC (permalink / raw)
To: linux-raid

Paul Clements <paul.clements@steeleye.com> wrote:
> [ptb]
> > Could you set out the scenario very exactly, please, for those of us at
> > the back of the class :-). I simply don't see it. I'm not saying it's
> > not there to be seen, but that I have been unable to build a mental
> > image of the situation from the description :(.
>
> Typically, in a cluster environment, you set up a raid1 with a local
> disk and an nbd (or one of its variants) below it:    system A
>
>       [raid1]
>       /     \
>   [disk]   [nbd] ---------> other system

Alright. That's just raid with one nbd device as well as a local device
in the mirror. On failover from this node we will serve directly from
the remote source instead.

> The situation he's talking about is, as you put it "somebody tripping
> over the network cables".
>
> In that case, you'll end up with this:
>
>     system A               system B
>
>      [raid1]                [raid1]
>      /     \                /     \
>  [disk]   [XXX]         [disk]   [XXX]

Well, that is not what I think you should end up with. You should end up
(according to me) with the floating IP moving to the other system in
degraded raid mode:

      system B

       [raid1]
       /     \
    disk    missing

and system A has died - that's what triggered the failover, usually.
And I believe the initial situation was:

     system A              system B

      [raid1]           .--- nbd
      /     \           |      |
   [disk]  [nbd] -------'   [disk]

You are suggesting a failure mode in which A does not die, but B thinks
it does, and takes the floating IP address. Well, sorry, that's tough,
but the IP is where the IP address is no matter what A may believe. No
writes will go to A.

What seems to be the idea is that the failover mechanism has fouled up -
well, that's not a concern of md. If the failover mechanism does that
it's not right. The failover should tell A to shutdown (if it hasn't
already) and tell B to start serving. Is the problem a race condition?
One would want to hold off or even reject writes during the seconds of
transition.

> Where there's a degraded raid1 writing only to the local disk on each
> system (and a dirty bitmap on both sides).

This situation is explicitly disallowed by failover designs. The
failover mechanism will direct the reconfiguration so that this does not
happen. I don't even see exactly how it _can_ happen. I'm happy to
consider it, but I don't see how it can arise, since failover mechanisms
do exactly their thing in not permitting it.

> The solution is to combine the bitmaps and resync in one direction or
> the other. Otherwise, you've got to do a full resync...

I don't see that this solves anything. If you had both sides going at
once, receiving different writes, then you are sc&**ed, and no
resolution of bitmaps will help you, since both sides have received
different (legitimate) data. It doesn't seem relevant to me to consider
if they are equally up to date wrt the writes they have received. They
will be in the wrong even if they are up to date.

OK - maybe the problem is in the race between sending the writes across
to system B, and shutting down A, and starting serving from B.

This is the intended sequence:

  1 A sends writes to B
  2 A dies
  3 failover blocks writes
  4 failover moves IP address to B
  5 B drops nbd server
  6 B starts serving directly from a degraded raid, recording in bitmap
  7 failover starts passing writes to B

I can vaguely imagine some of the writes from (1) still being buffered
on B, waiting to be written to B's disk, somewhere about the (6) point.
Is that a problem? I don't see that it is. The kernel will have them in
its buffers. Applications will see them.

What about when A comes back up? We then get a

                .--------------.
     system A   |    system B  |
                |              |
      nbd  -----'     [raid1]  |
       |              /     \  |
    [disk]        [disk]  [nbd]-'

situation, and a resync is done (skipping clean sectors). So I don't see
where these "two" bitmaps are.

Peter
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 18:43 ` Peter T. Breuer
@ 2005-03-18 19:01 ` Mario Holbe
2005-03-18 19:33 ` Peter T. Breuer
0 siblings, 1 reply; 71+ messages in thread
From: Mario Holbe @ 2005-03-18 19:01 UTC (permalink / raw)
To: linux-raid

Peter T. Breuer <ptb@lab.it.uc3m.es> wrote:
> Paul Clements <paul.clements@steeleye.com> wrote:
>> The solution is to combine the bitmaps and resync in one direction or
>> the other. Otherwise, you've got to do a full resync...
> I don't see that this solves anything. If you had both sides going at
> once, receiving different writes, then you are sc&**ed, and no
> resolution of bitmaps will help you, since both sides have received

Paul's way will.

> different (legitimate) data. It doesn't seem relevant to me to consider
> if they are equally up to date wrt the writes they have received. They
> will be in the wrong even if they are up to date.

The goal is to have two equal mirrors. Of course, one has to decide
which of the two mirrors has the "main" (the surviving) data. The simple
(past) way is to just fail one of the mirrors and add it again.

To achieve the same result with bitmaps, you have to combine both
bitmaps: Since both systems did writes to their respective mirror, both
mirrors differ in exactly all positions where one of the two systems did
a write (and thus marked the bitmap). So, to get both mirrors back in
sync, you have to consider both bitmaps, i.e. combine them.

regards
   Mario
--
I heard, if you play a NT-CD backwards, you get satanic messages...
That's nothing. If you play it forwards, it installs NT.
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 19:01 ` Mario Holbe
@ 2005-03-18 19:33 ` Peter T. Breuer
2005-03-18 20:24 ` Mario Holbe
0 siblings, 1 reply; 71+ messages in thread
From: Peter T. Breuer @ 2005-03-18 19:33 UTC (permalink / raw)
To: linux-raid

Mario Holbe <Mario.Holbe@tu-ilmenau.de> wrote:
> Peter T. Breuer <ptb@lab.it.uc3m.es> wrote:
> > different (legitimate) data. It doesn't seem relevant to me to consider
> > if they are equally up to date wrt the writes they have received. They
> > will be in the wrong even if they are up to date.
>
> The goal is to have two equal mirrors. Of course, one has to decide
> which of the two mirrors has the "main" (the surviving) data. The
> simple (past) way is to just fail one of the mirrors and add it
> again.
> To achieve the same result with bitmaps, you have to combine both
> bitmaps: Since both systems did writes to their respective mirror,
> both mirrors differ in exactly all positions where one of the two
> systems did a write (and thus marked the bitmap). So, to get both
> mirrors back in sync, you have to consider both bitmaps, i.e. combine
> them.

Both have different data. If you have mapped all the writes each have
ever done since they separated, all you know is (within) where they have
written that different data. You don't know who has the right data.

Yes, you can "sync" them by writing any one of the two mirrors to the
other one, and need do so only on the union of the mapped data areas,
but it won't help you get the right data (i.e. "yes"). The problem was
in letting them both write to their respective disks in the first place.

Peter
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 19:33 ` Peter T. Breuer
@ 2005-03-18 20:24 ` Mario Holbe
2005-03-18 21:01 ` Andy Smith
2005-03-19 11:43 ` Peter T. Breuer
0 siblings, 2 replies; 71+ messages in thread
From: Mario Holbe @ 2005-03-18 20:24 UTC (permalink / raw)
To: linux-raid

Peter T. Breuer <ptb@lab.it.uc3m.es> wrote:
> Yes, you can "sync" them by writing any one of the two mirrors to the
> other one, and need do so only on the union of the mapped data areas,

As far as I understand the issue, this is exactly what should be
possible.

> but it won't help you get the right data (i.e. "yes").

There is no such thing as "the right data" from a block device's point
of view. Both mirrors have "right data", since both got written
independently. Thus, somebody has to choose one mirror as being the
"more right" one. This, of course, is in the administrator's hands.
However, if somebody did so, exactly the sync you described above
(union of the mapped data areas) must happen.

regards
   Mario
--
> As Luke Leighton said once on samba-ntdom, "now, what was that about
> rebooting? that was so long ago, i had to look it up with man -k."
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 20:24 ` Mario Holbe
@ 2005-03-18 21:01 ` Andy Smith
0 siblings, 0 replies; 71+ messages in thread
From: Andy Smith @ 2005-03-18 21:01 UTC (permalink / raw)
To: linux-raid

[-- Attachment #1: Type: text/plain, Size: 738 bytes --]

On Fri, Mar 18, 2005 at 09:24:05PM +0100, Mario Holbe wrote:
> There is no such thing as "the right data" from a block device's point
> of view. Both mirrors have "right data", since both got written
> independently. Thus, somebody has to choose one mirror as being the
> "more right" one. This, of course, is in the administrator's hands.
> However, if somebody did so, exactly the sync you described above
> (union of the mapped data areas) must happen.

When would it ever be acceptable to throw away the "losing" node's data?

Seems like in the majority of cases it would be better to focus on never
getting into the position where two nodes in a cluster end up with
inconsistent but legitimate data. Unless I am missing something here...

[-- Attachment #2: Type: application/pgp-signature, Size: 187 bytes --]
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 20:24 ` Mario Holbe
2005-03-18 21:01 ` Andy Smith
@ 2005-03-19 11:43 ` Peter T. Breuer
2005-03-19 12:58 ` Lars Marowsky-Bree
1 sibling, 1 reply; 71+ messages in thread
From: Peter T. Breuer @ 2005-03-19 11:43 UTC (permalink / raw)
To: linux-raid

Mario Holbe <Mario.Holbe@tu-ilmenau.de> wrote:
> Peter T. Breuer <ptb@lab.it.uc3m.es> wrote:
> > Yes, you can "sync" them by writing any one of the two mirrors to the
> > other one, and need do so only on the union of the mapped data areas,
>
> As far as I understand the issue, this is exactly what should be
> possible.
>
> > but it won't help you get the right data (i.e. "yes").
>
> There is no such thing as "the right data" from a block device's
> point of view.

Well, there is the "right data" from our point of view, and it is what
should be on (one/both?) device by now. One doesn't get to recover that
"right data" by copying one disk over another, however efficiently one
does it.

> Both mirrors have "right data", since both got written

Well, neither do, more accurately! Not in toto, nor necessarily block by
block.

> independently. Thus, somebody has to choose one mirror as being the
> "more right" one. This, of course, is in the administrator's hands.

But neither mirror is necessarily right. We are already in a bad
situation. There is no good way out. You can merely choose which of the
two data possibilities you want for each block. They're not necessarily
either of them "right", but one of them may be, but which one we don't
know.

> However, if somebody did so, exactly the sync you described above
> (union of the mapped data areas) must happen.

Why should one think that copying all of one disk to the other (morally)
gets one data that is more right than copying some of it? Nothing one
can do at this point will help.

Peter
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-19 11:43 ` Peter T. Breuer
@ 2005-03-19 12:58 ` Lars Marowsky-Bree
2005-03-19 13:27 ` Peter T. Breuer
0 siblings, 1 reply; 71+ messages in thread
From: Lars Marowsky-Bree @ 2005-03-19 12:58 UTC (permalink / raw)
To: Peter T. Breuer, linux-raid

On 2005-03-19T12:43:41, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote:

> Well, there is the "right data" from our point of view, and it is what
> should be on (one/both?) device by now. One doesn't get to recover that
> "right data" by copying one disk over another, however efficiently one
> does it.

It's about conflict resolution and recovery after a split-brain and
concurrent service activation has occurred.

Read up on that here:
http://www.linux-mag.com/2003-11/availability_01.html (see the blob
about split-brain with drbd).

It all depends on the kind of guarantees you need.

> But neither mirror is necessarily right. We are already in a bad
> situation. There is no good way out. You can merely choose which of
> the two data possibilities you want for each block. They're not
> necessarily either of them "right", but one of them may be, but which
> one we don't know.

It's quite clear that you won't get a consistent state of the system by
mixing blocks from either side; you need to declare one the 'winner',
throwing out the modifications on the other side (probably after having
them saved manually, and then re-entering them later). For some
scenarios, this is acceptable.

> Why should one think that copying all of one disk to the other (morally)
> gets one data that is more right than copying some of it? Nothing one
> can do at this point will help.

It's not a moral problem. It is about regaining consistency.

Which one of the datasets you choose you could either arbitrate via some
automatic mechanisms (drbd-0.8 has a couple) or let a human decide.

The default with drbd-0.7 is that they will detect this situation has
occurred and refuse to start replication unless the admin intervenes and
decides which side wins.

Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-19 12:58 ` Lars Marowsky-Bree
@ 2005-03-19 13:27 ` Peter T. Breuer
2005-03-19 14:07 ` Lars Marowsky-Bree
0 siblings, 1 reply; 71+ messages in thread
From: Peter T. Breuer @ 2005-03-19 13:27 UTC (permalink / raw)
To: linux-raid

Lars Marowsky-Bree <lmb@suse.de> wrote:
> On 2005-03-19T12:43:41, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote:
>
> > Well, there is the "right data" from our point of view, and it is what
> > should be on (one/both?) device by now. One doesn't get to recover that
> > "right data" by copying one disk over another, however efficiently one
> > does it.
>
> It's about conflict resolution and recovery after a split-brain and
> concurrent service activation has occurred.

It surely doesn't matter what words one uses, Lars, the semantics does
not change? If you have different stuff in different places, then
copying one over the other is only one way of "resolving the conflict",
and resolve it it will, but help it won't necessarily. Why should the
kind of copy you propose be better than another kind of copy?

> Read up on that here:
> http://www.linux-mag.com/2003-11/availability_01.html (see the blob
> about split-brain with drbd).

I didn't see anything that looked relevant :(. Sure that's the right
reference? It's a pretty document but I didn't see any detail.

    As mentioned earlier, DRBD is a disk replication package that makes
    sure every block written on the primary disk gets copied to the
    secondary disk. From DRBD's perspective, it simply mirrors data from
    one machine to another, and switches which machine is primary on
    command. From Heartbeat's perspective, DRBD is just another resource
    (called datadisk) that Heartbeat directs to start or stop (become
    pri ...

Clicking on the glyph with a box in it with the word "DRBD" in (figure
two?) just gets a bigger image of the figure.

> It all depends on the kind of guarantees you need.

Indeed - and I haven't read any! If you want the disks to be
self-consistent, you can just do "no copying" :-). But in any case I
haven't seen anyone explain how the disks can get into a state where
both sides have written to them ...

OK - this is my best guess from the evidence so far .. you left a
journal behind on system A when it crashed, and you accidentally brought
up its FS before starting to sync it from B. So you accidentally got A
written to some MORE before the resync started, so you need to write
some MORE than would normally be necessary to undo the nasties.

Well, "Don't Do That Then" (tm). Don't bring up the FS on A before
starting the resync from B. Do make sure to always write the whole
journal from B across to A in a resync. Or don't use a journal (tm :-).

Another approach is to have the journal on the mirror. Crazy as it
sounds (for i/o especially), this means that B will have a "more
evolved" form of the journal than A, and copying B to A will _always_ be
right, in that it will correct the journal on A and bring it up to date
with the journal on B. No extra mapping required, I believe (not having
had my morning tequila).

> > But neither mirror is necessarily right. We are already in a bad
> > situation. There is no good way out. You can merely choose which of
> > the two data possibilities you want for each block. They're not
> > necessarily either of them "right", but one of them may be, but which
> > one we don't know.
>
> It's quite clear that you won't get a consistent state of the system by
> mixing blocks from either side; you need to declare one the 'winner',
> throwing out the modifications on the other side (probably after having
> them saved manually, and then re-entering them later). For some
> scenarios, this is acceptable.

OK - I agree. But one can do better, if the problem is what I guessed at
above (journal left behind that does its replay too late and when it's
not wanted). Moreover, I really do not agree that one should ever be in
this situation. Having got in it, yes, you can choose a winning side and
copy it.

> > Why should one think that copying all of one disk to the other (morally)
> > gets one data that is more right than copying some of it? Nothing one
> > can do at this point will help.
>
> It's not a moral problem. It is about regaining consistency.

Well, morality is about what it is good to do. I agree that you get a
consistent result this way.

> Which one of the datasets you choose you could either arbitrate via some
> automatic mechanisms (drbd-0.8 has a couple) or let a human decide.

But how on earth can you get into this situation? It still is not clear
to me, and it seems to me that there is a horrible flaw in the managing
algorithm for the failover if it can happen, and one should fix it.

> The default with drbd-0.7 is that they will detect this situation has
> occurred and refuse to start replication unless the admin intervenes and
> decides which side wins.

Hmm. I don't believe it can detect it reliably. It is always possible
for both sides to have written different data in the same places, etc.

Peter
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-19 13:27 ` Peter T. Breuer
@ 2005-03-19 14:07 ` Lars Marowsky-Bree
2005-03-19 15:06 ` Peter T. Breuer
0 siblings, 1 reply; 71+ messages in thread
From: Lars Marowsky-Bree @ 2005-03-19 14:07 UTC (permalink / raw)
To: Peter T. Breuer, linux-raid

On 2005-03-19T14:27:45, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote:

> > Which one of the datasets you choose you could either arbitrate via some
> > automatic mechanisms (drbd-0.8 has a couple) or let a human decide.
> But how on earth can you get into this situation? It still is not clear
> to me, and it seems to me that there is a horrible flaw in the managing
> algorithm for the failover if it can happen, and one should fix it.

You mean, like an admin screwup which should never happen? ;-)

Remember what RAID is about: About errors which _should not_ occur (if
the world was perfect and software and hardware never failed); but
which, with a given probability, _do_ occur anyway, because the real
world doesn't always do the right thing.

It's futile to argue that it should never occur; moral arguments don't
change reality.

Split-brain is a well studied subject, and while many prevention
strategies exist, errors occur even in these algorithms; and there's
always a trade-off: For some scenarios, they might choose a very low
probability of split-brain occurring in exchange for a higher guarantee
that service will 'always' be provided. It all depends on the kind of
data and service, the requirements and the cost associated with it.

> > The default with drbd-0.7 is that they will detect this situation has
> > occurred and refuse to start replication unless the admin intervenes and
> > decides which side wins.
> Hmm. I don't believe it can detect it reliably. It is always possible
> for both sides to have written different data in the same places, etc.

drbd can detect this reliably by its generation counters; the one
element which matters here is the one which tracks if the device has
been promoted to primary while being disconnected.

(Each side keeps its own generation counters and its own bitmap &
journal, and during regular operation, they are all sync'ed. So they can
be used to figure out what diverged 'easily' enough.)

If you don't believe something, why don't you go read up ;-)

This also is a reasonably well studied subject; there's bits in "Fault
Tolerance in Distributed Systems" by Jalote, and Philipp Reisner also
has a paper on it online; I think parts of it are also covered by his
thesis.

Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 14:07 ` Lars Marowsky-Bree @ 2005-03-19 15:06 ` Peter T. Breuer 2005-03-19 15:24 ` Mario Holbe 2005-03-19 16:24 ` Lars Marowsky-Bree 0 siblings, 2 replies; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 15:06 UTC (permalink / raw) To: linux-raid Lars Marowsky-Bree <lmb@suse.de> wrote: > On 2005-03-19T14:27:45, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote: > > > > Which one of the datasets you choose you could either arbitate via some > > > automatic mechanisms (drbd-0.8 has a couple) or let a human decide. > > But how on earth can you get into this situation? It still is not clear > > to me, and it seems to me that there is a horrible flaw in the managing > > algorithm for the failover if it can happen, and one should fix it. > > You mean, like an admin screwup which should never happen? ;-) The admin would have had to do it deliberately, while preventing the normal failover from occurring. Mind you, I agree that he CAN do it, and thus has quite a high Murphy-mediated liklihood of doing it. But an admin can do anything, so "shrug" .. he jumped in, let him climb out. > Remember what RAID is about: About errors which _should not_ occur (if > the world was perfect and software and hardware never failed); but which > with a given probability they _do_ occur anyway, because the real world > doesn't always do the right thing. The software here is managing hardware failures by imposing a strict procedure! Are you suggesting that it be also prepared to fix its own procedural failures? Why not simply "fix the software"? Yes, being prepared to act in a robust and sensible fashion under unexpected circumstances is always good, but I simply cannot countenance failover software that is designed to SAVE one from disaster also being envisaged as possibly failing in a way that obtains the situation that it is explicitly intended to avoid - namely causing writes to two disks at the same time. > It's futile to argue about that it should never occur; morale arguments > don't change reality. It does NOT occur within the design, and the design is there precisely to avoid it. Once it has occured, the failover design really has failed :(. OK, it's an interesting situation, and we would like to get out of it neatly if it's practible, but I don't see much sense worrying about it - it's like worrying about giving away too many fouls on half-way, when you are 10-0 down and they're still scoring. The problem is NOT this situation, but how you managed to get into it. > Split-brain is a well studied subject, and while many prevention > strategies exist, errors occur even in these algorithms; Show me how :-(. The algorithm is perfectly simple: block, flush one, stop one, start one, unblock. That was under admin control. If you failover via death, it's even simpler: one dies, start another. Then you get the problems on resync, but I'll happily give the "simple" recipe for that too! > and there's > always a trade-off: Yes. > For some scenarios, they might choose a very low > probability of split-brain occuring in exchange for a higher guarantee > that service will 'always' be provided. It all depends on the kind of > data and service, the requirements and the cost associated with it. Well, that I agree with. And I am in favour of catering for the dumb admin. But if he wants to write both sides, I don't see why we should stop him :). 
> > > The default with drbd-0.7 is that they will detect this situation has > > > occured and refuse to start replication unless the admin intervenes and > > > decides which side wins. > > Hmm. I don't believe it can detect it reliably. It is always possible > > for both sides to have written different data in the ame places, etc. > > drbd can detect this reliably by its generation counters; It doesn't matter what words are used - it can't. If you split the two systems and carry on writing to both, then both "generation counters" will increment in the same way, but you don't have to write the same data to both! > the one > element which matters here is the one which tracks if the device has > been promoted to primary while being disconnected. If both systems get "promoted to primary", they both get the same count. > (Each side keeps its own generation counters and it's own bitmap & > journal, and during regular operation, they are all sync'ed. So they can > be used to figure out what diverged 'easily' enough.) They can't (not in all circumstances - that's just logical). > If you don't believe something, why don't you go read up ;-) Because I am a theorist, so I don't need to read up. It would only either confuse me with irrelevant detail or annoy me for being wrong :). I can tell what can and cannot happen without having to experience it - that's the whole point of theory :-(. (well, you did ask). > This also is a reasonably well studied subject; there's bits in "Fault > Tolerance in Distributed Systems" by Jalote, and Philipp Reisner also > has a paper on it online; I think parts of it are also covered by his > thesis. Quite probably, but all the writings in the world can't change the semantics of the universe :(. Two systems disconnected from each other cannot reliably be told apart without consulting a third "observer" who has been experiencing their actions throughout. You'd have to have them logging to a third node to figure out which is "right" (and which is "left" :-). Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 15:06 ` Peter T. Breuer @ 2005-03-19 15:24 ` Mario Holbe 2005-03-19 15:58 ` Peter T. Breuer 2005-03-19 16:24 ` Lars Marowsky-Bree 1 sibling, 1 reply; 71+ messages in thread From: Mario Holbe @ 2005-03-19 15:24 UTC (permalink / raw) To: linux-raid Peter T. Breuer <ptb@lab.it.uc3m.es> wrote: > Lars Marowsky-Bree <lmb@suse.de> wrote: >> Split-brain is a well studied subject, and while many prevention >> strategies exist, errors occur even in these algorithms; > Show me how :-(. The algorithm is perfectly simple: block, flush one, > stop one, start one, unblock. Sorry, but this is perfectly nonsense. As long as you have no exactly-once semantics in distributed environments, your simple algorithm is simply able to fail. regards Mario -- () Ascii Ribbon Campaign /\ Support plain text e-mail ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 15:24 ` Mario Holbe @ 2005-03-19 15:58 ` Peter T. Breuer 0 siblings, 0 replies; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 15:58 UTC (permalink / raw) To: linux-raid Mario Holbe <Mario.Holbe@tu-ilmenau.de> wrote: > Peter T. Breuer <ptb@lab.it.uc3m.es> wrote: > > Lars Marowsky-Bree <lmb@suse.de> wrote: > >> Split-brain is a well studied subject, and while many prevention > >> strategies exist, errors occur even in these algorithms; > > Show me how :-(. The algorithm is perfectly simple: block, flush one, > > stop one, start one, unblock. > > Sorry, but this is perfectly nonsense. > As long as you have no exactly-once semantics in distributed > environments, your simple algorithm is simply able to fail. You DO have "exactly once" semantics. Writes go to the ONE place that is live. Notice that in my algorithm there are no two live things at once. Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 15:06 ` Peter T. Breuer 2005-03-19 15:24 ` Mario Holbe @ 2005-03-19 16:24 ` Lars Marowsky-Bree 2005-03-19 17:19 ` Peter T. Breuer 2005-03-19 17:44 ` Guy 1 sibling, 2 replies; 71+ messages in thread From: Lars Marowsky-Bree @ 2005-03-19 16:24 UTC (permalink / raw) To: Peter T. Breuer, linux-raid On 2005-03-19T16:06:29, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote: I'm cutting out those parts of the discussion which are irrelevant (or which I don't consider worth pursuing; maybe you'll find someone else to explain with more patience). > > > Hmm. I don't believe it can detect it reliably. It is always possible > > > for both sides to have written different data in the ame places, etc. > > > > drbd can detect this reliably by its generation counters; > > It doesn't matter what words are used - it can't. If you split the two > systems and carry on writing to both, then both "generation counters" > will increment in the same way, but you don't have to write the same > data to both! That flag gets set (as part of the generation counter tuple) on both sides when that happens, and if it is set on both sides, they know something went wrong. And then they can consult their bitmaps to figure out the set of blocks which diverge. (Granted, it's a superset, but typically it'll be a much smaller set than the entire volume.) That part is really simple in theory. > I can tell what can and cannot happen without having to experience it - > that's the whole point of theory :-(. (well, you did ask). Your theory is flawed. > Quite probably, but all the writings in the world can't change the > semantics of the universe :(. Two systems disconnected from each other > cannot reliably be told apart without consulting a third "observer" who > has been experiencing their actions throughout. You'd have to have them > logging to a third node to figure out which is "right" (and which is > "left" :-). Wrong model. They know that at one point in time they've been in sync, and that they have since diverged, and so they can figure out where those differences occur. Sincerely, Lars Marowsky-Brée <lmb@suse.de> -- High Availability & Clustering SUSE Labs, Research and Development SUSE LINUX Products GmbH - A Novell Business - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 16:24 ` Lars Marowsky-Bree @ 2005-03-19 17:19 ` Peter T. Breuer 2005-03-19 17:36 ` Lars Marowsky-Bree 2005-03-19 17:44 ` Guy 1 sibling, 1 reply; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 17:19 UTC (permalink / raw) To: linux-raid Lars Marowsky-Bree <lmb@suse.de> wrote: > On 2005-03-19T16:06:29, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote: > > I'm cutting out those parts of the discussion which are irrelevant (or > which I don't consider worth pursuing; maybe you'll find someone else > to explain with more patience). Probably :-). > > It doesn't matter what words are used - it can't. If you split the two > > systems and carry on writing to both, then both "generation counters" > > will increment in the same way, but you don't have to write the same > > data to both! > > That flag gets said (as part of the generation counter tuple) on both > sides when that happens, and if it is set on both sides, they know > something went wrong. Fine, but ... > And then they can consult their bitmaps to figure out the set of blocks > which diverge. (Granted, it's a superset, but typically it'll be a much > smaller set than the entire volume.) It doesn't matter - Let them both have exactly the same bitmaps, but let different data have been written to each of them in those marked blocks. There's no way of telling which is "good" or "bad", then! It's just a difference of opinion - the universe bifurcated (it does it all the time :) and one disk went into one fork and the other disk into the other fork, and they happily did things for a while, then they got examined again (the universes joined). Hey presto! - two disks with perfectly valid but different histories. Now let their experiences while apart be the same from the point of view of the metadata, but different at the data level, and you have two disks that have no distinguishing characteristics at all that you can see or measure, but they're different. > That part is really simple in theory. Knowing which blocks are/may be different does not help them decide who is _right_. Lars, I am trying to move you "upwards" from the detail of the algorithms to a level at which you can see that there is no algorithm that can decide reliably which of two diverged traces is the more "valid" in some sense. Whatever your algorithm is, let it decide in a given case. Say it says "A" is more valid. It looks at certain characteristics of A and B's bitmaps and maybe other "generation" marks to decide. Fine. Now let A have B's present data at the time of divergence and every time data gets written to A in its diverged univese let it be written with B's present data. The bitmap and the marks will be the same as before! Now apply your decision algorithm. It chooses A again (it has the same data to work on). Unfortunately, that's choosing something that looks just like B! So B was the "more valid"! > > I can tell what can and cannot happen without having to experience it - > > that's the whole point of theory :-(. (well, you did ask). > > Your theory is flawed. :(. Oooh, unfortunately not. > > Quite probably, but all the writings in the world can't change the > > semantics of the universe :(. Two systems disconnected from each other > > cannot reliably be told apart without consulting a third "observer" who > > has been experiencing their actions throughout. You'd have to have them > > logging to a third node to figure out which is "right" (and which is > > "left" :-). > > Wrong model. 
They know that at one point in time they've been in sync, > and that they have since diverged, and so they can figure out where > those differences occur. They can't. That's the point. See above rough hack at a proof. Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 17:19 ` Peter T. Breuer @ 2005-03-19 17:36 ` Lars Marowsky-Bree 0 siblings, 0 replies; 71+ messages in thread From: Lars Marowsky-Bree @ 2005-03-19 17:36 UTC (permalink / raw) To: Peter T. Breuer, linux-raid On 2005-03-19T18:19:44, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote: > > That part is really simple in theory. > > Knowing which blocks are/may be different does not help them decide who is > _right_. > > Lars, I am trying to move you "upwards" from the detail of the > algorithms to a level at which you can see that there is no algorithm > that can decide reliably which of two diverged traces is the more > "valid" in some sense. As I've said, there's a variety of decision algorithms; that already implied there's no one answer, but always a trade-off. And sometimes, the admin might have to make the choice and manually add the changes made to one set to the other. > > Wrong model. They know that at one point in time they've been in sync, > > and that they have since diverged, and so they can figure out where > > those differences occur. > They can't. That's the point. See above rough hack at a proof. You're mixing this up. The mail I replied to said they can't figure out what happened; my point was that they can. Is there a perfect conflict resolution algorithm which preserves all changes and merges them perfectly? Probably not; not at the block layer, in any case. I never claimed that; see above. Please, keep your argument straight. -- High Availability & Clustering SUSE Labs, Research and Development SUSE LINUX Products GmbH - A Novell Business ^ permalink raw reply [flat|nested] 71+ messages in thread
* RE: [PATCH 1/2] md bitmap bug fixes 2005-03-19 16:24 ` Lars Marowsky-Bree 2005-03-19 17:19 ` Peter T. Breuer @ 2005-03-19 17:44 ` Guy 2005-03-19 17:54 ` Lars Marowsky-Bree 2005-03-19 18:11 ` Peter T. Breuer 1 sibling, 2 replies; 71+ messages in thread From: Guy @ 2005-03-19 17:44 UTC (permalink / raw) To: 'Lars Marowsky-Bree', 'Peter T. Breuer', linux-raid You said: "Wrong model. They know that at one point in time they've been in sync, and that they have since diverged, and so they can figure out where those differences occur." I agree, but I don't think a block device can to a re-sync without corrupting both. How do you merge a superset at the block level? AND the 2 blocks and update both? :) I don't think a filesystem would like that. It would be real bad to re-sync if the filesystem is mounted! In the case of a split brain, I think one must be 100% voided, and a full re-sync must be done. Guy -----Original Message----- From: linux-raid-owner@vger.kernel.org [mailto:linux-raid-owner@vger.kernel.org] On Behalf Of Lars Marowsky-Bree Sent: Saturday, March 19, 2005 11:25 AM To: Peter T. Breuer; linux-raid@vger.kernel.org Subject: Re: [PATCH 1/2] md bitmap bug fixes On 2005-03-19T16:06:29, "Peter T. Breuer" <ptb@lab.it.uc3m.es> wrote: I'm cutting out those parts of the discussion which are irrelevant (or which I don't consider worth pursuing; maybe you'll find someone else to explain with more patience). > > > Hmm. I don't believe it can detect it reliably. It is always possible > > > for both sides to have written different data in the ame places, etc. > > > > drbd can detect this reliably by its generation counters; > > It doesn't matter what words are used - it can't. If you split the two > systems and carry on writing to both, then both "generation counters" > will increment in the same way, but you don't have to write the same > data to both! That flag gets said (as part of the generation counter tuple) on both sides when that happens, and if it is set on both sides, they know something went wrong. And then they can consult their bitmaps to figure out the set of blocks which diverge. (Granted, it's a superset, but typically it'll be a much smaller set than the entire volume.) That part is really simple in theory. > I can tell what can and cannot happen without having to experience it - > that's the whole point of theory :-(. (well, you did ask). Your theory is flawed. > Quite probably, but all the writings in the world can't change the > semantics of the universe :(. Two systems disconnected from each other > cannot reliably be told apart without consulting a third "observer" who > has been experiencing their actions throughout. You'd have to have them > logging to a third node to figure out which is "right" (and which is > "left" :-). Wrong model. They know that at one point in time they've been in sync, and that they have since diverged, and so they can figure out where those differences occur. 
Sincerely, Lars Marowsky-Brée <lmb@suse.de> -- High Availability & Clustering SUSE Labs, Research and Development SUSE LINUX Products GmbH - A Novell Business - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 17:44 ` Guy @ 2005-03-19 17:54 ` Lars Marowsky-Bree 2005-03-19 18:05 ` Guy 2005-03-19 20:29 ` berk walker 2005-03-19 18:11 ` Peter T. Breuer 1 sibling, 2 replies; 71+ messages in thread From: Lars Marowsky-Bree @ 2005-03-19 17:54 UTC (permalink / raw) To: Guy, 'Peter T. Breuer', linux-raid On 2005-03-19T12:44:14, Guy <bugzilla@watkins-home.com> wrote: > In the case of a split brain, I think one must be 100% voided, and a full > re-sync must be done. Exactly. And that's where the bitmaps come into play; you can look at what is modified on each side, merge those sets, and copy all affected blocks from one side to the other; afterwards, you'll have a consistent block device again. Sincerely, Lars Marowsky-Brée <lmb@suse.de> -- High Availability & Clustering SUSE Labs, Research and Development SUSE LINUX Products GmbH - A Novell Business - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 71+ messages in thread
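A sketch of the merge step described above, assuming one dirty bit per chunk and byte-aligned bitmaps (the sizes are illustrative, not md's on-disk format): OR the two sides' bitmaps together, then treat every set bit as a chunk that must be copied from the chosen winner.

    #include <stddef.h>

    #define BITMAP_BYTES 4096    /* example: one bit per chunk */

    /* Union of the two dirty maps: a chunk needs copying if either side
     * may have written it while the mirrors were apart. */
    static void merge_bitmaps(unsigned char *merged,
                              const unsigned char *side_a,
                              const unsigned char *side_b)
    {
            for (size_t i = 0; i < BITMAP_BYTES; i++)
                    merged[i] = side_a[i] | side_b[i];
    }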
* RE: [PATCH 1/2] md bitmap bug fixes 2005-03-19 17:54 ` Lars Marowsky-Bree @ 2005-03-19 18:05 ` Guy 2005-03-19 20:29 ` berk walker 1 sibling, 0 replies; 71+ messages in thread From: Guy @ 2005-03-19 18:05 UTC (permalink / raw) To: 'Lars Marowsky-Bree', 'Peter T. Breuer', linux-raid Oh! I never read it like you just said. I have been reading it like copy in both directions based on both bitmaps! What you said below, seems reasonable. Guy -----Original Message----- From: Lars Marowsky-Bree [mailto:lmb@suse.de] Sent: Saturday, March 19, 2005 12:54 PM To: Guy; 'Peter T. Breuer'; linux-raid@vger.kernel.org Subject: Re: [PATCH 1/2] md bitmap bug fixes On 2005-03-19T12:44:14, Guy <bugzilla@watkins-home.com> wrote: > In the case of a split brain, I think one must be 100% voided, and a full > re-sync must be done. Exactly. And that's where the bitmaps come into play; you can look at what is modified on each side, merge those sets, and copy all affected blocks from one side to the other; afterwards, you'll have a consistent block device again. Sincerely, Lars Marowsky-Brée <lmb@suse.de> -- High Availability & Clustering SUSE Labs, Research and Development SUSE LINUX Products GmbH - A Novell Business - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 17:54 ` Lars Marowsky-Bree 2005-03-19 18:05 ` Guy @ 2005-03-19 20:29 ` berk walker 1 sibling, 0 replies; 71+ messages in thread From: berk walker @ 2005-03-19 20:29 UTC (permalink / raw) To: Lars Marowsky-Bree; +Cc: Guy, 'Peter T. Breuer', linux-raid Lars Marowsky-Bree wrote: >On 2005-03-19T12:44:14, Guy <bugzilla@watkins-home.com> wrote: > > > >>In the case of a split brain, I think one must be 100% voided, and a full >>re-sync must be done. >> >> > >Exactly. And that's where the bitmaps come into play; you can look at >what is modified on each side, merge those sets, and copy all affected >blocks from one side to the other; afterwards, you'll have a consistent >block device again. > > >Sincerely, > Lars Marowsky-Brée <lmb@suse.de> > > > And if your whole farm is an Oracle database? - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 17:44 ` Guy 2005-03-19 17:54 ` Lars Marowsky-Bree @ 2005-03-19 18:11 ` Peter T. Breuer 1 sibling, 0 replies; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 18:11 UTC (permalink / raw) To: linux-raid Guy <bugzilla@watkins-home.com> wrote: > I agree, but I don't think a block device can to a re-sync without > corrupting both. How do you merge a superset at the block level? AND the 2 Don't worry - it's just a one-way copy done efficiently (i.e., leaving out all the blocks known to be unmodified both sides). Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
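A sketch of that efficient one-way copy, under the assumption that the merged bitmap holds one bit per fixed-size chunk; pread/pwrite on plain file descriptors stand in for whatever the real resync path uses.

    #include <stdlib.h>
    #include <sys/types.h>
    #include <unistd.h>

    #define CHUNK_SIZE (64 * 1024)    /* assumed chunk size */

    /* Copy only the chunks marked dirty in the merged bitmap from the
     * winning device to the other one; clean chunks are skipped. */
    static int copy_dirty_chunks(int from_fd, int to_fd,
                                 const unsigned char *merged, size_t nr_chunks)
    {
            char *buf = malloc(CHUNK_SIZE);

            if (!buf)
                    return -1;
            for (size_t chunk = 0; chunk < nr_chunks; chunk++) {
                    if (!(merged[chunk / 8] & (1 << (chunk % 8))))
                            continue;                 /* clean on both sides */
                    off_t off = (off_t)chunk * CHUNK_SIZE;
                    if (pread(from_fd, buf, CHUNK_SIZE, off) != CHUNK_SIZE ||
                        pwrite(to_fd, buf, CHUNK_SIZE, off) != CHUNK_SIZE) {
                            free(buf);
                            return -1;
                    }
            }
            free(buf);
            return 0;
    }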
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-18 18:43 ` Peter T. Breuer 2005-03-18 19:01 ` Mario Holbe @ 2005-03-18 19:43 ` Paul Clements 2005-03-19 12:10 ` Peter T. Breuer 1 sibling, 1 reply; 71+ messages in thread From: Paul Clements @ 2005-03-18 19:43 UTC (permalink / raw) To: linux-raid Peter T. Breuer wrote: > Paul Clements <paul.clements@steeleye.com> wrote: > I don't see that this solves anything. If you had both sides going at > once, receiving different writes, then you are sc&**ed, and no > resolution of bitmaps will help you, since both sides have received > different (legitimate) data. It doesn't seem relevant to me to consider You're forgetting that journalling filesystems and databases have to replay their journals or transaction logs when they start up. > What about when A comes back up? We then get a > > .--------------. > system A | system B | > nbd ---' [raid1] | > | / \ | > [disk] [disk] [nbd]-' > > situation, and a resync is done (skipping clean sectors). You're forgetting that there may be some data (uncommitted data) that didn't reach B that is on A's disk (or even vice versa). That is why you've got to retrieve the bitmap that was in use on A and combine it with B's bitmap before you resync from B to A (or do a full resync). -- Paul ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-18 19:43 ` Paul Clements @ 2005-03-19 12:10 ` Peter T. Breuer 2005-03-21 16:07 ` Paul Clements 0 siblings, 1 reply; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 12:10 UTC (permalink / raw) To: linux-raid Paul Clements <paul.clements@steeleye.com> wrote: > Peter T. Breuer wrote: > > I don't see that this solves anything. If you had both sides going at > > once, receiving different writes, then you are sc&**ed, and no > > resolution of bitmaps will help you, since both sides have received > > different (legitimate) data. It doesn't seem relevant to me to consider > > You're forgetting that journalling filesystems and databases have to > replay their journals or transaction logs when they start up. Where are the journals located? Offhand I don't see that it makes a difference _how_ the data gets to the disks (i.e., via journal or not via journal) but it may do - I reserve judgement :-) -, and it may certainly affect the timings. Can you also pin this down for me in the same excellent way you did with the diagrams of the failover situation? > > What about when A comes back up? We then get a > > > > .--------------. > > system A | system B | > > nbd ---' [raid1] | > > | / \ | > > [disk] [disk] [nbd]-' > > > > situation, and a resync is done (skipping clean sectors). > > You're forgetting that there may be some data (uncommitted data) that > didn't reach B that is on A's disk (or even vice versa). You are saying that the journal on A (presumably not raided itself?) is waiting to play some data into its own disk as soon as we have finished resyncing it from B? I don't think that would be a good idea at all. I'm just not clear on what the setup is, but in the abstract I can't see that having a data journal is at all good - having a metadata journal is probably helpful, until the time that we remove a file on one FS and add it on another, and get to wondering which of the two ops to roll forward .. > That is why > you've got to retrieve the bitmap that was in use on A and combine it > with B's bitmap before you resync from B to A (or do a full resync). The logic still eludes me. This operation finds the set of blocks that _may be different_ at this time between the two disks. That enables one to efficiently copy A to B (or v.v.) because we know we only have to write the blocks marked. But whether that is a good idea or not is an orthogonal question, and to me it doesn't look necessarily better than some of the alternatives (doing nothing, for example). What makes it a good idea? Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 12:10 ` Peter T. Breuer @ 2005-03-21 16:07 ` Paul Clements 2005-03-21 18:56 ` Luca Berra 0 siblings, 1 reply; 71+ messages in thread From: Paul Clements @ 2005-03-21 16:07 UTC (permalink / raw) To: Peter T. Breuer; +Cc: linux-raid Peter T. Breuer wrote: > Paul Clements <paul.clements@steeleye.com> wrote: > >>Peter T. Breuer wrote: >> >>>I don't see that this solves anything. If you had both sides going at >>>once, receiving different writes, then you are sc&**ed, and no >>>resolution of bitmaps will help you, since both sides have received >>>different (legitimate) data. It doesn't seem relevant to me to consider >> >>You're forgetting that journalling filesystems and databases have to >>replay their journals or transaction logs when they start up. All I'm saying is that in a split-brain scenario, typical cluster frameworks will make two (or more) systems active at the same time. This is not necessarily fatal, because as you pointed out, if only one of those systems (let's call it system A) is really available to the outside world then you can usually simply trust the data on A and use it to sync over the other copies. But, if system B brought up a database or journalling FS on its copy of the data, then there were writes to that disk that have to be synced over. You can't simply use the bitmap on system A; you have to combine them (or else do a full resync). >>>What about when A comes back up? We then get a >>> >>> .--------------. >>> system A | system B | >>> nbd ---' [raid1] | >>> | / \ | >>> [disk] [disk] [nbd]-' >>> >>>situation, and a resync is done (skipping clean sectors). >> >>You're forgetting that there may be some data (uncommitted data) that >>didn't reach B that is on A's disk (or even vice versa). > > > You are saying that the journal on A (presumably not raided itself?) is > waiting to play some data into its own disk as soon as we have finished > resyncing it from B? I don't think that would be a good idea at all. No, I'm simply saying this: when you fail over from system A to system B (say you lost the network or system A died), there is some amount of data that could be out of sync (because raid1 submits writes to all disks simultaneously). When you take over using the data on system B, you're presumably going to want to (at some point) get A back to a state where it has the latest data (in case B later fails or in case A is a better system and you want to make it active, instead of B). To do that, you can't simply take the bitmap from B and sync back to A. You've got to look at the old bitmap on A and combine it with B's bitmap (or you've got to do a full resync). Until you've done that, the data that is on A is worthless. ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-21 16:07 ` Paul Clements @ 2005-03-21 18:56 ` Luca Berra 2005-03-21 19:58 ` Paul Clements 0 siblings, 1 reply; 71+ messages in thread From: Luca Berra @ 2005-03-21 18:56 UTC (permalink / raw) To: linux-raid On Mon, Mar 21, 2005 at 11:07:06AM -0500, Paul Clements wrote: >All I'm saying is that in a split-brain scenario, typical cluster >frameworks will make two (or more) systems active at the same time. This I sincerely hope not. -- Luca Berra -- bluca@comedia.it Communication Media & Services S.r.l. /"\ \ / ASCII RIBBON CAMPAIGN X AGAINST HTML MAIL / \ ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-21 18:56 ` Luca Berra @ 2005-03-21 19:58 ` Paul Clements 2005-03-21 20:45 ` Peter T. Breuer 2005-03-22 9:35 ` Luca Berra 0 siblings, 2 replies; 71+ messages in thread From: Paul Clements @ 2005-03-21 19:58 UTC (permalink / raw) Cc: linux-raid Luca Berra wrote: > On Mon, Mar 21, 2005 at 11:07:06AM -0500, Paul Clements wrote: > >> All I'm saying is that in a split-brain scenario, typical cluster >> frameworks will make two (or more) systems active at the same time. This > > I sincerely hope not. Perhaps my choice of wording was not the best? I probably should have said, "there is no foolproof way to guarantee that two systems are not active." Software fails, human beings make mistakes, and surely even STONITH devices can be misconfigured or can fail (or cannot be used for one reason or another). At any rate, this is all irrelevant given the second part of that email reply that I gave. You still have to do the bitmap combining, regardless of whether two systems were active at the same time or not. -- Paul ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-21 19:58 ` Paul Clements @ 2005-03-21 20:45 ` Peter T. Breuer 2005-03-21 21:09 ` Gil ` (2 more replies) 2005-03-22 9:35 ` Luca Berra 1 sibling, 3 replies; 71+ messages in thread From: Peter T. Breuer @ 2005-03-21 20:45 UTC (permalink / raw) To: linux-raid Paul Clements <paul.clements@steeleye.com> wrote: > At any rate, this is all irrelevant given the second part of that email > reply that I gave. You still have to do the bitmap combining, regardless > of whether two systems were active at the same time or not. As I understand it, you want both bitmaps in order to make sure that on resync you wipe over whatever may have been accidentally dirtied on the other side by a clumsy admin or vicious gremlin (various guises - software bug, journal effects, design incompetency, etc.). Writing everything across (and ignoring the bitmap) would do the trick, but given that we are being efficient and using bitmaps to lower the write totals at resync time, we need to use both bitmaps so as not to miss out on overwriting anything we should be overwriting. But why don't we already know from the _single_ bitmap on the array node ("the node with the array") what to rewrite in total? All writes must go through the array. We know how many didn't go to both components. Thus we know how many to rewrite from the survivor to the component that we lost contact with. Might some of the writes we don't think made it to the faulty component actually have made it? Sure. But so what? On resync we will rewrite them because we don't think they made it. Puzzling! Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-21 20:45 ` Peter T. Breuer @ 2005-03-21 21:09 ` Gil 2005-03-21 21:19 ` Paul Clements 2005-03-21 21:32 ` Guy 2 siblings, 0 replies; 71+ messages in thread From: Gil @ 2005-03-21 21:09 UTC (permalink / raw) To: Peter T. Breuer; +Cc: linux-raid Peter T. Breuer wrote: > Paul Clements <paul.clements@steeleye.com> wrote: > >> At any rate, this is all irrelevant given the second part of >> that email reply that I gave. You still have to do the bitmap >> combining, regardless of whether two systems were active at the >> same time or not. > > > As I understand it, you want both bitmaps in order to make sure > that on resync you wipe over whatever may have been accidentally > dirtied on the other side by a clumsy admin or vicious gremlin > (various guises - software bug, journal effects, design > incompetency, etc.). > > Writing everything across (and ignoring the bitmap) would do the > trick, but given that we are being efficient and using bitmaps to > lower the write totals at resync time, we need to use both > bitmaps so as not to miss out on overwriting anything we should > be overwriting. > > But why don't we already know from the _single_ bitmap on the > array node ("the node with the array") what to rewrite in total? > All writes must go through the array. We know how many didn't go > to both components. Thus we know how many to rewrite from the > survivor to the component that we lost contact with. Hi, I've been lurking on the list for a bit and am not as familiar with md internals as most but here goes: As I understand it, the scenario is that both systems can see themselves as the "node with the array" after a split. Both can go on and modify different parts of the array. To recover from the split, an administrator might decide which of the nodes is correct and then blow away the other node's data. We need the bitwise or of both bitmaps' dirty bits in order to know which blocks could possibly have changed while the array was split since both sides were changing independently. As you note, this mechanism just a more efficient way to do things than copying every block. --Gil ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-21 20:45 ` Peter T. Breuer 2005-03-21 21:09 ` Gil @ 2005-03-21 21:19 ` Paul Clements 2005-03-21 22:15 ` Peter T. Breuer 2005-03-22 22:35 ` Peter T. Breuer 2005-03-21 21:32 ` Guy 2 siblings, 2 replies; 71+ messages in thread From: Paul Clements @ 2005-03-21 21:19 UTC (permalink / raw) To: Peter T. Breuer; +Cc: linux-raid Peter T. Breuer wrote: > Paul Clements <paul.clements@steeleye.com> wrote: > >>At any rate, this is all irrelevant given the second part of that email >>reply that I gave. You still have to do the bitmap combining, regardless >>of whether two systems were active at the same time or not. > But why don't we already know from the _single_ bitmap on the array > node ("the node with the array") what to rewrite in total? All writes > must go through the array. We know how many didn't go to both > components. Thus we know how many to rewrite from the survivor to the > component that we lost contact with. I don't think we're talking about the same thing. I'm talking about this: 1) you have an array set up on system A: system A [raid1] / \ [disk] [nbd] --> system B 2) you're writing, say, block 10 to the raid1 when A crashes (block 10 is dirty in the bitmap, and you don't know whether it got written to the disk on A or B, neither, or both) 3) something (i.e., your cluster framework) notices that A is gone and brings up a new raid1, with an empty bitmap, on system B: system B [raid1] / \ [disk] missing (eventually will connect back to system A) 4) some things get written to the raid1 on system B (i.e., the bitmap is dirty) 5) system A comes back and we now want to get the two systems back in sync In this scenario, there are two bitmaps that must be consulted in order to sync the proper blocks back to system A. Without bitmaps (or the ability to combine the bitmaps), you must do a full resync from B to A. -- Paul ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-21 21:19 ` Paul Clements @ 2005-03-21 22:15 ` Peter T. Breuer 0 siblings, 0 replies; 71+ messages in thread From: Peter T. Breuer @ 2005-03-21 22:15 UTC (permalink / raw) To: linux-raid Paul Clements <paul.clements@steeleye.com> wrote: OK - thanks for the reply, Paul ... > Peter T. Breuer wrote: > > But why don't we already know from the _single_ bitmap on the array > > node ("the node with the array") what to rewrite in total? All writes > > must go through the array. We know how many didn't go to both > > components. Thus we know how many to rewrite from the survivor to the > > component that we lost contact with. > > I don't think we're talking about the same thing. I'm talking about this: > > 1) you have an array set up on system A: > > system A > [raid1] > / \ > [disk] [nbd] --> system B > > 2) you're writing, say, block 10 to the raid1 when A crashes (block 10 > is dirty in the bitmap, and you don't know whether it got written to the > disk on A or B, neither, or both) But who are WE? Assuming "we" are some network application running on system A, either a) we got an ack from the raid and passed it on b) we got an ack from the raid but didn't pass it on c) we didn't get an ack from the raid and didn't pass it on In case (a), we wrote both A and B. In case (b), we wrote both A and B. In case (c), we wrote A or B, neither or both. Because bitmaps are cleared lazily, it's likely that one or both are dirty in all cases. > > 3) something (i.e., your cluster framework) notices that A is gone and > brings up a new raid1, with an empty bitmap, on system B: Well, OK. If you want to say that B is pure and unsullied by fiat, that's OK. You now start dirtying the bitmap for every write, so as to track them for when you later want to sync them to A. At this point clearly you may have received some writes that A did not (because it crashed), and you need to write those blocks back to A later too. And similarly A may have received some writes that B did not, and one may have to "undo" them later. So it looks like one should RETAIN the bitmap on B, not zero it, in order to "unwrite" blocks on A that were written, but we never got the writes for. > system B > [raid1] > / \ > [disk] missing (eventually will connect back to system A) > > 4) some things get written to the raid1 on system B (i.e., the bitmap is > dirty) > > 5) system A comes back and we now want to get the two systems back in sync > > In this scenario, there are two bitmaps that must be consulted in order > to sync the proper blocks back to system A. Without bitmaps (or the > ability to combine the bitmaps), you must do a full resync from B to A. Yes, but the analysis is not exact enough to show what should be done. For example, see the suggestion above that B should NOT start with an empty bitmap, but should instead remember which blocks it never received (if any! or if it knows) in order that it can "unwrite" those blocks on A later, in case A did receive them. And I would say that when A receives its write (which would normally clear its bitmap) it should tell B, so that B only clears its bitmap when BOTH it and A have done their writes. And vice versa. Uh, that means we should store write-ids for a while so that we can communicate properly .. these can be request addresses, no? This strategy means that both A and B have pessimistic bitmaps for both of the pair. Either one will do as a resync map.
Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
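A sketch of the "clear only when every mirror has acknowledged" bookkeeping proposed in the message above: each submitted array write dirties the chunk and charges one pending acknowledgement per mirror, and the dirty bit is dropped only when the pending count returns to zero. This is illustrative only; md's real bitmap code keeps its own counters and clears lazily from a daemon.

    #include <stdatomic.h>

    #define NR_CHUNKS   1024
    #define NR_MIRRORS  2

    static atomic_int    pending[NR_CHUNKS];
    static unsigned char dirty[NR_CHUNKS / 8];

    static void chunk_write_submitted(unsigned chunk)
    {
            dirty[chunk / 8] |= 1 << (chunk % 8);           /* dirty before any I/O    */
            atomic_fetch_add(&pending[chunk], NR_MIRRORS);  /* one ack owed per mirror */
    }

    static void chunk_write_acked(unsigned chunk)           /* called once per mirror  */
    {
            if (atomic_fetch_sub(&pending[chunk], 1) == 1)
                    dirty[chunk / 8] &= ~(1 << (chunk % 8)); /* all mirrors have it    */
    }

With this rule each side's bitmap is pessimistic on its own, which is exactly the property the message argues for.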
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-21 21:19 ` Paul Clements 2005-03-21 22:15 ` Peter T. Breuer @ 2005-03-22 22:35 ` Peter T. Breuer 1 sibling, 0 replies; 71+ messages in thread From: Peter T. Breuer @ 2005-03-22 22:35 UTC (permalink / raw) To: linux-raid Paul Clements <paul.clements@steeleye.com> wrote: > system A > [raid1] > / \ > [disk] [nbd] --> system B > > 2) you're writing, say, block 10 to the raid1 when A crashes (block 10 > is dirty in the bitmap, and you don't know whether it got written to the > disk on A or B, neither, or both) Let me offer an example based on this scenario. Block 10 is sent to both, and B's bitmap is dirtied for it, but the data itself never arrives. At the same time block 10 is sent to A, and the bitmap is dirtied for it, the data sent, and (miraculously) the bitmap on A is cleared for the received data (I don't now why or how - nobody has yet specified the algorithm with enough precision for me to say). At this point B's bitmap is dirty for block 10, and A's is not. A has received the data for block 10, and B has not. > 3) something (i.e., your cluster framework) notices that A is gone and > brings up a new raid1, with an empty bitmap, on system B: Now, this looks wrong, because to sync A from B we will later need to copy block 10 from B to A in order to "undo" the extra write already done on A, and A's bitmap is not marked dirty for block 10, only B's is, so we cannot zero B's bitmap because that would lose the information about block 10. -- I've been thinking about this in more general terms, and it seems to me that the algorithms offered (and I say I have not seen enough detail to be sure) may be in general "insufficiently pessimistic". That is, they may clear the bitmap too soon (as in the thought experiment above). Or they may not dirty the bitmaps soon enough. I believe that you are aiming for algorithms in which the _combined_ bitmaps are "sufficiently pessimistic", but the individual bitmaps are not necesarily so. But it appears to me as though it may not be much trouble to ensure that _each_ bitmap is sufficiently pessimistic on its own with respect to clearing. Just clear _each_ bitmap only when _both_ writes have been done. -- Can this plan fail to be pessimistic enough with respect to dirtying the bitmaps in the first place? What if block 10 is sent to A, which is to say the bitmap on A is dirtied, and the data sent, and received on A. Can B _not_ have its bitmap dirtied for block 10? Well, yes, if A dies before sending out the bitmap dirty to B, but after sending out the bitmap dirty AND the data to A. That's normally not possible. We normally surely send out all bitmap dirties before sending out any data. But can we wait for these to complete before starting on the data writes? If B times out, we will have to go ahead and dirty A's bitmap on its own and thereafter always dirty and never clear it. So this corresponds to A continuing to work after losing contact with B. Now, if A dies after that, and for some reason we start using B, then B will need eventually to have its block 10 sent to A when we resync A from B. But we never should have switched to B in the first place! B was expelled from the array. But A maybe died before saying so to anyone. Well, plainly A should not have gone on to write anything in the array after expelling B until it was able to write in its (A's) superblock that B had been expelled. 
Then, later, on recovery with a sync from B to A (even though it is the wrong direction), A will either say in its sb that B has not been expelled AND contain no extra writes to be undone from B, or A will say that B has been expelled, and its bitmap will say which writes have been done that were not done on B, and we can happily decide to sync from B, or sync from A. So it looks like there are indeed several admin foul-ups and crossed wires which could give us reason to sync in the wrong direction, and then we will want to know what the recipient has in its bitmap. But we will be able to see that that is the situation. In all other cases, it is sufficient to know just the bitmap on the master. The particular dubious situation outlined here is 1) A loses contact with B and continues working without B in the array, so B is out of date. 2) A dies, and B is recovered, becoming used as the master. 3) When A is recovered, we choose to sync A from B, not B from A. In that case we need to look at the bitmaps on both sides. But note that one bitmap per array (on the "local" side) would suffice in this case. The array node location shifts during the process outlined, giving two bitmaps to make use of eventually. Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
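The decision sketched in the message above can be written down compactly: when syncing "backwards" from B onto A, the chunks to rewrite are B's dirty chunks, plus A's own dirty chunks if and only if A's superblock records that it once expelled B and kept writing. The superblock flag and the sizes below are hypothetical, named only for illustration.

    #include <stdbool.h>
    #include <stddef.h>

    #define BITMAP_BYTES 4096    /* illustrative size */

    /* Chunks to copy from B (the chosen master) onto A. */
    static void chunks_to_rewrite(unsigned char *out,
                                  const unsigned char *a_map, bool a_expelled_peer,
                                  const unsigned char *b_map)
    {
            for (size_t i = 0; i < BITMAP_BYTES; i++)
                    out[i] = b_map[i] | (a_expelled_peer ? a_map[i] : 0);
    }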
* RE: [PATCH 1/2] md bitmap bug fixes 2005-03-21 20:45 ` Peter T. Breuer 2005-03-21 21:09 ` Gil 2005-03-21 21:19 ` Paul Clements @ 2005-03-21 21:32 ` Guy 2 siblings, 0 replies; 71+ messages in thread From: Guy @ 2005-03-21 21:32 UTC (permalink / raw) To: 'Peter T. Breuer', linux-raid Say you have 2 systems, "A" and "B". "A" is active, a network failure occurs, node "A" can't talk to "B" or the clients. The remote member of the array is failed. But "A" is still running, and maybe in the process of shutting down. The process of shutting down "A" will cause more writes to the disk. Even without a proper shutdown, disk writes can occur, cron jobs, log files, whatever, all occurring before the shutdown, or power off. At some point node "B" becomes active, and processes live data. Now, someone fixes the problem and you want to re-sync. Both "A" and "B" have done disk I/O that the other does not know about. Both bitmap must be used to re-sync, or a 100% re-sync must be done. I think what I have outlined above is quite reasonable. Guy -----Original Message----- From: linux-raid-owner@vger.kernel.org [mailto:linux-raid-owner@vger.kernel.org] On Behalf Of Peter T. Breuer Sent: Monday, March 21, 2005 3:45 PM To: linux-raid@vger.kernel.org Subject: Re: [PATCH 1/2] md bitmap bug fixes Paul Clements <paul.clements@steeleye.com> wrote: > At any rate, this is all irrelevant given the second part of that email > reply that I gave. You still have to do the bitmap combining, regardless > of whether two systems were active at the same time or not. As I understand it, you want both bitmaps in order to make sure that on resync you wipe over whatever may have been accidentally dirtied on the other side by a clumsy admin or vicious gremlin (various guises - software bug, journal effects, design incompetency, etc.). Writing everything across (and ignoring the bitmap) would do the trick, but given that we are being efficient and using bitmaps to lower the write totals at resync time, we need to use both bitmaps so as not to miss out on overwriting anything we should be overwriting. But why don't we already know from the _single_ bitmap on the array node ("the node with the array") what to rewrite in total? All writes must go through the array. We know how many didn't go to both components. Thus we know how many to rewrite from the survivor to the component that we lost contact with. Might some of the writes we don't think made it to the faulty component actually have made it? Sure. But so what? On resync we will rewrite them because we don't think they made it. Puzzling! Peter - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-21 19:58 ` Paul Clements 2005-03-21 20:45 ` Peter T. Breuer @ 2005-03-22 9:35 ` Luca Berra 2005-03-22 10:02 ` Peter T. Breuer 1 sibling, 1 reply; 71+ messages in thread From: Luca Berra @ 2005-03-22 9:35 UTC (permalink / raw) To: linux-raid On Mon, Mar 21, 2005 at 02:58:56PM -0500, Paul Clements wrote: >Luca Berra wrote: >>On Mon, Mar 21, 2005 at 11:07:06AM -0500, Paul Clements wrote: >> >>>All I'm saying is that in a split-brain scenario, typical cluster >>>frameworks will make two (or more) systems active at the same time. This >> >>I sincerely hope not. > >Perhaps my choice of wording was not the best? I probably should have >said, "there is no foolproof way to guarantee that two systems are not >active." Software fails, human beings make mistakes, and surely even >STONITH devices can be misconfigured or can fail (or cannot be used for >one reason or another). well, careful use of an arbitrator node, possibly in a different location, helps avoiding split-brains, and stonith is a requirement >At any rate, this is all irrelevant given the second part of that email >reply that I gave. You still have to do the bitmap combining, regardless >of whether two systems were active at the same time or not. I still maintain that doing data-replication with md over nbd is a painful and not very useful exercise. If we want to do data-replication, access to the data-replicated device should be controlled by the data replication process (*), md does not guarantee this. (*) i.e. my requirements could be that having a replicated transaction is more important that completing the transaction itself, so i might want to return a disk error in case replica fails. or to the contrary i might want data availability above all else, maybe data does not change much. or something in between, data availability above replication, but data validity over availability. this is probably the most common scenario, and the more difficult to implement correctly. In any case it must be possible to control exactly which steps should be automatically done in case of failure. and in case of rollback, with the sane default would be "die rather than modify any data, in case of doubt". L. -- Luca Berra -- bluca@comedia.it Communication Media & Services S.r.l. /"\ \ / ASCII RIBBON CAMPAIGN X AGAINST HTML MAIL / \ ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-22 9:35 ` Luca Berra @ 2005-03-22 10:02 ` Peter T. Breuer 2005-03-23 20:31 ` Luca Berra 0 siblings, 1 reply; 71+ messages in thread From: Peter T. Breuer @ 2005-03-22 10:02 UTC (permalink / raw) To: linux-raid Luca Berra <bluca@comedia.it> wrote: > If we want to do data-replication, access to the data-replicated device > should be controlled by the data replication process (*), md does not > guarantee this. Well, if one writes to the md device, then md does guarantee this - but I find it hard to parse the statement. Can you elaborate a little in order to reduce my possible confusion? > (*) i.e. my requirements could be that having a replicated transaction > is more important that completing the transaction itself, so i might > want to return a disk error in case replica fails. Oh - I see. "We did only half of all the replications possible". That's an interesting idea and it is trivial to modify md to return an error if not all the replications succeeded. The bitmap knows right now. No reason not to call end_io(...,0) instead of end_io(...,1) if you want it that way. > or to the contrary i might want data availability above all else, maybe > data does not change much. > or something in between, data availability above replication, but > data validity over availability. this is probably the most common > scenario, and the more difficult to implement correctly. > > In any case it must be possible to control exactly which steps should be > automatically done in case of failure. and in case of rollback, with the > sane default would be "die rather than modify any data, in case of > doubt". Well, if you want to be more exact about it, I am sure your wishes can be accommodated. It's not a bad idea to be able to tailor the policy. Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
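A hedged sketch of the policy knob mentioned above: count how many of the mirror writes actually succeeded and, under a "replication before availability" policy, report the array write as failed unless every mirror took it. This is a standalone illustration of the idea, not raid1.c.

    #include <stdbool.h>

    enum write_policy {
            ANY_MIRROR_OK,        /* availability first: one success is enough        */
            ALL_MIRRORS_OR_ERROR  /* replication first: fail unless all mirrors wrote */
    };

    /* Decide what to pass back to the upper layer's completion (the
     * end_io(...,1) versus end_io(...,0) choice discussed above). */
    static bool report_write_ok(enum write_policy policy,
                                int nr_mirrors, int nr_succeeded)
    {
            if (policy == ALL_MIRRORS_OR_ERROR)
                    return nr_succeeded == nr_mirrors;
            return nr_succeeded > 0;
    }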
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-22 10:02 ` Peter T. Breuer @ 2005-03-23 20:31 ` Luca Berra 2005-03-25 18:51 ` Peter T. Breuer 0 siblings, 1 reply; 71+ messages in thread From: Luca Berra @ 2005-03-23 20:31 UTC (permalink / raw) To: linux-raid On Tue, Mar 22, 2005 at 11:02:16AM +0100, Peter T. Breuer wrote: >Luca Berra <bluca@comedia.it> wrote: >> If we want to do data-replication, access to the data-replicated device >> should be controlled by the data replication process (*), md does not >> guarantee this. > >Well, if one writes to the md device, then md does guarantee this - but >I find it hard to parse the statement. Can you elaborate a little in >order to reduce my possible confusion? I'll try. In a fault-tolerant architecture we have two systems, each with local storage which is exposed to the other system via nbd or similar. One node is active and writes data to an md device composed from the local storage and the nbd device. The other node is stand-by and ready to take the place of the former in case it fails. I assume for the moment that the data replication is synchronous (the write system call returns when io has been submitted to both the underlying devices) (*). We can have a series of failures which must be accounted for and dealt with according to a policy that might be site specific.
A) Failure of the standby node
A.1) the active is allowed to continue in the absence of a data replica
A.2) disk writes from the active should return an error.
we can configure this setting in advance.
B) Failure of the active node
B.1) the standby node immediately takes ownership of the data and resumes processing
B.2) the standby node remains idle
C) communication failure between the two nodes (and we don't have an external mechanism to arbitrate the split brain condition)
C.1) both systems panic and halt
C.2) A1 + B2
C.3) A2 + B2
C.4) A1 + B1
C.5) A2 + B1 (which hopefully will go to A2 itself)
D) communication failure between the two nodes (assuming we do have an external mechanism to arbitrate the split brain condition)
D.1) A1 + B2
D.2) A2 + B2
D.3) B1 then A1
D.4) B1 then A2
E) rolling failure (C, then B)
F) rolling failure (D, then B)
G) a failed node is restored
H) a node (re)starts while the other is failed
I) a node (re)starts during C
J) a node (re)starts during D
K) a node (re)starts during E
L) a node (re)starts during F
Scenarios without sub-scenarios are left as an exercise to the reader, or I might find myself losing a job :) Now evaluate all scenarios under the following drivers:
1) data availability above all others
2) replica of data above all others
3) data availability above replica, but data consistency above availability
(*) if you got this far, add asynchronous replicas to the picture. Regards, Luca -- Luca Berra -- bluca@comedia.it Communication Media & Services S.r.l. /"\ \ / ASCII RIBBON CAMPAIGN X AGAINST HTML MAIL / \ ^ permalink raw reply [flat|nested] 71+ messages in thread
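The three drivers listed above amount to a site-wide policy that has to be chosen before any of the failure cases fire. A small sketch of how such a policy might be encoded and consulted for case A (standby failure); the names are invented for illustration.

    enum replication_driver {
            AVAILABILITY_FIRST,    /* 1) keep serving data above all else        */
            REPLICA_FIRST,         /* 2) refuse writes that cannot be replicated */
            CONSISTENCY_FIRST      /* 3) keep serving, but never risk divergence */
    };

    /* Case A: the standby node has failed.  Return 0 to continue degraded
     * (A.1) or -1 to start failing writes (A.2). */
    static int on_standby_failure(enum replication_driver d)
    {
            switch (d) {
            case REPLICA_FIRST:
                    return -1;          /* A.2 */
            case AVAILABILITY_FIRST:
            case CONSISTENCY_FIRST:
            default:
                    return 0;           /* A.1 */
            }
    }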
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-23 20:31 ` Luca Berra @ 2005-03-25 18:51 ` Peter T. Breuer 2005-03-25 20:54 ` berk walker 0 siblings, 1 reply; 71+ messages in thread From: Peter T. Breuer @ 2005-03-25 18:51 UTC (permalink / raw) To: linux-raid Luca Berra <bluca@comedia.it> wrote: > we can have a series of failures which must be accounted for and dealt > with according to a policy that might be site specific. > > A) Failure of the standby node > A.1) the active is allowed to continue in the absence of a data replica > A.2) disk writes from the active should return an error. > we can configure this setting in advance. OK. One normally wants RAID to provide continuity of service in real time however. Your choice A2 is not aimed at that, but at guaranteeing the existence of an exact copy of whatever is written. This seems to me only to have applications in accountancy :-). > B) Failure of the active node > B.1) the standby node takes immediately ownership of data and resumes > processing > B.2) the standby node remains idle Well, that's the same set of choices as for A, morally. You might as well pair them with A1 and A2. > C) communication failure between the two nodes (and we don't have an > external mechanism to arbitrate the split brain condition) > C.1) both system panic and halt > C.2) A1 + B2 I don't see the point of anything except A1+B1, A2+B2, as policies. But A1+B1 will normally cause divergence, unless the failure is due to actual isolation of, say, system A from the whole external net. Provided the route between the two systems passes through the router that chooses whether to use A or B for external contacts, I don't see how a loss of contact can be anything but a breakdown of that router (but you could argue for a very whacky router). In which case it doesn't matter what you choose, because nothing will write to either. > C.3) A2 + B2 > C.4) A1 + B1 > C.5) A2 + B1 (which hopefully will go to A2 itself) > D) communication failure between the two nodes (admitting we have an > external mechanism to arbitrate the split brain condition) > D.1) A1 + B2 > D.2) A2 + B2 > D.2) B1 then A1 > D.3) B1 then A2 I would hope that we could at least guarantee that if comms fails between them, then it is because ONE (or more) of them is out of contact with the world. We can achieve that condition via routing. In that case either A1+B1 or A2+B2 would do, depending on your aims (continuity of service or data replication). > E) rolling failure (C, then B) > > F) rolling failure (D, then B) Not sure what these mean. > G) a failed nodes is restored > > H) a node (re)starts while the other is failed > > I) a node (re)starts during C > > J) a node (re)starts during D > > K) a node (re)starts during E > > L) a node (re)starts during F Ecch. Well, you are very thorough. This is well thought-through. > scenarios without a sub-scenarios are left as an exercise to the reader, > or i might find myself losing a job :) > > now evaluate all scenarios under the following drivers: > 1) data availability above all others > 2) replica of data above all others Exactly. I see only those as sensible aims. > 3) data availability above replica, but data consistency above > availability Heck! Well, that is very very thorough. > (*) if you got this far, add asynchronous replicas to the picture. I don't know what to say. In many of those situations we do not know what to do, but your analysis is excellent, and allows us to at least think about it. Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-25 18:51 ` Peter T. Breuer
@ 2005-03-25 20:54 ` berk walker
2005-03-25 20:56 ` berk walker
0 siblings, 1 reply; 71+ messages in thread
From: berk walker @ 2005-03-25 20:54 UTC (permalink / raw)
To: Peter T. Breuer; +Cc: linux-raid
As I understand it, the topic is what happens when a RAID0 which is
split twixt remote sites loses communication. What would you suggest
as a method to keep the obvious from happening?
A proactive approach, perhaps? Since, it seems, write requests do not
have to be ack'd immediately, there *will* be a total sync loss. Are
the write logs deep enough to be used to resync, once the systems
realize that they are no longer connected? From what I've read, the
bitmaps will be no help in reconstruction, but just in error detection
(correct me if needed).
This is indeed an interesting topic, and one that strikes close to home.
b-
^ permalink raw reply	[flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-25 20:54 ` berk walker
@ 2005-03-25 20:56 ` berk walker
0 siblings, 0 replies; 71+ messages in thread
From: berk walker @ 2005-03-25 20:56 UTC (permalink / raw)
To: berk walker; +Cc: Peter T. Breuer, linux-raid
berk walker wrote:
> As I understand it, the topic is what happens when a RAID0 which is
> split twixt remote sites loses communication. What would you suggest
> as a method to keep the obvious from happening?
>
oops! RAID1.. sorry
> A proactive approach, perhaps? Since, it seems, write requests do not
> have to be ack'd immediately, there *will* be a total sync loss. Are
> the write logs deep enough to be used to resync, once the systems
> realize that they are no longer connected? From what I've read, the
> bitmaps will be no help in reconstruction, but just in error detection
> (correct me if needed).
> This is indeed an interesting topic, and one that strikes close to home.
> b-
> -
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
> .
>
^ permalink raw reply	[flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-18 13:42 ` Lars Marowsky-Bree 2005-03-18 14:50 ` Peter T. Breuer @ 2005-03-18 17:16 ` Luca Berra 2005-03-18 17:57 ` Lars Marowsky-Bree 2005-03-18 21:46 ` Michael Tokarev 1 sibling, 2 replies; 71+ messages in thread From: Luca Berra @ 2005-03-18 17:16 UTC (permalink / raw) To: linux-raid On Fri, Mar 18, 2005 at 02:42:55PM +0100, Lars Marowsky-Bree wrote: >The problem is for multi-nodes, both sides have their own bitmap. When a >split scenario occurs, and both sides begin modifying the data, that >bitmap needs to be merged before resync, or else we risk 'forgetting' >that one side dirtied a block. on a side note i am wondering what would the difference be on using this approach within the md driver versus DRBD? L. -- Luca Berra -- bluca@comedia.it Communication Media & Services S.r.l. /"\ \ / ASCII RIBBON CAMPAIGN X AGAINST HTML MAIL / \ ^ permalink raw reply [flat|nested] 71+ messages in thread
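Conceptually, the merge Lars describes is just a bitwise OR of the two
per-node dirty bitmaps: a chunk has to be resynced if either side wrote to
it while the mirrors were apart. A small userspace sketch of that merge
logic only; this is not the md on-disk bitmap format, and the names are
invented for the example.

#include <stdint.h>
#include <stdio.h>

#define CHUNKS 64			/* bitmap chunks covered by this toy example */
#define WORDS  ((CHUNKS + 63) / 64)

/* Merge the peer's dirty bits into ours before the resync starts. */
static void bitmap_merge(uint64_t *mine, const uint64_t *peer)
{
	for (unsigned int i = 0; i < WORDS; i++)
		mine[i] |= peer[i];
}

int main(void)
{
	uint64_t local[WORDS]  = { 0 };
	uint64_t remote[WORDS] = { 0 };

	local[0]  |= 1ULL << 3;		/* this node dirtied chunk 3 during the split */
	remote[0] |= 1ULL << 40;	/* the peer dirtied chunk 40 during the split */

	bitmap_merge(local, remote);

	for (unsigned int c = 0; c < CHUNKS; c++)
		if (local[c / 64] & (1ULL << (c % 64)))
			printf("chunk %u must be resynced\n", c);
	return 0;
}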
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-18 17:16 ` Luca Berra @ 2005-03-18 17:57 ` Lars Marowsky-Bree 2005-03-18 21:46 ` Michael Tokarev 1 sibling, 0 replies; 71+ messages in thread From: Lars Marowsky-Bree @ 2005-03-18 17:57 UTC (permalink / raw) To: linux-raid On 2005-03-18T18:16:08, Luca Berra <bluca@comedia.it> wrote: > >The problem is for multi-nodes, both sides have their own bitmap. When a > >split scenario occurs, and both sides begin modifying the data, that > >bitmap needs to be merged before resync, or else we risk 'forgetting' > >that one side dirtied a block. > on a side note i am wondering what would the difference be on using this > approach within the md driver versus DRBD? Functionally equivalent, I think. drbd's main advantage right now is that it comes in a neat single package and is, compared to getting the pieces combined right, much easier to setup. Having the speed-up resync in itself doesn't yet solve all issues: If you look at drbd's generation counter handling, the included merging of the bitmaps on connect, tight integration with the network stack et cetera. One might also argue that drbd's activity log (plus the bitmap) is slightly more sophisticated, but Philipp is the man to speak for that. Now, certainly one can build such a solution on top of the md fast resync (with maybe some features lifted from drbd if there are any nice ones you'd care for) + a network block device / iSCSI too, and wrap it all up so that it'd look just like drbd. And that wouldn't be too bad. Sincerely, Lars Marowsky-Brée <lmb@suse.de> -- High Availability & Clustering SUSE Labs, Research and Development SUSE LINUX Products GmbH - A Novell Business - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-18 17:16 ` Luca Berra 2005-03-18 17:57 ` Lars Marowsky-Bree @ 2005-03-18 21:46 ` Michael Tokarev 2005-03-19 9:05 ` Lars Marowsky-Bree 2005-03-19 12:16 ` Peter T. Breuer 1 sibling, 2 replies; 71+ messages in thread From: Michael Tokarev @ 2005-03-18 21:46 UTC (permalink / raw) To: Luca Berra; +Cc: linux-raid Luca Berra wrote: > On Fri, Mar 18, 2005 at 02:42:55PM +0100, Lars Marowsky-Bree wrote: > >> The problem is for multi-nodes, both sides have their own bitmap. When a >> split scenario occurs, and both sides begin modifying the data, that >> bitmap needs to be merged before resync, or else we risk 'forgetting' >> that one side dirtied a block. > > on a side note i am wondering what would the difference be on using this > approach within the md driver versus DRBD? DRBD is more suitable for the task IMHO. Several points: o For md, all drives are equal, that is, for example, raid1 code will balance reads among all the available drives a-la striping, while DRBD knows one mirror is remote and hence will not try to read from it. Well, todays GigE is fast, but it is yet another layer between your data and the memory, and we also have such a thing as latency. o We all know how md "loves" to kick off "faulty" array components after first I/O error, don't we? DRBD allows "temporary" failures of remote component, and will recover automatically when the remote comes back. o DRBD allows local drive to be a bit ahead compared to remote one (configurable), while md will wait for all drives to complete a write. There's a case which is questionable in the first place: what to do if local part of the mirror fails? Md will happily run on single remote component in degraded mode, while DRBD will probably fail... Dunno which behaviour is "better" or "more correct" (depends on the usage scenario I think). But oh, I haven't looked at DRBD for about 5 years... /mjt ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-18 21:46 ` Michael Tokarev
@ 2005-03-19 9:05 ` Lars Marowsky-Bree
2005-03-19 12:16 ` Peter T. Breuer
1 sibling, 0 replies; 71+ messages in thread
From: Lars Marowsky-Bree @ 2005-03-19 9:05 UTC (permalink / raw)
To: Michael Tokarev, Luca Berra; +Cc: linux-raid
On 2005-03-19T00:46:51, Michael Tokarev <mjt@tls.msk.ru> wrote:
> o DRBD allows local drive to be a bit ahead compared to remote one
>   (configurable), while md will wait for all drives to complete a write.
The async writes have been added to md recently by Paul too.
> There's a case which is questionable in the first place: what to
> do if local part of the mirror fails?  Md will happily run on
> single remote component in degraded mode, while DRBD will probably
> fail...
drbd will continue correctly; it'll detach the local drive and run all
IO over the network.
Also, the direction of the sync has been decoupled from the
primary/secondary status for a while, and the drbd-0.8 (development
branch) is almost 90% there to allow both nodes IO access to the
device. (I think; maybe it's also already done. It's been a while since
I tried playing with that particular version.)
Sincerely,
Lars Marowsky-Brée <lmb@suse.de>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply	[flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-18 21:46 ` Michael Tokarev 2005-03-19 9:05 ` Lars Marowsky-Bree @ 2005-03-19 12:16 ` Peter T. Breuer 2005-03-19 12:34 ` Michael Tokarev 1 sibling, 1 reply; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 12:16 UTC (permalink / raw) To: linux-raid Michael Tokarev <mjt@tls.msk.ru> wrote: > Luca Berra wrote: > > On Fri, Mar 18, 2005 at 02:42:55PM +0100, Lars Marowsky-Bree wrote: > > > >> The problem is for multi-nodes, both sides have their own bitmap. When a > >> split scenario occurs, and both sides begin modifying the data, that > >> bitmap needs to be merged before resync, or else we risk 'forgetting' > >> that one side dirtied a block. > > > > on a side note i am wondering what would the difference be on using this > > approach within the md driver versus DRBD? > > DRBD is more suitable for the task IMHO. Several points: > > o For md, all drives are equal, that is, for example, raid1 > code will balance reads among all the available drives a-la Not necessarily so. At least part of the FR1 patch is dedicated to timing the latencies of the disks, and choosing the fastest disk to read from. > striping, while DRBD knows one mirror is remote and hence > will not try to read from it. Well, todays GigE is fast, > but it is yet another layer between your data and the memory, > and we also have such a thing as latency. > > o We all know how md "loves" to kick off "faulty" array components > after first I/O error, don't we? DRBD allows "temporary" failures Again, the FR1 patch contains the "Robust Read" subpatch, which stops this happening. It's not a patch that intersects with the bitmap functionality at all, by the way. > of remote component, and will recover automatically when the > remote comes back. Well, so will FR1 (at least when run over ENBD, because FR1 contains a machanism that allows the disks to tell it when they have come back into their full gleam of health again). > o DRBD allows local drive to be a bit ahead compared to remote one > (configurable), while md will wait for all drives to complete a write. The FR1 patch allows asynchronous writes too. This does need the bitmap. I presume Paul's latest patches to raid1 for 2.6.11 also add that to kernel raid1. Incidentally, async writes are a little risky. We can more easily imagine a tricky recovery situation with them than without them! > There's a case which is questionable in the first place: what to > do if local part of the mirror fails? Md will happily run on > single remote component in degraded mode, while DRBD will probably > fail... Dunno which behaviour is "better" or "more correct" > (depends on the usage scenario I think). It's clearly correct to run on the remote if that's what you have. Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes
2005-03-19 12:16 ` Peter T. Breuer
@ 2005-03-19 12:34 ` Michael Tokarev
2005-03-19 12:53 ` Peter T. Breuer
0 siblings, 1 reply; 71+ messages in thread
From: Michael Tokarev @ 2005-03-19 12:34 UTC (permalink / raw)
To: Peter T. Breuer; +Cc: linux-raid
Peter T. Breuer wrote:
> Michael Tokarev <mjt@tls.msk.ru> wrote:
[]
>>o For md, all drives are equal, that is, for example, raid1
>>  code will balance reads among all the available drives a-la
>
> Not necessarily so. At least part of the FR1 patch is dedicated to
> timing the latencies of the disks, and choosing the fastest disk to
> read from.
[]
>>o We all know how md "loves" to kick off "faulty" array components
>>  after first I/O error, don't we? DRBD allows "temporary" failures
>
> Again, the FR1 patch contains the "Robust Read" subpatch, which stops
> this happening. It's not a patch that intersects with the bitmap
> functionality at all, by the way.
[]
>
> Well, so will FR1 (at least when run over ENBD, because FR1 contains
> a machanism that allows the disks to tell it when they have come back
> into their full gleam of health again).
Ok, you intrigued me enough already.. what's the FR1 patch? I want
to give it a try... ;) Especially I'm interested in the "Robust Read"
thing...
/mjt
^ permalink raw reply	[flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-19 12:34 ` Michael Tokarev @ 2005-03-19 12:53 ` Peter T. Breuer 2005-03-19 16:08 ` "Robust Read" (was: [PATCH 1/2] md bitmap bug fixes) Michael Tokarev 0 siblings, 1 reply; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 12:53 UTC (permalink / raw) To: linux-raid Michael Tokarev <mjt@tls.msk.ru> wrote: > Ok, you intrigued me enouth already.. what's the FR1 patch? I want > to give it a try... ;) Especially I'm interested in the "Robust Read" > thing... That was published on this list a few weeks ago (probably needs updating, but I am sure you can help :-). Google "linux raid robust read patch for raid1". I see a pointer to at least http://www.spinics.net/lists/raid/msg07732.html but glancing at it I can't see if it's the latest. In particular I can't see a correction I made at one point that sets the current rdev before calling the map() function ... In the FR1 patch it's all delineated by #ifdef blah_ROBUST_READ_blah. That's at ftp://oboe.it.uc3m.es/pub/Programs/fr1-2.16.tgz The patch was originally developed for 2.4, then ported to 2.6.3, and then to 2.6.8.1. Neil has recently been doing stuff, so I don't think it applies cleanly to 2.6.10, but somebody WAS porting it for me until they found that 2.6.10 didn't support their hardware ... and I recall discussing with him what to do about the change of map() to read_balance() in the code (essentially, put map() back). And finding that the spinlocks have changed too. So essentially, if you want to update the patch for 2.6.10 or above, please do! Here's my quick extraction for 2.6.8.1 (a small hunk that I probably needed for the patches published here is the one at the end). Peter --- ./drivers/md/raid1.c.pre-fr1 Sat Dec 18 22:37:14 2004 +++ ./drivers/md/raid1.c Sun Jan 16 13:18:42 2005 @@ -200,6 +234,32 @@ */ spin_lock_irq(&conf->device_lock); +#ifdef CONFIG_MD_RAID1_ROBUST_READ + /* + * Uh, no. Choose the next disk if we can, not the first. + */ + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].rdev == *rdevp) { + i++; + break; + } + } + if (i >= disks) + i = 0; + for (; i < disks; i++) { + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev && rdev != *rdevp && rdev->in_sync) { + *rdevp = rdev; + atomic_inc(&rdev->nr_pending); + spin_unlock_irq(&conf->device_lock); + return i; + } + } + /* + * If for some reason we found nothing, dropthru and use the old + * routine. + */ +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ for (i = 0; i < disks; i++) { mdk_rdev_t *rdev = conf->mirrors[i].rdev; if (rdev && rdev->in_sync) { @@ -266,9 +368,19 @@ /* * this branch is our 'one mirror IO has finished' event handler: */ - if (!uptodate) - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - else + if (!uptodate) { +#ifdef CONFIG_MD_RAID1_ROBUST_READ + /* + * Only fault disk out of array on write error, not read. 
+ */ + if (0) +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ + md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); +#ifdef DO_ADD_READ_WRITE_CORRECT + else /* tell next time we're here that we're a retry */ + set_bit(R1BIO_ReadRetry, &r1_bio->state); +#endif /* DO_ADD_READ_WRITE_CORRECT */ + } else /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -285,8 +397,20 @@ /* * we have only one bio on the read side */ - if (uptodate) - raid_end_bio_io(r1_bio); + if (uptodate +#ifdef CONFIG_MD_RAID1_ROBUST_READ + /* Give up and error if we're last */ + || (atomic_dec_and_test(&r1_bio->remaining)) +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ + ) +#ifdef DO_ADD_READ_WRITE_CORRECT + if (uptodate && test_bit(R1BIO_ReadRetry, &r1_bio->state)) { + /* Success at last - rewrite failed reads */ + set_bit(R1BIO_IsSync, &r1_bio->state); + reschedule_retry(r1_bio); + } else +#endif /* DO_ADD_READ_WRITE_CORRECT */ + raid_end_bio_io(r1_bio); else { /* * oops, read error: @@ -560,6 +716,19 @@ read_bio->bi_end_io = raid1_end_read_request; read_bio->bi_rw = READ; read_bio->bi_private = r1_bio; +#ifdef CONFIG_MD_RAID1_ROBUST_READ + atomic_set(&r1_bio->remaining, 0); + /* count source devices under spinlock */ + spin_lock_irq(&conf->device_lock); + disks = conf->raid_disks; + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].rdev && + !conf->mirrors[i].rdev->faulty) { + atomic_inc(&r1_bio->remaining); + } + } + spin_unlock_irq(&conf->device_lock); +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ generic_make_request(read_bio); return 0; @@ -925,6 +1249,9 @@ } else { int disk; bio = r1_bio->bios[r1_bio->read_disk]; +#ifdef CONFIG_MD_RAID1_ROBUST_READ + rdev = conf->mirrors[r1_bio->read_disk].rdev; +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ if ((disk=map(mddev, &rdev)) == -1) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" " read error for block %llu\n", ^ permalink raw reply [flat|nested] 71+ messages in thread
* "Robust Read" (was: [PATCH 1/2] md bitmap bug fixes) 2005-03-19 12:53 ` Peter T. Breuer @ 2005-03-19 16:08 ` Michael Tokarev 2005-03-19 17:03 ` "Robust Read" Peter T. Breuer 0 siblings, 1 reply; 71+ messages in thread From: Michael Tokarev @ 2005-03-19 16:08 UTC (permalink / raw) To: linux-raid [-- Attachment #1: Type: text/plain, Size: 2758 bytes --] Peter T. Breuer wrote: [] > The patch was originally developed for 2.4, then ported to 2.6.3, and > then to 2.6.8.1. Neil has recently been doing stuff, so I don't > think it applies cleanly to 2.6.10, but somebody WAS porting it for me > until they found that 2.6.10 didn't support their hardware ... and I > recall discussing with him what to do about the change of map() to > read_balance() in the code (essentially, put map() back). And finding > that the spinlocks have changed too. Well, it turns out current code is easier to modify, including spinlocks. But now, with read_balance() in place, which can pick a "random" disk to read from, we have to keep some sort of bitmap to mark disks which we tried to read from. For the hack below I've added r1_bio->tried_disk of type unsigned long which has one bit per disk, so current scheme is limited to 32 disks in array. This is really a hack for now -- I don't know much about kernel programming rules... ;) > @@ -266,9 +368,19 @@ > /* > * this branch is our 'one mirror IO has finished' event handler: > */ > - if (!uptodate) > - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); > - else > + if (!uptodate) { > +#ifdef CONFIG_MD_RAID1_ROBUST_READ > + /* > + * Only fault disk out of array on write error, not read. > + */ > + if (0) > +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ > + md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); > +#ifdef DO_ADD_READ_WRITE_CORRECT What's this? Was it an attempt (incomplete) to do rewrite-after-failed-read? > + else /* tell next time we're here that we're a retry */ > + set_bit(R1BIO_ReadRetry, &r1_bio->state); > +#endif /* DO_ADD_READ_WRITE_CORRECT */ > + } else > /* > * Set R1BIO_Uptodate in our master bio, so that > * we will return a good error code for to the higher > @@ -285,8 +397,20 @@ > /* > * we have only one bio on the read side > */ > - if (uptodate) > - raid_end_bio_io(r1_bio); > + if (uptodate > +#ifdef CONFIG_MD_RAID1_ROBUST_READ > + /* Give up and error if we're last */ > + || (atomic_dec_and_test(&r1_bio->remaining)) > +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ > + ) > +#ifdef DO_ADD_READ_WRITE_CORRECT > + if (uptodate && test_bit(R1BIO_ReadRetry, &r1_bio->state)) { > + /* Success at last - rewrite failed reads */ > + set_bit(R1BIO_IsSync, &r1_bio->state); > + reschedule_retry(r1_bio); > + } else > +#endif /* DO_ADD_READ_WRITE_CORRECT */ > + raid_end_bio_io(r1_bio); Hmm. Should we do actual write here, instead of rescheduling a successeful read further, after finishing the original request? 
/mjt [-- Attachment #2: raid1_robust_read-2.6.11.diff --] [-- Type: text/plain, Size: 3107 bytes --] --- ./include/linux/raid/raid1.h.orig Wed Mar 2 10:38:10 2005 +++ ./include/linux/raid/raid1.h Sat Mar 19 18:53:42 2005 @@ -83,6 +83,7 @@ struct r1bio_s { * if the IO is in READ direction, then this is where we read */ int read_disk; + unsigned long tried_disks; /* bitmap, disks we had tried to read from */ struct list_head retry_list; /* --- ./drivers/md/raid1.c.orig Wed Mar 2 10:38:10 2005 +++ ./drivers/md/raid1.c Sat Mar 19 18:57:16 2005 @@ -234,9 +234,13 @@ static int raid1_end_read_request(struct /* * this branch is our 'one mirror IO has finished' event handler: */ - if (!uptodate) - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - else + + update_head_pos(mirror, r1_bio); + + /* + * we have only one bio on the read side + */ + if (uptodate) { /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -247,14 +251,8 @@ static int raid1_end_read_request(struct * wait for the 'master' bio. */ set_bit(R1BIO_Uptodate, &r1_bio->state); - - update_head_pos(mirror, r1_bio); - - /* - * we have only one bio on the read side - */ - if (uptodate) raid_end_bio_io(r1_bio); + } else { /* * oops, read error: @@ -332,6 +330,10 @@ static int raid1_end_write_request(struc * * The rdev for the device selected will have nr_pending incremented. */ + +#define disk_tried_before(r1_bio, disk) ((r1_bio)->tried_disks & (1<<(disk))) +#define mark_disk_tried(r1_bio, disk) ((r1_bio)->tried_disks |= 1<<(disk)) + static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const unsigned long this_sector = r1_bio->sector; @@ -351,7 +353,8 @@ static int read_balance(conf_t *conf, r1 new_disk = 0; while (!conf->mirrors[new_disk].rdev || - !conf->mirrors[new_disk].rdev->in_sync) { + !conf->mirrors[new_disk].rdev->in_sync || + disk_tried_before(r1_bio, new_disk)) { new_disk++; if (new_disk == conf->raid_disks) { new_disk = -1; @@ -364,7 +367,8 @@ static int read_balance(conf_t *conf, r1 /* make sure the disk is operational */ while (!conf->mirrors[new_disk].rdev || - !conf->mirrors[new_disk].rdev->in_sync) { + !conf->mirrors[new_disk].rdev->in_sync || + disk_tried_before(r1_bio, new_disk)) { if (new_disk <= 0) new_disk = conf->raid_disks; new_disk--; @@ -394,7 +398,8 @@ static int read_balance(conf_t *conf, r1 disk--; if (!conf->mirrors[disk].rdev || - !conf->mirrors[disk].rdev->in_sync) + !conf->mirrors[disk].rdev->in_sync || + disk_tried_before(r1_bio, new_disk)) continue; if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) { @@ -415,6 +420,7 @@ rb_out: conf->next_seq_sect = this_sector + sectors; conf->last_used = new_disk; atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending); + mark_disk_tried(r1_bio, new_disk); } rcu_read_unlock(); @@ -545,6 +551,7 @@ static int make_request(request_queue_t r1_bio->sector = bio->bi_sector; r1_bio->state = 0; + r1_bio->tried_disks = 0; if (bio_data_dir(bio) == READ) { /* ^ permalink raw reply [flat|nested] 71+ messages in thread
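Stripped of the kernel plumbing, the tried_disks field in the patch above is
a per-request bitmask: one bit per mirror, set once a read has been attempted
there and tested so the retry never lands on a disk that already failed. A
standalone illustration of the same two macros, written against a plain
variable instead of r1_bio so it compiles on its own:

#include <stdio.h>

/* One bit per mirror, as in the patch: limited to as many disks as an
 * unsigned long has bits (32 on 32-bit machines), which is the
 * restriction Michael mentions. */
#define disk_tried_before(tried, disk)	((tried) & (1UL << (disk)))
#define mark_disk_tried(tried, disk)	((tried) |= 1UL << (disk))

int main(void)
{
	unsigned long tried_disks = 0;		/* r1_bio->tried_disks starts out zeroed */

	mark_disk_tried(tried_disks, 0);	/* first read went to mirror 0 and failed */
	mark_disk_tried(tried_disks, 2);	/* retry on mirror 2 failed as well */

	for (int disk = 0; disk < 4; disk++)
		printf("mirror %d: %s\n", disk,
		       disk_tried_before(tried_disks, disk)
		       ? "already tried" : "still a candidate");
	return 0;
}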
* Re: "Robust Read" 2005-03-19 16:08 ` "Robust Read" (was: [PATCH 1/2] md bitmap bug fixes) Michael Tokarev @ 2005-03-19 17:03 ` Peter T. Breuer 2005-03-19 20:20 ` Michael Tokarev 0 siblings, 1 reply; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 17:03 UTC (permalink / raw) To: linux-raid Michael Tokarev <mjt@tls.msk.ru> wrote: > [-- text/plain, encoding 7bit, charset: KOI8-R, 74 lines --] > > Peter T. Breuer wrote: > [] > > The patch was originally developed for 2.4, then ported to 2.6.3, and > > then to 2.6.8.1. Neil has recently been doing stuff, so I don't > > think it applies cleanly to 2.6.10, but somebody WAS porting it for me > > until they found that 2.6.10 didn't support their hardware ... and I > > recall discussing with him what to do about the change of map() to > > read_balance() in the code (essentially, put map() back). And finding > > that the spinlocks have changed too. > > Well, it turns out current code is easier to modify, including spinlocks. > > But now, with read_balance() in place, which can pick a "random" disk > to read from, we have to keep some sort of bitmap to mark disks which > we tried to read from. Yes - what one should do is replace the present call to read_balance() (in the place where map() used to be) with the old call to map() instead, and add in the patched definition of map(). > For the hack below I've added r1_bio->tried_disk of type unsigned long > which has one bit per disk, so current scheme is limited to 32 disks > in array. This is really a hack for now -- I don't know much about > kernel programming rules... ;) Uh OK. As I recall one only needs to count, one doesn't need a bitwise map of what one has dealt with. > > > @@ -266,9 +368,19 @@ > > /* > > * this branch is our 'one mirror IO has finished' event handler: > > */ > > - if (!uptodate) > > - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); > > - else > > + if (!uptodate) { > > +#ifdef CONFIG_MD_RAID1_ROBUST_READ > > + /* > > + * Only fault disk out of array on write error, not read. > > + */ > > + if (0) > > +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ > > + md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); > > +#ifdef DO_ADD_READ_WRITE_CORRECT > > What's this? > Was it an attempt (incomplete) to do rewrite-after-failed-read? Yes. I never tried it (not being brave). It is my indication of how to do it. Essentially, if we have retried a read, and succeeded on the retry, now put the done request on the raid1d thread's list, of syncy-thingies (careful to keep reference counts high), and mark it as already having done the read half, and let the raid1d thread do a write with it now. > > > + else /* tell next time we're here that we're a retry */ > > + set_bit(R1BIO_ReadRetry, &r1_bio->state); > > +#endif /* DO_ADD_READ_WRITE_CORRECT */ See - if we are planning on writing successfully retried reads, we want to mark the retries. > > + } else That was all the case when a read fails. We DON'T fault the disk. Instead we drop through and will retry. A count will save us from doing it in circles. > > /* > > * Set R1BIO_Uptodate in our master bio, so that > > * we will return a good error code for to the higher > > @@ -285,8 +397,20 @@ > > /* > > * we have only one bio on the read side > > */ > > - if (uptodate) > > - raid_end_bio_io(r1_bio); > > + if (uptodate > > +#ifdef CONFIG_MD_RAID1_ROBUST_READ > > + /* Give up and error if we're last */ > > + || (atomic_dec_and_test(&r1_bio->remaining)) > > +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ Yes, I told you there was a count! 
If we have already tried reading all disks, then really really error. Thet "remaining" field was originally not used in the read request case, but was used in the write request case, so I co-opted it for reads too. I suppose it will be set when a read request is made (according to the patch). > > + ) > > +#ifdef DO_ADD_READ_WRITE_CORRECT > > + if (uptodate && test_bit(R1BIO_ReadRetry, &r1_bio->state)) { > > + /* Success at last - rewrite failed reads */ > > + set_bit(R1BIO_IsSync, &r1_bio->state); > > + reschedule_retry(r1_bio); > > + } else > > +#endif /* DO_ADD_READ_WRITE_CORRECT */ Well, if this is really a successful retried read request, then send the bio to the raid1d thread for writing everywhere, after first setting a bit to trick it that it has done the read part of its work. > > + raid_end_bio_io(r1_bio); Ummmm. > Hmm. Should we do actual write here, instead of rescheduling a Well, the rewrite code is untried, but the logic goes as I outlined above. We just schedule the write. > successeful read further, after finishing the original request? Maybe, maybe not. I don't know what or if anything crashes. I don't THINK the buffers in the i/o will disappear, and I THINK the user has been advised that the i/o is done, and even if he hasn't been, he will be shortly when the raid1d thread does its work, so why worry? Yes, this might delay, but read errors should be rare anyway. I'm a little more worried about the end_io. Apparently we end_io if we're uptodate or not uptodate and are the last possible read and not uptodate or not a retry Ummm. 1 not uptodate and are the last possible read or 2 uptodate and not a retry or 3 not uptodate and not a retry and are the last possible read Well, (1) is correct. We return an error. (2) is correct. We return success on first try (3) is correct. We only have one disk and we errored, and return error. But what if we have success and we are not the first try? Ah .. I see. We store the i/o for the raid1d thread and hope it advises the user when finished. Something tells me that should be thought about. > > /mjt > > [-- text/plain, encoding 7bit, charset: US-ASCII, 103 lines, name: raid1_robust_read-2.6.11.diff --] > > --- ./include/linux/raid/raid1.h.orig Wed Mar 2 10:38:10 2005 > +++ ./include/linux/raid/raid1.h Sat Mar 19 18:53:42 2005 > @@ -83,6 +83,7 @@ struct r1bio_s { > * if the IO is in READ direction, then this is where we read > */ > int read_disk; > + unsigned long tried_disks; /* bitmap, disks we had tried to read from */ You don't need this because the count is in the "remaining" field. I hope. > struct list_head retry_list; > /* > --- ./drivers/md/raid1.c.orig Wed Mar 2 10:38:10 2005 > +++ ./drivers/md/raid1.c Sat Mar 19 18:57:16 2005 > @@ -234,9 +234,13 @@ static int raid1_end_read_request(struct > /* > * this branch is our 'one mirror IO has finished' event handler: > */ > - if (!uptodate) > - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); > - else > + > + update_head_pos(mirror, r1_bio); > + Oh well, if you say so! I suppose that indeed the heads have moved. > + /* > + * we have only one bio on the read side > + */ > + if (uptodate) { > /* > * Set R1BIO_Uptodate in our master bio, so that > * we will return a good error code for to the higher > @@ -247,14 +251,8 @@ static int raid1_end_read_request(struct > * wait for the 'master' bio. > */ > set_bit(R1BIO_Uptodate, &r1_bio->state); > - > - update_head_pos(mirror, r1_bio); Hmm, it came from a bit lower down. I see. 
> - > - /* > - * we have only one bio on the read side > - */ > - if (uptodate) > raid_end_bio_io(r1_bio); > + } So if we are uptodate, we set the state bit and do end_io. Isn't this a bit early to be doing that? Shouldn't we check first to see if we are a retried read, and if we are, then try and trigger a write request from the freshly read buffer? Or are you, like me, not going to be brave enough to try the write part! > else { > /* > * oops, read error: > @@ -332,6 +330,10 @@ static int raid1_end_write_request(struc > * > * The rdev for the device selected will have nr_pending incremented. > */ > + > +#define disk_tried_before(r1_bio, disk) ((r1_bio)->tried_disks & (1<<(disk))) > +#define mark_disk_tried(r1_bio, disk) ((r1_bio)->tried_disks |= 1<<(disk)) > + Well, as I said, you don't have to do this if a) you count using the "remaining" field, and b) you use the patched map() function instead of the read_balance() function to choose the target disk, and make sure that map() always chooses the next (active) disk along from where we are now. Oooh - groovy, I still have the code. #ifdef CONFIG_MD_RAID1_ROBUST_READ static int map(mddev_t *mddev, int disk) { conf_t *conf = mddev_to_conf(mddev); int i, disks = conf->raid_disks; /* * Later we do read balancing on the read side * now we use the first available disk. */ spin_lock_irq(&conf->device_lock); /* * Uh, no. Choose the next disk if we can, not the first. */ i = disk + 1; if (i >= disks) i = 0; for (; i < disks; i++) { mdk_rdev_t *rdev = conf->mirrors[i].rdev; if (rdev && i != disk && rdev->in_sync) { atomic_inc(&rdev->nr_pending); spin_unlock_irq(&conf->device_lock); return i; } } /* * If for some reason we found nothing, dropthru and use the * old * routine. */ for (i = 0; i < disks; i++) { mdk_rdev_t *rdev = conf->mirrors[i].rdev; if (rdev && rdev->in_sync) { *rdevp = rdev; atomic_inc(&rdev->nr_pending); spin_unlock_irq(&conf->device_lock); return i; } } spin_unlock_irq(&conf->device_lock); printk(KERN_ERR "raid1_map(): huh, no more operational devices?\n"); return -1; } #endif /* CONFIG_MD_RAID1_ROBUST_READ */ and then in the raid1d function: @@ -932,9 +1282,17 @@ sync_request_write(mddev, r1_bio); unplug = 1; } else { +#ifdef CONFIG_MD_RAID1_ROBUST_READ + int disk = r1_bio->read_disk; + bio = r1_bio->bios[disk]; + rdev = conf->mirrors[disk].rdev; + if ((disk=map(mddev, disk)) < 0) +#else int disk; bio = r1_bio->bios[r1_bio->read_disk]; - if ((disk=read_balance(conf, r1_bio)) == -1) { + if ((disk=read_balance(conf, r1_bio)) == -1) +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ + { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" " read error for block %llu\n", bdevname(bio->bi_bdev,b), (if I count lines correctly - sorry, I am editing by hand). > static int read_balance(conf_t *conf, r1bio_t *r1_bio) > { > const unsigned long this_sector = r1_bio->sector; > @@ -351,7 +353,8 @@ static int read_balance(conf_t *conf, r1 > new_disk = 0; > > while (!conf->mirrors[new_disk].rdev || > - !conf->mirrors[new_disk].rdev->in_sync) { > + !conf->mirrors[new_disk].rdev->in_sync || > + disk_tried_before(r1_bio, new_disk)) { Yes, I don't think you should muck with read_balance at all! It's an approach, but I don't think it's the right one. We just want to use the function to change target disks with here. Why not reinstate the old "map" function which can do just that? Then no change to read_balancce is needed. Rename map() to remap() if it sounds better! 
> new_disk++; > if (new_disk == conf->raid_disks) { > new_disk = -1; > @@ -364,7 +367,8 @@ static int read_balance(conf_t *conf, r1 > > /* make sure the disk is operational */ > while (!conf->mirrors[new_disk].rdev || > - !conf->mirrors[new_disk].rdev->in_sync) { > + !conf->mirrors[new_disk].rdev->in_sync || > + disk_tried_before(r1_bio, new_disk)) { Are you still in read_balance? I guess so. So you get read balance to skip disks we've already tried. OK - it's an approach. > if (new_disk <= 0) > new_disk = conf->raid_disks; > new_disk--; > @@ -394,7 +398,8 @@ static int read_balance(conf_t *conf, r1 > disk--; > > if (!conf->mirrors[disk].rdev || > - !conf->mirrors[disk].rdev->in_sync) > + !conf->mirrors[disk].rdev->in_sync || > + disk_tried_before(r1_bio, new_disk)) > continue; Same. > > if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) { > @@ -415,6 +420,7 @@ rb_out: > conf->next_seq_sect = this_sector + sectors; > conf->last_used = new_disk; > atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending); > + mark_disk_tried(r1_bio, new_disk); :-). We have gone aways down. Not sure where we are now. > } > rcu_read_unlock(); > > @@ -545,6 +551,7 @@ static int make_request(request_queue_t > r1_bio->sector = bio->bi_sector; > > r1_bio->state = 0; > + r1_bio->tried_disks = 0; Hmm. Ditto. I suppose we are making the read request here. Yes we are. I had set the "remaining" count instead. @@ -570,6 +754,19 @@ read_bio->bi_end_io = raid1_end_read_request; read_bio->bi_rw = READ; read_bio->bi_private = r1_bio; +#ifdef CONFIG_MD_RAID1_ROBUST_READ + atomic_set(&r1_bio->remaining, 0); + /* count source devices under spinlock */ + spin_lock_irq(&conf->device_lock); + disks = conf->raid_disks; + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].rdev && + !conf->mirrors[i].rdev->faulty) { + atomic_inc(&r1_bio->remaining); + } + } + spin_unlock_irq(&conf->device_lock); +#endif /* CONFIG_MD_RAID1_ROBUST_READ */ generic_make_request(read_bio); return 0; @@ -589,13 +786,53 @@ but that may need some change - there may even be a count of valid disks somewhere. > > if (bio_data_dir(bio) == READ) { > /* What do you think of my comments above? ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: "Robust Read" 2005-03-19 17:03 ` "Robust Read" Peter T. Breuer @ 2005-03-19 20:20 ` Michael Tokarev 2005-03-19 20:56 ` Peter T. Breuer 0 siblings, 1 reply; 71+ messages in thread From: Michael Tokarev @ 2005-03-19 20:20 UTC (permalink / raw) To: linux-raid Peter T. Breuer wrote: > Michael Tokarev <mjt@tls.msk.ru> wrote: > >>[-- text/plain, encoding 7bit, charset: KOI8-R, 74 lines --] Uh-oh. Setting proper charset this time. What a nonsense: 7bit but koi8... ;) [] > Uh OK. As I recall one only needs to count, one doesn't need a bitwise > map of what one has dealt with. Well. I see read_balance() is now used to resubmit reads. There's a reason to use it instead of choosing "next" disk, I think. There's a reason to NOT use it too, as the "bitmap" will not be needed in that case, only the counter. Both will do the task ofcourse, and the case wich requires only counter is preferred because it does not restrict number of disks in the array the way "bitmap" does. Ok, so be it the "map" variant when.. and it's even simpler that way. [big snip] > and then in the raid1d function: BTW, what's the reason to use raid1d for retries in the first place? Why not just resubmit the request in reschedule_retry() directly, the same way it is done in raid1d? [] > Yes, I don't think you should muck with read_balance at all! It's an > approach, but I don't think it's the right one. We just want to use the > function to change target disks with here. Why not reinstate the old > "map" function which can do just that? Then no change to read_balancce > is needed. Rename map() to remap() if it sounds better! The key point is that there should be not many read errors. Given said that, I agree, simpler routine for retries is ok. [] >>@@ -545,6 +551,7 @@ static int make_request(request_queue_t >> r1_bio->sector = bio->bi_sector; >> >> r1_bio->state = 0; >>+ r1_bio->tried_disks = 0; > > Hmm. Ditto. I suppose we are making the read request here. Yes we are. > I had set the "remaining" count instead. BTW there's no reason to zero any fields in r1_bio because of the way it is allocated (with memset(0) all of it). [] > What do you think of my comments above? Ok. But.. It's very difficult to NOT go into numerous-cleanups mode now. I want to clean up the code, to make it better first!.. ;) And I pretty much sure it will conflict with ongoing work Neil is doing... ;) /mjt ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: "Robust Read" 2005-03-19 20:20 ` Michael Tokarev @ 2005-03-19 20:56 ` Peter T. Breuer 2005-03-19 22:05 ` Michael Tokarev 0 siblings, 1 reply; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 20:56 UTC (permalink / raw) To: linux-raid Michael Tokarev <mjt@tls.msk.ru> wrote: > > Uh OK. As I recall one only needs to count, one doesn't need a bitwise > > map of what one has dealt with. > > Well. I see read_balance() is now used to resubmit reads. There's > a reason to use it instead of choosing "next" disk, I think. I can't think of one myself! We know what disk we want - the next available one that's not the present one. We don't need to calculate which is needed according to head positions. But your approach is fine - it's just that (a) I don't like to mess with the struct sizes, as that makes modules binary incompatible, instead of just having extra functionalities, and (b) I really think it's not going to be easily maintainable to subvert read_balance, and (c), it fragments the patch more, and (d), I really think you only need to count and pray, not keep an exact map. Maybe if you were to use the existing "remaining" field for the birmap, you would get rid of objection (a)! :-). You could even rename it via a #define :-). > There's > a reason to NOT use it too, as the "bitmap" will not be needed in > that case, only the counter. Both will do the task ofcourse, and > the case wich requires only counter is preferred because it does > not restrict number of disks in the array the way "bitmap" does. > > Ok, so be it the "map" variant when.. and it's even simpler that way. Well, either way is fine! Don't let me restrict you! Use your own best judgment. > > and then in the raid1d function: > > BTW, what's the reason to use raid1d for retries in the first place? > Why not just resubmit the request in reschedule_retry() directly, > the same way it is done in raid1d? No reason that I recall. Isn't raid1d going to do it offline? You want to do it sync instead? Anyway, the rerite code is completely uninvestigated, so please try anything you like! > > Yes, I don't think you should muck with read_balance at all! It's an > > approach, but I don't think it's the right one. We just want to use the > > function to change target disks with here. Why not reinstate the old > > "map" function which can do just that? Then no change to read_balancce > > is needed. Rename map() to remap() if it sounds better! > > The key point is that there should be not many read errors. Given said > that, I agree, simpler routine for retries is ok. > > [] > >>@@ -545,6 +551,7 @@ static int make_request(request_queue_t > >> r1_bio->sector = bio->bi_sector; > >> > >> r1_bio->state = 0; > >>+ r1_bio->tried_disks = 0; > > > > Hmm. Ditto. I suppose we are making the read request here. Yes we are. > > I had set the "remaining" count instead. > > BTW there's no reason to zero any fields in r1_bio because of the > way it is allocated (with memset(0) all of it). Somebody should mention it to Neil :-). > > What do you think of my comments above? > > Ok. > > But.. It's very difficult to NOT go into numerous-cleanups mode now. Don't. Just the patch first. The aim of a patch is to make a minimal change. > I want to clean up the code, to make it better first!.. ;) And I > pretty much sure it will conflict with ongoing work Neil is doing... ;) Exactly. Aim for the minimal patch. Peter ^ permalink raw reply [flat|nested] 71+ messages in thread
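In isolation, the counter-only scheme Peter argues for is: when the read is
first issued, remember how many in-sync mirrors there are, step to the next
disk on every failure, and only report an error once that many attempts have
been used up. A userspace sketch under those assumptions; the helper names
are invented and are not the raid1.c functions.

#include <stdbool.h>
#include <stdio.h>

#define NDISKS 3

/* Simulated per-mirror read: mirrors 0 and 1 return bad data, mirror 2 works. */
static bool disk_read(int disk)
{
	return disk == 2;
}

/* Retry a failed read on successive mirrors.  'remaining' is the number of
 * in-sync mirrors counted when the request was submitted; when it runs out
 * we report a real I/O error, which is exactly the "count and pray" idea. */
static int robust_read(int read_disk, int remaining)
{
	int disk = read_disk;

	while (remaining-- > 0) {
		if (disk_read(disk))
			return disk;		/* success: this mirror served the data */
		disk = (disk + 1) % NDISKS;	/* map() analogue: just take the next disk */
	}
	return -1;				/* every mirror tried and failed */
}

int main(void)
{
	int served = robust_read(0, NDISKS);

	if (served < 0)
		printf("unrecoverable read error\n");
	else
		printf("read satisfied by mirror %d\n", served);
	return 0;
}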
* Re: "Robust Read" 2005-03-19 20:56 ` Peter T. Breuer @ 2005-03-19 22:05 ` Michael Tokarev 2005-03-19 22:30 ` Peter T. Breuer 0 siblings, 1 reply; 71+ messages in thread From: Michael Tokarev @ 2005-03-19 22:05 UTC (permalink / raw) To: linux-raid [-- Attachment #1: Type: text/plain, Size: 1801 bytes --] Peter T. Breuer wrote: > Michael Tokarev <mjt@tls.msk.ru> wrote: [] > But your approach is fine - it's just that (a) I don't like to mess > with the struct sizes, as that makes modules binary incompatible, > instead of just having extra functionalities, and (b) I really think > it's not going to be easily maintainable to subvert read_balance, and > (c), it fragments the patch more, and (d), I really think you only need > to count and pray, not keep an exact map. The point a) is moot, because this whole structure is used in raid1.c ONLY. (I don't know why it is placed into raid1.h header file instead of into raid1.c directly, but that's a different topic). > Maybe if you were to use the existing "remaining" field for the birmap, > you would get rid of objection (a)! :-). You could even rename it via a > #define :-). atomic_t has its own API, it can't be used for &= and |= operations for example. >>BTW, what's the reason to use raid1d for retries in the first place? >>Why not just resubmit the request in reschedule_retry() directly, >>the same way it is done in raid1d? > > No reason that I recall. Isn't raid1d going to do it offline? You want > to do it sync instead? Anyway, the rerite code is completely > uninvestigated, so please try anything you like! I asked about retrying failed read, not about re-writes. [] >>>>+ r1_bio->tried_disks = 0; >>> >>>Hmm. Ditto. I suppose we are making the read request here. Yes we are. >>>I had set the "remaining" count instead. Note in your version you have a chance to request a read from first disk again, in case you have some failed/missing disks. So i think that instead of usage of "remaining" field, it's better to remember the disk from which we did the first read, and loop over all disks up to the first one. /mjt [-- Attachment #2: raid1_robust_read-2.6.11.diff --] [-- Type: text/plain, Size: 2809 bytes --] --- ./include/linux/raid/raid1.h.orig Wed Mar 2 10:38:10 2005 +++ ./include/linux/raid/raid1.h Sun Mar 20 00:50:59 2005 @@ -83,6 +83,7 @@ struct r1bio_s { * if the IO is in READ direction, then this is where we read */ int read_disk; + unsigned long first_disk; /* disk we started read from */ struct list_head retry_list; /* --- ./drivers/md/raid1.c.orig Wed Mar 2 10:38:10 2005 +++ ./drivers/md/raid1.c Sun Mar 20 00:51:57 2005 @@ -231,12 +231,13 @@ static int raid1_end_read_request(struct return 1; mirror = r1_bio->read_disk; + + update_head_pos(mirror, r1_bio); + /* - * this branch is our 'one mirror IO has finished' event handler: + * we have only one bio on the read side */ - if (!uptodate) - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - else + if (uptodate) { /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -247,14 +248,8 @@ static int raid1_end_read_request(struct * wait for the 'master' bio. */ set_bit(R1BIO_Uptodate, &r1_bio->state); - - update_head_pos(mirror, r1_bio); - - /* - * we have only one bio on the read side - */ - if (uptodate) raid_end_bio_io(r1_bio); + } else { /* * oops, read error: @@ -421,6 +416,42 @@ rb_out: return new_disk; } +/* + * This routine returns the disk from which the requested read should + * be done in case previous read failed. 
We don't try to optimize + * read positions here as in read_balance(). + * + * The rdev for the device selected will have nr_pending incremented. + */ +static int read_remap(conf_t *conf, r1bio_t *r1_bio) +{ + int disk; + int first_disk; + + rcu_read_lock(); + + /* Choose the next operation device */ + first_disk = r1_bio->first_disk; + disk = r1_bio->read_disk; + + do { + ++disk; + if (disk >= conf->raid_disks) + disk = 0; + if (disk == first_disk) { + rcu_read_unlock(); + return -1; + } + } while (!conf->mirrors[disk].rdev || + !conf->mirrors[disk].rdev->in_sync); + + /* don't bother updating last position */ + atomic_inc(&conf->mirrors[disk].rdev->nr_pending); + rcu_read_unlock(); + + return disk; +} + static void unplug_slaves(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); @@ -560,6 +591,7 @@ static int make_request(request_queue_t mirror = conf->mirrors + rdisk; r1_bio->read_disk = rdisk; + r1_bio->first_disk = rdisk; read_bio = bio_clone(bio, GFP_NOIO); @@ -934,7 +966,7 @@ static void raid1d(mddev_t *mddev) } else { int disk; bio = r1_bio->bios[r1_bio->read_disk]; - if ((disk=read_balance(conf, r1_bio)) == -1) { + if ((disk=read_remap(conf, r1_bio)) < 0) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" " read error for block %llu\n", bdevname(bio->bi_bdev,b), ^ permalink raw reply [flat|nested] 71+ messages in thread
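The loop in Michael's read_remap() above differs from the counter scheme only
in its termination test: it starts at the mirror after read_disk, wraps
around, and gives up when it comes back to first_disk, the mirror the
original read was sent to. The same iteration as a self-contained userspace
toy, for comparison only:

#include <stdbool.h>
#include <stdio.h>

#define NDISKS 4

static bool in_sync[NDISKS] = { true, false, true, true }; /* mirror 1 is out of the array */

/* Toy analogue of read_remap(): pick the next in-sync mirror after
 * 'read_disk', wrapping around, and give up once we are back at
 * 'first_disk', the mirror the original read went to. */
static int read_remap(int first_disk, int read_disk)
{
	int disk = read_disk;

	do {
		disk = (disk + 1) % NDISKS;
		if (disk == first_disk)
			return -1;		/* every other mirror has been tried */
	} while (!in_sync[disk]);

	return disk;
}

int main(void)
{
	int first = 0, disk = 0;

	/* Walk the retry order a failed read on mirror 0 would follow. */
	while ((disk = read_remap(first, disk)) >= 0)
		printf("retry read on mirror %d\n", disk);
	printf("no mirrors left to try\n");
	return 0;
}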
* Re: "Robust Read" 2005-03-19 22:05 ` Michael Tokarev @ 2005-03-19 22:30 ` Peter T. Breuer 0 siblings, 0 replies; 71+ messages in thread From: Peter T. Breuer @ 2005-03-19 22:30 UTC (permalink / raw) To: linux-raid Michael Tokarev <mjt@tls.msk.ru> wrote: > The point a) is moot, because this whole structure is used in raid1.c ONLY. > (I don't know why it is placed into raid1.h header file instead of into > raid1.c directly, but that's a different topic). Hmm. I'm a little surprised. I would be worried that it had escaped somewhere. > > Maybe if you were to use the existing "remaining" field for the birmap, > > you would get rid of objection (a)! :-). You could even rename it via a > > #define :-). > > atomic_t has its own API, it can't be used for &= and |= operations for > example. Ah - was the remaining field atomic_t, then? But one can use it - at least in i386. I recall there are bit ops. Yes. Atomic.h /* These are x86-specific, used by some header files */ #define atomic_clear_mask(mask, addr) \ __asm__ __volatile__(LOCK ändl %0,%1" \ : : "r" (~(mask)),"m" (*addr) : "memory") #define atomic_set_mask(mask, addr) \ __asm__ __volatile__(LOCK örl %0,%1" \ : : "r" (mask),"m" (*(addr)) : "memory") I also seem to recall "writing" equivalents for sparc - where I think they were just the ordinary ops. But take no notice. > >>BTW, what's the reason to use raid1d for retries in the first place? > >>Why not just resubmit the request in reschedule_retry() directly, > >>the same way it is done in raid1d? > > > > No reason that I recall. Isn't raid1d going to do it offline? You want > > to do it sync instead? Anyway, the rerite code is completely > > uninvestigated, so please try anything you like! > > I asked about retrying failed read, not about re-writes. > Sorry - I wasn't following too closely because of a conversation with Lars, and I wan't looking at my original patch, so .... but as I recall, the failed read is retried with reschedule_retry(), since that's what happens when the flow falls out the bottom of the patch int the remainder of the raid1_end_read_request() code, after an not-uptdate request is seen. /* * we have only one bio on the read side */ if (uptodate #ifdef CONFIG_MD_RAID1_ROBUST_READ /* Give up and error if we're last */ || atomic_dec_and_test(&r1_bio->remaining) #endif /* CONFIG_MD_RAID1_ROBUST_READ */ ) #ifdef DO_ADD_READ_WRITE_CORRECT if (uptodate && test_bit(R1BIO_ReadRetry, &r1_bio->state)) { /* Success at last - rewrite failed reads */ set_bit(R1BIO_IsSync, &r1_bio->state); reschedule_retry(r1_bio); } else #endif /* DO_ADD_READ_WRITE_CORRECT */ raid_end_bio_io(r1_bio); else { /* * oops, read error: */ char b[BDEVNAME_SIZE]; if (printk_ratelimit()) printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", bdevname(conf->mirrors[mirror].rdev->bdev,b), (un signed long long)r1_bio->sector); reschedule_retry(r1_bio); ^^^^^^^^^^^^^^^^^^^^^^^^^^ here } Or do you mean "skip the r_r call and do what it would do instead"? Well, that would broaden the patch cross-section here at least. What r_r does is just place things on the retry list, and shout: list_add(&r1_bio->retry_list, &retry_list_head); md_wakeup_thread(mddev->thread); The raid1d thread on the read side apears to redirect the bio, when it pics it up, and resubmit it: r1_bio->read_disk = disk; // we chose this with map() ... bio = bio_clone(r1_bio->master_bio, GFP_NOIO); ... generic_make_request(bio); I don't see why one shouldn't take advantage of all that code. It didn't need altering. 
I only had to add a little bit on the write side. > [] > >>>>+ r1_bio->tried_disks = 0; > >>> > >>>Hmm. Ditto. I suppose we are making the read request here. Yes we are. > >>>I had set the "remaining" count instead. > > Note in your version you have a chance to request a read from > first disk again, in case you have some failed/missing disks. I don't think so - unless you suspect a race condition between retrying and changing the config. One can forget that - the worst that can happen is that we try the same disk again and fail again, which will ultimately lead to an error return. But the admin deserves it for changing config while a request is busy failing and looking for new disks to try :-). I wouldn't go solving likely very rare problems at this stage until the problem turns out to have significance. Leave a comment in the code if you see the problem. > So i think that instead of usage of "remaining" field, it's better > to remember the disk from which we did the first read, and loop > over all disks up to the first one. That's a possibility too, but as I said, getting things wrong is not disastrous! We can't do worse that what happened without the patch. If you like, try "n+1" times instead of "n" times, and that should make real sure! Keep it simple. Peter - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 1/2] md bitmap bug fixes 2005-03-14 4:43 ` [PATCH 1/2] " Neil Brown 2005-03-14 9:44 ` Lars Marowsky-Bree @ 2005-03-15 4:24 ` Paul Clements 2005-03-17 20:51 ` [PATCH 0/3] md bitmap-based asynchronous writes Paul Clements 2 siblings, 0 replies; 71+ messages in thread From: Paul Clements @ 2005-03-15 4:24 UTC (permalink / raw) To: Neil Brown; +Cc: linux-raid Neil Brown wrote: > On Wednesday March 9, paul.clements@steeleye.com wrote: >>avoid setting of sb->events_lo = 1 when creating a 0.90 superblock -- it >>doesn't seem to be necessary and it was causing the event counters to >>start at 4 billion+ (events_lo is actually the high part of the events >>counter, on little endian machines anyway) > events_lo really should be the low part of the counter and it is for > me.... something funny must be happening for you... Yikes...compiling mdadm against the kernel headers. I was trying to simplify things and avoid the inevitable breakage that occurs when kernel and mdadm headers get out of sync, but alas, it's causing problems because of differences between kernel and userland header definitions...my mdadm was wrongly assuming big endian for the events counters. >>if'ed out super1 definition which is now in the kernel headers > I don't like this. I don't mdadm to include the kernel raid headers. > I want it to use it's own. Yes, I agree, see above... :/ >>included sys/time.h to avoid compile error > I wonder why I don't get an error.. What error do you get? The machine I happen to be compiling on has old gcc/libc (2.91) and it's not getting the definition for one of the time structures (I forget which...). Thanks, Paul ^ permalink raw reply [flat|nested] 71+ messages in thread
* [PATCH 0/3] md bitmap-based asynchronous writes
2005-03-14 4:43 ` [PATCH 1/2] " Neil Brown
2005-03-14 9:44 ` Lars Marowsky-Bree
2005-03-15 4:24 ` [PATCH 1/2] md bitmap bug fixes Paul Clements
@ 2005-03-17 20:51 ` Paul Clements
2005-03-17 20:53 ` [PATCH 1/3] md bitmap async write enabling Paul Clements
2005-03-21 4:21 ` [PATCH 0/3] md bitmap-based asynchronous writes Neil Brown
2 siblings, 2 replies; 71+ messages in thread
From: Paul Clements @ 2005-03-17 20:51 UTC (permalink / raw)
To: Neil Brown; +Cc: linux-raid
These three patches provide the ability to perform asynchronous writes
with raid1. The asynchronous write capability is primarily useful when
raid1 is employed in network replication (i.e., with one or more of the
disks located on a remote system). It allows us to acknowledge writes
before they reach the remote system, thus making much more efficient
use of the network.
The kernel patches build on the md bitmap code and the
117WriteMostly-update patch. The first patch provides the framework for
async writes. The second implements async writes for raid1. The third
patch (for mdadm) provides the write-mostly updates and the ability to
configure raid1 arrays for async writes.
Thanks,
Paul
^ permalink raw reply	[flat|nested] 71+ messages in thread
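The essence of the async write mode described here is a bounded count of
writes that have completed locally but not yet on the remote mirror: while
the count stays under the configured limit the array may acknowledge a write
early, and the bitmap keeps the affected chunks dirty until the remote
catches up. A conceptual userspace model follows; the real patches track
this with an atomic_t in struct bitmap against the async_writes superblock
field, and the helper names below are invented.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the async write accounting. */
struct async_state {
	unsigned int async_writes;	/* writes not yet on the remote mirror */
	unsigned int async_max_writes;	/* configured limit (0 = fully synchronous) */
};

/* Called when a write has completed locally but not yet remotely.
 * Returns true if the request may be completed to the caller now. */
static bool can_ack_early(struct async_state *s)
{
	if (s->async_max_writes == 0 || s->async_writes >= s->async_max_writes)
		return false;		/* over the limit: wait for the remote */
	s->async_writes++;		/* the bitmap keeps the chunk dirty meanwhile */
	return true;
}

/* Called when the remote mirror finally acknowledges the write. */
static void remote_write_done(struct async_state *s)
{
	if (s->async_writes > 0)
		s->async_writes--;
}

int main(void)
{
	struct async_state s = { .async_writes = 0, .async_max_writes = 2 };

	for (int i = 0; i < 4; i++)
		printf("write %d acked early: %s (outstanding %u/%u)\n", i,
		       can_ack_early(&s) ? "yes" : "no",
		       s.async_writes, s.async_max_writes);

	remote_write_done(&s);
	printf("after one remote completion: %u outstanding\n", s.async_writes);
	return 0;
}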
* [PATCH 1/3] md bitmap async write enabling 2005-03-17 20:51 ` [PATCH 0/3] md bitmap-based asynchronous writes Paul Clements @ 2005-03-17 20:53 ` Paul Clements 2005-03-17 20:55 ` [PATCH 2/3] md bitmap async writes for raid1 Paul Clements 2005-03-21 4:21 ` [PATCH 0/3] md bitmap-based asynchronous writes Neil Brown 1 sibling, 1 reply; 71+ messages in thread From: Paul Clements @ 2005-03-17 20:53 UTC (permalink / raw) To: Neil Brown; +Cc: linux-raid [-- Attachment #1: Type: text/plain, Size: 126 bytes --] This patch enables the async write capability in md. It requires the md bitmap patches and the 117-WriteMostly-update patch. [-- Attachment #2: md_async_4_0_enabling.diff --] [-- Type: text/plain, Size: 8432 bytes --] Signed-Off-By: Paul Clements <paul.clements@steeleye.com> drivers/md/bitmap.c | 26 ++++++++++++++++++++++---- drivers/md/md.c | 13 +++++++++++++ include/linux/raid/bitmap.h | 17 +++++++++++------ include/linux/raid/md_k.h | 3 +++ 4 files changed, 49 insertions(+), 10 deletions(-) diff -purN --exclude core --exclude-from /export/public/clemep/tmp/dontdiff linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/drivers/md/bitmap.c linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/drivers/md/bitmap.c --- linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/drivers/md/bitmap.c Thu Mar 10 10:05:35 2005 +++ linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/drivers/md/bitmap.c Thu Mar 10 10:13:15 2005 @@ -379,6 +379,7 @@ void bitmap_print_sb(struct bitmap *bitm printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long) le64_to_cpu(sb->sync_size) / 2); + printk(KERN_DEBUG " async writes: %d\n", le32_to_cpu(sb->async_writes)); kunmap(bitmap->sb_page); } @@ -387,7 +388,7 @@ static int bitmap_read_sb(struct bitmap { char *reason = NULL; bitmap_super_t *sb; - unsigned long chunksize, daemon_sleep; + unsigned long chunksize, daemon_sleep, async_writes; unsigned long bytes_read; unsigned long long events; int err = -EINVAL; @@ -411,6 +412,7 @@ static int bitmap_read_sb(struct bitmap chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep); + async_writes = le32_to_cpu(sb->async_writes); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -422,7 +424,9 @@ static int bitmap_read_sb(struct bitmap else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; else if (daemon_sleep < 1 || daemon_sleep > 15) - reason = "daemon sleep period out of range"; + reason = "daemon sleep period out of range (1-15s)"; + else if (async_writes > COUNTER_MAX) + reason = "async write limit of range (0 - 16383)"; if (reason) { printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", bmname(bitmap), reason); @@ -455,6 +459,7 @@ success: /* assign fields using values from superblock */ bitmap->chunksize = chunksize; bitmap->daemon_sleep = daemon_sleep; + bitmap->async_max_writes = async_writes; bitmap->flags |= sb->state; bitmap->events_cleared = le64_to_cpu(sb->events_cleared); err = 0; @@ -1196,9 +1201,16 @@ static bitmap_counter_t *bitmap_get_coun } } -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors) +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int async) { if (!bitmap) return 0; + + if (async) { + atomic_inc(&bitmap->async_writes); + PRINTK(KERN_DEBUG "inc async write count 
%d/%d\n", + atomic_read(&bitmap->async_writes), bitmap->async_max_writes); + } + while (sectors) { int blocks; bitmap_counter_t *bmc; @@ -1233,9 +1245,15 @@ int bitmap_startwrite(struct bitmap *bit } void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success) + int success, int async) { if (!bitmap) return; + if (async) { + atomic_dec(&bitmap->async_writes); + PRINTK(KERN_DEBUG "dec async write count %d/%d\n", + atomic_read(&bitmap->async_writes), bitmap->async_max_writes); + } + while (sectors) { int blocks; unsigned long flags; diff -purN --exclude core --exclude-from /export/public/clemep/tmp/dontdiff linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/drivers/md/md.c linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/drivers/md/md.c --- linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/drivers/md/md.c Mon Feb 21 14:22:21 2005 +++ linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/drivers/md/md.c Fri Mar 4 13:37:57 2005 @@ -2166,6 +2166,8 @@ static int get_disk_info(mddev_t * mddev info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_SYNC); } + if (test_bit(WriteMostly, &rdev->flags)) + info.state |= (1<<MD_DISK_WRITEONLY); } else { info.major = info.minor = 0; info.raid_disk = -1; @@ -2411,6 +2413,10 @@ static int hot_add_disk(mddev_t * mddev, goto abort_unbind_export; } + if (mddev->bitmap && mddev->bitmap->async_max_writes) + /* array is async, hotadd = write only */ + set_bit(WriteMostly, &rdev->flags); + rdev->raid_disk = -1; md_update_sb(mddev); @@ -3290,6 +3296,8 @@ static int md_seq_show(struct seq_file * char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); + if (test_bit(WriteMostly, &rdev->flags)) + seq_printf(seq, "(W)"); if (rdev->faulty) { seq_printf(seq, "(F)"); continue; @@ -3339,6 +3347,11 @@ static int md_seq_show(struct seq_file * seq_printf(seq, "\n"); spin_unlock_irqrestore(&bitmap->lock, flags); kfree(buf); + if (bitmap->async_max_writes) + seq_printf(seq, + " async: %d/%ld outstanding writes\n", + atomic_read(&bitmap->async_writes), + bitmap->async_max_writes); } seq_printf(seq, "\n"); diff -purN --exclude core --exclude-from /export/public/clemep/tmp/dontdiff linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/include/linux/raid/bitmap.h linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/include/linux/raid/bitmap.h --- linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/include/linux/raid/bitmap.h Fri Feb 18 14:45:25 2005 +++ linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/include/linux/raid/bitmap.h Thu Mar 10 11:39:37 2005 @@ -6,8 +6,8 @@ #ifndef BITMAP_H #define BITMAP_H 1 -#define BITMAP_MAJOR 3 -#define BITMAP_MINOR 38 +#define BITMAP_MAJOR 4 +#define BITMAP_MINOR 0 /* * in-memory bitmap: @@ -147,8 +147,9 @@ typedef struct bitmap_super_s { __u32 state; /* 48 bitmap state information */ __u32 chunksize; /* 52 the bitmap chunk size in bytes */ __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 async_writes; /* 60 number of outstanding async writes */ - __u8 pad[256 - 60]; /* set to zero */ + __u8 pad[256 - 64]; /* set to zero */ } bitmap_super_t; /* notes: @@ -225,6 +226,9 @@ struct bitmap { unsigned long flags; + unsigned long async_max_writes; /* asynchronous write mode */ + atomic_t async_writes; + /* * the bitmap daemon - periodically wakes up and sweeps the bitmap * file, cleaning up bits and flushing 
out pages to disk as necessary @@ -266,9 +270,10 @@ int bitmap_update_sb(struct bitmap *bitm int bitmap_setallbits(struct bitmap *bitmap); /* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success); +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int async); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int async); int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); diff -purN --exclude core --exclude-from /export/public/clemep/tmp/dontdiff linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/include/linux/raid/md_k.h linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/include/linux/raid/md_k.h --- linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/include/linux/raid/md_k.h Fri Feb 18 14:45:47 2005 +++ linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/include/linux/raid/md_k.h Wed Feb 23 15:47:30 2005 @@ -274,6 +274,9 @@ struct mddev_s atomic_t writes_pending; request_queue_t *queue; /* for plugging ... */ + atomic_t async_writes; /* outstanding async IO */ + unsigned int async_max_writes; /* 0 = sync */ + struct bitmap *bitmap; /* the bitmap for the device */ struct file *bitmap_file; /* the bitmap file */ ^ permalink raw reply [flat|nested] 71+ messages in thread
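The core of the enabling patch above is a small accounting scheme: bitmap_startwrite() bumps an atomic counter when a write is taken asynchronously, bitmap_endwrite() drops it when the last mirror finishes, and a new write is only allowed to go async while the counter is below the async_max_writes limit read from the bitmap superblock. The following user-space sketch shows just that pattern; it is illustrative only, the struct and helper names are invented stand-ins, and it is not the kernel code.

#include <stdatomic.h>

/* Illustrative stand-in for the two fields the patch adds to struct
 * bitmap: a configured limit and a live count of early-acknowledged
 * writes that have not yet reached every disk. */
struct async_accounting {
        long async_max_writes;        /* 0 means fully synchronous */
        atomic_long async_writes;     /* outstanding async writes  */
};

/* Would this write be allowed to complete asynchronously? */
static int may_go_async(struct async_accounting *a)
{
        return a->async_max_writes &&
               atomic_load(&a->async_writes) < a->async_max_writes;
}

/* Mirrors the intent of bitmap_startwrite(..., async). */
static void start_write(struct async_accounting *a, int async)
{
        if (async)
                atomic_fetch_add(&a->async_writes, 1);
}

/* Mirrors the intent of bitmap_endwrite(..., async). */
static void end_write(struct async_accounting *a, int async)
{
        if (async)
                atomic_fetch_sub(&a->async_writes, 1);
}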
* [PATCH 2/3] md bitmap async writes for raid1 2005-03-17 20:53 ` [PATCH 1/3] md bitmap async write enabling Paul Clements @ 2005-03-17 20:55 ` Paul Clements 2005-03-17 20:56 ` [PATCH 3/3] mdadm: bitmap async writes Paul Clements 0 siblings, 1 reply; 71+ messages in thread From: Paul Clements @ 2005-03-17 20:55 UTC (permalink / raw) To: Neil Brown; +Cc: linux-raid [-- Attachment #1: Type: text/plain, Size: 45 bytes --] This patch implements async writes in raid1. [-- Attachment #2: md_async_4_0_raid1.diff --] [-- Type: text/plain, Size: 7222 bytes --] Signed-Off-By: Paul Clements <paul.clements@steeleye.com> drivers/md/raid1.c | 107 ++++++++++++++++++++++++++++++++++++++++++--- include/linux/raid/raid1.h | 2 2 files changed, 102 insertions(+), 7 deletions(-) diff -purN --exclude core --exclude-from /export/public/clemep/tmp/dontdiff linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/drivers/md/raid1.c linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/drivers/md/raid1.c --- linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/drivers/md/raid1.c Thu Mar 10 10:05:35 2005 +++ linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/drivers/md/raid1.c Sat Mar 12 08:38:20 2005 @@ -35,7 +35,7 @@ #include <linux/raid/raid1.h> #include <linux/raid/bitmap.h> -#define DEBUG 0 +#define DEBUG 0 #if DEBUG #define PRINTK(x...) printk(x) #else @@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_ { struct bio *bio = r1_bio->master_bio; - bio_endio(bio, bio->bi_size, - test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + /* if nobody has done the final endio yet, do it now */ + if (!test_and_set_bit(R1BIO_AsyncPhase, &r1_bio->state)) { + PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", + (bio_data_dir(bio) == WRITE) ? "write" : "read", + (unsigned long long) bio->bi_sector, + (unsigned long long) bio->bi_sector + + (bio->bi_size >> 9) - 1); + + bio_endio(bio, bio->bi_size, + test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + } free_r1bio(r1_bio); } @@ -292,7 +301,7 @@ static int raid1_end_write_request(struc { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); - int mirror; + int mirror, async, wonly = 1; /* assume write only if rdev missing */ conf_t *conf = mddev_to_conf(r1_bio->mddev); if (bio->bi_size) @@ -323,16 +332,39 @@ static int raid1_end_write_request(struc update_head_pos(mirror, r1_bio); + async = test_bit(R1BIO_AsyncIO, &r1_bio->state); + if (conf->mirrors[mirror].rdev) + wonly = test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags); + /* In async mode, we ACK the master bio once the I/O has safely + * reached the non-writeonly disk. Setting the AsyncPhase bit + * ensures that this gets done only once -- we don't ever want to + * return -EIO here, instead we'll wait */ + if (async && !wonly && test_bit(R1BIO_Uptodate, &r1_bio->state) && + !test_and_set_bit(R1BIO_AsyncPhase, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: async end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, mbio->bi_size, 0); + } /* * * Let's see if all mirrored write operations have finished * already. 
*/ if (atomic_dec_and_test(&r1_bio->remaining)) { + if (async) { + int i = bio->bi_vcnt; + /* free extra copy of the data pages */ + while (i--) + __free_page(bio->bi_io_vec[i].bv_page); + } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state)); + !test_bit(R1BIO_Degraded, &r1_bio->state), + async); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -553,6 +585,38 @@ static void device_barrier(conf_t *conf, spin_unlock_irq(&conf->resync_lock); } +/* duplicate the data pages for async I/O */ +static struct page **alloc_async_pages(struct bio *bio) +{ + int i; + struct bio_vec *bvec; + struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), + GFP_NOIO); + if (unlikely(!pages)) + goto do_sync_io; + + BUG_ON(bio->bi_idx != 0); + bio_for_each_segment(bvec, bio, i) { + pages[i] = alloc_page(GFP_NOIO); + if (unlikely(!pages[i])) + goto do_sync_io; + memcpy(kmap(pages[i]) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(pages[i]); + kunmap(bvec->bv_page); + } + + return pages; + +do_sync_io: + if (pages) + for (i = 0; i < bio->bi_vcnt && pages[i]; i++) + __free_page(pages[i]); + kfree(pages); + PRINTK("%dB async alloc failed, doing sync I/O\n", bio->bi_size); + return NULL; +} + static int make_request(request_queue_t *q, struct bio * bio) { mddev_t *mddev = q->queuedata; @@ -565,6 +629,7 @@ static int make_request(request_queue_t struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct bio_list bl; + struct page **async_pages = NULL; /* @@ -668,6 +733,12 @@ static int make_request(request_queue_t set_bit(R1BIO_Degraded, &r1_bio->state); } + /* do async I/O ? */ + if (bitmap && + atomic_read(&bitmap->async_writes) < bitmap->async_max_writes && + (async_pages = alloc_async_pages(bio))) + set_bit(R1BIO_AsyncIO, &r1_bio->state); + atomic_set(&r1_bio->remaining, 0); bio_list_init(&bl); @@ -685,19 +756,30 @@ static int make_request(request_queue_t mbio->bi_rw = WRITE; mbio->bi_private = r1_bio; + if (test_bit(R1BIO_AsyncIO, &r1_bio->state)) { + struct bio_vec *bvec; + int j; + + BUG_ON(!async_pages); + bio_for_each_segment(bvec, mbio, j) + bvec->bv_page = async_pages[j]; + } + atomic_inc(&r1_bio->remaining); bio_list_add(&bl, mbio); } + kfree(async_pages); /* the async pages are attached to the bios now */ - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, + test_bit(R1BIO_AsyncIO, &r1_bio->state)); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); bio_list_init(&bl); blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); - + #if 0 while ((bio = bio_list_pop(&bl)) != NULL) generic_make_request(bio); @@ -1458,6 +1540,17 @@ out: static int stop(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); + struct bitmap *bitmap = mddev->bitmap; + int async_wait = 0; + + /* wait for async writes to complete */ + while (bitmap && atomic_read(&bitmap->async_writes) > 0) { + async_wait++; + printk(KERN_INFO "raid1: async writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), async_wait); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); /* wait a second */ + /* need to kick something here to make sure I/O goes? 
*/ + } md_unregister_thread(mddev->thread); mddev->thread = NULL; diff -purN --exclude core --exclude-from /export/public/clemep/tmp/dontdiff linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/include/linux/raid/raid1.h linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/include/linux/raid/raid1.h --- linux-2.6.11-rc3-mm2-patch-all-write-mostly-max-dev-bug-bitmap-bug-fix/include/linux/raid/raid1.h Fri Feb 18 14:45:25 2005 +++ linux-2.6.11-rc3-mm2-patch-all-write-mostly-async-write-bitmap-bug-fix/include/linux/raid/raid1.h Mon Feb 21 16:48:38 2005 @@ -107,4 +107,6 @@ struct r1bio_s { #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 +#define R1BIO_AsyncPhase 3 +#define R1BIO_AsyncIO 4 #endif ^ permalink raw reply [flat|nested] 71+ messages in thread
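One consequence of acknowledging an async write early is that the caller is free to reuse its buffers immediately, so the raid1 patch duplicates the data pages (alloc_async_pages) and points the per-mirror bios at the copies, quietly falling back to synchronous I/O if the allocation fails. The stand-alone C sketch below shows only that copy-or-fall-back pattern; it is illustrative, uses plain malloc rather than page allocation, and is not the driver code.

#include <stdlib.h>
#include <string.h>

/* Illustrative only: copy an array of data buffers before a write is
 * acknowledged early.  Returns a freshly allocated copy, or NULL to
 * signal "fall back to synchronous I/O". */
static void **copy_payload(void *const *bufs, const size_t *lens, int n)
{
        void **copy = calloc(n, sizeof(*copy));
        int i;

        if (!copy)
                return NULL;
        for (i = 0; i < n; i++) {
                copy[i] = malloc(lens[i]);
                if (!copy[i])
                        goto fail;
                memcpy(copy[i], bufs[i], lens[i]);
        }
        return copy;

fail:
        while (i--)
                free(copy[i]);
        free(copy);
        return NULL;   /* caller performs the write synchronously instead */
}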
* [PATCH 3/3] mdadm: bitmap async writes 2005-03-17 20:55 ` [PATCH 2/3] md bitmap async writes for raid1 Paul Clements @ 2005-03-17 20:56 ` Paul Clements 0 siblings, 0 replies; 71+ messages in thread From: Paul Clements @ 2005-03-17 20:56 UTC (permalink / raw) To: Neil Brown; +Cc: linux-raid [-- Attachment #1: Type: text/plain, Size: 84 bytes --] This patch provides the write-mostly updates and async write capability for mdadm. [-- Attachment #2: mdadm_2_0_devel_1_async.diff --] [-- Type: text/plain, Size: 17358 bytes --] Signed-Off-By: Paul Clements <paul.clements@steeleye.com> Build.c | 4 ++-- Create.c | 13 +++++++++---- Detail.c | 3 +++ ReadMe.c | 2 ++ bitmap.c | 8 ++++++++ bitmap.h | 14 +++++++++++--- md_p.h | 5 +++++ mdadm.8 | 7 +++++++ mdadm.c | 31 ++++++++++++++++++++++++++++--- mdadm.h | 6 ++++-- super0.c | 8 +++++++- super1.c | 4 +++- 12 files changed, 89 insertions(+), 16 deletions(-) diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/Build.c mdadm-2.0-devel-1-async-writes/Build.c --- mdadm-2.0-devel-1-bitmap-bug-fix/Build.c Sun Feb 13 22:00:00 2005 +++ mdadm-2.0-devel-1-async-writes/Build.c Wed Mar 2 14:02:34 2005 @@ -36,7 +36,7 @@ int Build(char *mddev, int mdfd, int chunk, int level, int layout, int raiddisks, mddev_dev_t devlist, int assume_clean, - char *bitmap_file, int bitmap_chunk, int delay) + char *bitmap_file, int bitmap_chunk, int async_writes, int delay) { /* Build a linear or raid0 arrays without superblocks * We cannot really do any checks, we just do it. @@ -185,7 +185,7 @@ int Build(char *mddev, int mdfd, int chu return 1; } if (CreateBitmap(bitmap_file, 1, NULL, bitmap_chunk, - delay, 0/* FIXME size */)) { + delay, async_writes, 0/* FIXME size */)) { return 1; } bitmap_fd = open(bitmap_file, O_RDWR); diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/Create.c mdadm-2.0-devel-1-async-writes/Create.c --- mdadm-2.0-devel-1-bitmap-bug-fix/Create.c Sun Feb 13 22:00:35 2005 +++ mdadm-2.0-devel-1-async-writes/Create.c Wed Mar 2 14:01:43 2005 @@ -35,7 +35,7 @@ int Create(struct supertype *st, char *m int chunk, int level, int layout, unsigned long size, int raiddisks, int sparedisks, int subdevs, mddev_dev_t devlist, int runstop, int verbose, int force, - char *bitmap_file, int bitmap_chunk, int delay) + char *bitmap_file, int bitmap_chunk, int async_writes, int delay) { /* * Create a new raid array. 
@@ -363,7 +363,8 @@ int Create(struct supertype *st, char *m if (bitmap_file) { int uuid[4]; st->ss->uuid_from_super(uuid, super); - if (CreateBitmap(bitmap_file, force, (char*)uuid, bitmap_chunk, delay, + if (CreateBitmap(bitmap_file, force, (char*)uuid, bitmap_chunk, + delay, async_writes, array.size*2ULL /* FIXME wrong for raid10 */)) { return 1; } @@ -397,14 +398,18 @@ int Create(struct supertype *st, char *m } disk.raid_disk = disk.number; if (disk.raid_disk < raiddisks) - disk.state = 6; /* active and in sync */ + disk.state = (1<<MD_DISK_ACTIVE) | + (1<<MD_DISK_SYNC); else disk.state = 0; + if (dnum && async_writes) + disk.state |= (1<<MD_DISK_WRITEONLY); + if (dnum == insert_point || strcasecmp(dv->devname, "missing")==0) { disk.major = 0; disk.minor = 0; - disk.state = 1; /* faulty */ + disk.state = (1<<MD_DISK_FAULTY); } else { fd = open(dv->devname, O_RDONLY, 0); if (fd < 0) { diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/Detail.c mdadm-2.0-devel-1-async-writes/Detail.c --- mdadm-2.0-devel-1-bitmap-bug-fix/Detail.c Sun Feb 13 21:59:45 2005 +++ mdadm-2.0-devel-1-async-writes/Detail.c Wed Mar 2 14:17:35 2005 @@ -213,6 +213,8 @@ int Detail(char *dev, int brief, int tes for (d= 0; d<MD_SB_DISKS; d++) { mdu_disk_info_t disk; char *dv; + int wonly = disk.state & (1<<MD_DISK_WRITEONLY); + disk.state &= ~(1<<MD_DISK_WRITEONLY); disk.number = d; if (ioctl(fd, GET_DISK_INFO, &disk) < 0) { if (d < array.raid_disks) @@ -241,6 +243,7 @@ int Detail(char *dev, int brief, int tes if (disk.state & (1<<MD_DISK_ACTIVE)) printf(" active"); if (disk.state & (1<<MD_DISK_SYNC)) printf(" sync"); if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed"); + if (wonly) printf(" writeonly"); if (disk.state == 0) printf(" spare"); if (disk.state == 0) { if (is_26) { diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/ReadMe.c mdadm-2.0-devel-1-async-writes/ReadMe.c --- mdadm-2.0-devel-1-bitmap-bug-fix/ReadMe.c Thu Feb 17 19:17:13 2005 +++ mdadm-2.0-devel-1-async-writes/ReadMe.c Fri Mar 4 13:33:36 2005 @@ -131,6 +131,7 @@ struct option long_options[] = { {"metadata", 1, 0, 'e'}, /* superblock format */ {"bitmap", 1, 0, 'b'}, {"bitmap-chunk", 1, 0, 4}, + {"async", 2, 0, 5}, /* For assemble */ {"uuid", 1, 0, 'u'}, @@ -232,6 +233,7 @@ char OptionHelp[] = " --assume-clean : Assume the array is already in-sync. 
This is dangerous.\n" " --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n" " --delay= -d : seconds between bitmap updates\n" +" --async= : number of simultaneous asynchronous writes to allow (requires bitmap)\n" "\n" " For assemble:\n" " --bitmap= -b : File to find bitmap information in\n" diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/bitmap.c mdadm-2.0-devel-1-async-writes/bitmap.c --- mdadm-2.0-devel-1-bitmap-bug-fix/bitmap.c Mon Mar 7 12:15:38 2005 +++ mdadm-2.0-devel-1-async-writes/bitmap.c Thu Mar 17 14:46:09 2005 @@ -204,6 +204,7 @@ int ExamineBitmap(char *filename, int br bitmap_super_t *sb; bitmap_info_t *info; int rv = 1; + char buf[64]; info = bitmap_file_read(filename, brief); if (!info) @@ -235,6 +236,11 @@ int ExamineBitmap(char *filename, int br printf(" State : %s\n", bitmap_state(sb->state)); printf(" Chunksize : %s\n", human_chunksize(sb->chunksize)); printf(" Daemon : %ds flush period\n", sb->daemon_sleep); + if (sb->async_writes) + sprintf(buf, "Asynchronous (%d)", sb->async_writes); + else + sprintf(buf, "Synchronous"); + printf(" Write Mode : %s\n", buf); printf(" Sync Size : %lluKB%s\n", sb->sync_size / 2, human_size(sb->sync_size * 512)); if (brief) @@ -249,6 +255,7 @@ free_info: int CreateBitmap(char *filename, int force, char uuid[16], unsigned long chunksize, unsigned long daemon_sleep, + unsigned long async_writes, unsigned long long array_size) { /* @@ -280,6 +287,7 @@ int CreateBitmap(char *filename, int for memcpy(sb.uuid, uuid, 16); sb.chunksize = chunksize; sb.daemon_sleep = daemon_sleep; + sb.async_writes = async_writes; sb.sync_size = array_size; sb_cpu_to_le(&sb); /* convert to on-disk byte ordering */ diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/bitmap.h mdadm-2.0-devel-1-async-writes/bitmap.h --- mdadm-2.0-devel-1-bitmap-bug-fix/bitmap.h Thu Mar 17 14:37:15 2005 +++ mdadm-2.0-devel-1-async-writes/bitmap.h Mon Mar 14 10:13:36 2005 @@ -6,8 +6,8 @@ #ifndef BITMAP_H #define BITMAP_H 1 -#define BITMAP_MAJOR 3 -#define BITMAP_MINOR 38 +#define BITMAP_MAJOR 4 +#define BITMAP_MINOR 0 /* * in-memory bitmap: @@ -43,6 +43,13 @@ * When we set a bit, or in the counter (to start a write), if the fields is * 0, we first set the disk bit and set the counter to 1. * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. 
+ * * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block * counters as a fallback when "page" memory cannot be allocated: * @@ -140,8 +147,9 @@ typedef struct bitmap_super_s { __u32 state; /* 48 bitmap state information */ __u32 chunksize; /* 52 the bitmap chunk size in bytes */ __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 async_writes; /* 60 number of outstanding async writes */ - __u8 pad[4096 - 60]; /* set to zero */ + __u8 pad[256 - 64]; /* set to zero */ } bitmap_super_t; /* notes: diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/md_p.h mdadm-2.0-devel-1-async-writes/md_p.h --- mdadm-2.0-devel-1-bitmap-bug-fix/md_p.h Thu Mar 17 14:36:32 2005 +++ mdadm-2.0-devel-1-async-writes/md_p.h Mon Mar 14 10:11:13 2005 @@ -79,6 +79,11 @@ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ +#define MD_DISK_WRITEONLY 9 /* disk is "write-only" is RAID1 config. + * read requests will only be sent here in + * dire need + */ + typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ __u32 major; /* 1 Device major number */ diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/mdadm.8 mdadm-2.0-devel-1-async-writes/mdadm.8 --- mdadm-2.0-devel-1-bitmap-bug-fix/mdadm.8 Thu Feb 17 19:26:05 2005 +++ mdadm-2.0-devel-1-async-writes/mdadm.8 Wed Mar 2 14:12:32 2005 @@ -204,6 +204,13 @@ exist). .BR --bitmap-chunk= Set the Chunksize of the bitmap. Each bit corresponds to that many Kilobytes of storage. Default is 4. + +.TP +.BR --async= +Specify that asynchronous write mode should be enabled (valid for RAID1 +only). If an argument is specified, it will set the maximum number +of outstanding asynchronous writes allowed. The default value is 256. +(A bitmap is required in order to use asynchronous write mode.) .TP diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/mdadm.c mdadm-2.0-devel-1-async-writes/mdadm.c --- mdadm-2.0-devel-1-bitmap-bug-fix/mdadm.c Sun Feb 13 22:01:51 2005 +++ mdadm-2.0-devel-1-async-writes/mdadm.c Wed Mar 2 15:24:54 2005 @@ -59,6 +59,7 @@ int main(int argc, char *argv[]) char devmode = 0; int runstop = 0; int readonly = 0; + int async_writes = 0; int bitmap_fd = -1; char *bitmap_file = NULL; int bitmap_chunk = UnSet; @@ -722,6 +723,19 @@ int main(int argc, char *argv[]) /* convert K to B, chunk of 0K means 512B */ bitmap_chunk = bitmap_chunk ? bitmap_chunk * 1024 : 512; continue; + + case O(BUILD, 5): + case O(CREATE, 5): /* asynchronous write mode */ + async_writes = DEFAULT_ASYNC_MAX_WRITES; + if (optarg) { + async_writes = strtol(optarg, &c, 10); + if (async_writes < 0 || *c || + async_writes > 16383) { + fprintf(stderr, Name ": Invalid value for maximum outstanding asynchronous writes: %s.\n\tMust be between 0 (i.e., fully synchronous) and 16383.\n", optarg); + exit(2); + } + } + continue; } /* We have now processed all the valid options. 
Anything else is * an error @@ -862,6 +876,12 @@ int main(int argc, char *argv[]) case BUILD: if (bitmap_chunk == UnSet) bitmap_chunk = DEFAULT_BITMAP_CHUNK; if (delay == 0) delay = DEFAULT_BITMAP_DELAY; + if (async_writes && !bitmap_file) { + fprintf(stderr, Name ": async write mode requires a bitmap.\n"); + rv = 1; + break; + } + if (bitmap_file) { bitmap_fd = open(bitmap_file, O_RDWR,0); if (bitmap_fd < 0 && errno != ENOENT) { @@ -871,16 +891,21 @@ int main(int argc, char *argv[]) } if (bitmap_fd < 0) { bitmap_fd = CreateBitmap(bitmap_file, force, NULL, - bitmap_chunk, delay, size); + bitmap_chunk, delay, async_writes, size); } } rv = Build(devlist->devname, mdfd, chunk, level, layout, raiddisks, devlist->next, assume_clean, - bitmap_file, bitmap_chunk, delay); + bitmap_file, bitmap_chunk, async_writes, delay); break; case CREATE: if (bitmap_chunk == UnSet) bitmap_chunk = DEFAULT_BITMAP_CHUNK; if (delay == 0) delay = DEFAULT_BITMAP_DELAY; + if (async_writes && !bitmap_file) { + fprintf(stderr, Name ": async write mode requires a bitmap.\n"); + rv = 1; + break; + } if (ss == NULL) { for(i=0; !ss && superlist[i]; i++) ss = superlist[i]->match_metadata_desc("default"); @@ -893,7 +918,7 @@ int main(int argc, char *argv[]) rv = Create(ss, devlist->devname, mdfd, chunk, level, layout, size<0 ? 0 : size, raiddisks, sparedisks, devs_found-1, devlist->next, runstop, verbose, force, - bitmap_file, bitmap_chunk, delay); + bitmap_file, bitmap_chunk, async_writes, delay); break; case MISC: diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/mdadm.h mdadm-2.0-devel-1-async-writes/mdadm.h --- mdadm-2.0-devel-1-bitmap-bug-fix/mdadm.h Sun Feb 13 22:00:00 2005 +++ mdadm-2.0-devel-1-async-writes/mdadm.h Wed Mar 2 14:24:19 2005 @@ -63,6 +63,7 @@ char *strncpy(char *dest, const char *sr #define DEFAULT_BITMAP_CHUNK 4096 #define DEFAULT_BITMAP_DELAY 5 +#define DEFAULT_ASYNC_MAX_WRITES 256 #include "md_u.h" #include "md_p.h" @@ -217,14 +218,14 @@ extern int Assemble(struct supertype *st extern int Build(char *mddev, int mdfd, int chunk, int level, int layout, int raiddisks, mddev_dev_t devlist, int assume_clean, - char *bitmap_file, int bitmap_chunk, int delay); + char *bitmap_file, int bitmap_chunk, int async_writes, int delay); extern int Create(struct supertype *st, char *mddev, int mdfd, int chunk, int level, int layout, unsigned long size, int raiddisks, int sparedisks, int subdevs, mddev_dev_t devlist, int runstop, int verbose, int force, - char *bitmap_file, int bitmap_chunk, int delay); + char *bitmap_file, int bitmap_chunk, int async_writes, int delay); extern int Detail(char *dev, int brief, int test); extern int Query(char *dev); @@ -239,6 +240,7 @@ extern int Kill(char *dev, int force); extern int CreateBitmap(char *filename, int force, char uuid[16], unsigned long chunksize, unsigned long daemon_sleep, + unsigned long async_writes, unsigned long long array_size); extern int ExamineBitmap(char *filename, int brief); diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/super0.c mdadm-2.0-devel-1-async-writes/super0.c --- mdadm-2.0-devel-1-bitmap-bug-fix/super0.c Mon Mar 7 13:27:38 2005 +++ mdadm-2.0-devel-1-async-writes/super0.c Mon Mar 14 10:14:05 2005 @@ -112,15 +112,19 @@ static void examine_super0(void *sbv) mdp_disk_t *dp; char *dv; char nb[5]; + int 
wonly; if (d>=0) dp = &sb->disks[d]; else dp = &sb->this_disk; sprintf(nb, "%4d", d); printf("%4s %5d %5d %5d %5d ", d < 0 ? "this" : nb, dp->number, dp->major, dp->minor, dp->raid_disk); + wonly = dp->state & (1<<MD_DISK_WRITEONLY); + dp->state &= ~(1<<MD_DISK_WRITEONLY); if (dp->state & (1<<MD_DISK_FAULTY)) printf(" faulty"); if (dp->state & (1<<MD_DISK_ACTIVE)) printf(" active"); if (dp->state & (1<<MD_DISK_SYNC)) printf(" sync"); if (dp->state & (1<<MD_DISK_REMOVED)) printf(" removed"); + if (wonly) printf(" writeonly"); if (dp->state == 0) printf(" spare"); if ((dv=map_dev(dp->major, dp->minor))) printf(" %s", dv); @@ -275,8 +279,10 @@ static int update_super0(struct mdinfo * } if (strcmp(update, "assemble")==0) { int d = info->disk.number; + int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEONLY); + sb->disks[d].state &= ~(1<<MD_DISK_WRITEONLY); if (sb->disks[d].state != info->disk.state) { - sb->disks[d].state = info->disk.state; + sb->disks[d].state = info->disk.state & wonly; rv = 1; } } diff -purN --exclude-from /export/public/clemep/tmp/dontdiff --exclude rpm --exclude mdadm.steeleye.spec --exclude *.KERNEL --exclude *.DIST mdadm-2.0-devel-1-bitmap-bug-fix/super1.c mdadm-2.0-devel-1-async-writes/super1.c --- mdadm-2.0-devel-1-bitmap-bug-fix/super1.c Mon Mar 7 11:34:16 2005 +++ mdadm-2.0-devel-1-async-writes/super1.c Thu Mar 10 11:55:54 2005 @@ -65,7 +66,9 @@ struct mdp_superblock_1 { __u32 dev_number; /* permanent identifier of this device - not role in raid */ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ - __u8 pad2[64-56]; /* set to 0 when writing */ + __u8 devflags; /* per-device flags. Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ + __u8 pad2[64-57]; /* set to 0 when writing */ /* array state information - 64 bytes */ __u64 utime; /* 40 bits second, 24 btes microseconds */ ^ permalink raw reply [flat|nested] 71+ messages in thread
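With all three patches applied, the feature would be switched on from mdadm at create time. A hypothetical invocation might look like the following; the device names and bitmap path are placeholders, --async requires an external bitmap, and 256 simply matches DEFAULT_ASYNC_MAX_WRITES from the patch.

mdadm --create /dev/md0 --level=1 --raid-devices=2 \
      --bitmap=/var/md0-bitmap --async=256 \
      /dev/sda1 /dev/sdb1

Examining the bitmap afterwards (mdadm -X /var/md0-bitmap) should then report the write mode as "Asynchronous (256)", per the ExamineBitmap() change above.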
* Re: [PATCH 0/3] md bitmap-based asynchronous writes
2005-03-17 20:51 ` [PATCH 0/3] md bitmap-based asynchronous writes Paul Clements
2005-03-17 20:53 ` [PATCH 1/3] md bitmap async write enabling Paul Clements
@ 2005-03-21 4:21 ` Neil Brown
2005-03-21 16:31 ` Paul Clements
1 sibling, 1 reply; 71+ messages in thread
From: Neil Brown @ 2005-03-21 4:21 UTC (permalink / raw)
To: Paul Clements; +Cc: linux-raid
On Thursday March 17, paul.clements@steeleye.com wrote:
> These three patches provide the ability to perform asynchronous writes
> with raid1. The asynchronous write capability is primarily useful when
> raid1 is employed in network replication (i.e., with one or more of the
> disks located on a remote system). It allows us to acknowledge writes
> before they reach the remote system, thus making much more efficient use
> of the network.
Thanks for these.
However I would like to leave them until I'm completely happy with the
bitmap resync, including
 - storing bitmap near superblock (I have this working)
 - hot-add bitmap (I've started on this)
 - support for raid5/6 and hopefully raid10. (still to come)
and I would really like
 only-kick-drives-on-write-errors-not-read-errors
to be the next priority. After that, I will definitely look at async
writes.
NeilBrown
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 0/3] md bitmap-based asynchronous writes
2005-03-21 4:21 ` [PATCH 0/3] md bitmap-based asynchronous writes Neil Brown
@ 2005-03-21 16:31 ` Paul Clements
2005-03-21 22:09 ` Neil Brown
0 siblings, 1 reply; 71+ messages in thread
From: Paul Clements @ 2005-03-21 16:31 UTC (permalink / raw)
To: Neil Brown; +Cc: linux-raid
Neil Brown wrote:
> On Thursday March 17, paul.clements@steeleye.com wrote:
>
>>These three patches provide the ability to perform asynchronous writes
>>with raid1. The asynchronous write capability is primarily useful when
>>raid1 is employed in network replication (i.e., with one or more of the
>>disks located on a remote system). It allows us to acknowledge writes
>>before they reach the remote system, thus making much more efficient use
>>of the network.
> However I would like to leave them until I'm completely happy with the
> bitmap resync, including
> - storing bitmap near superblock (I have this working)
Great. I assume you're using the 64k after the superblock for this
(well, with v1 supers, I guess the bitmap could be just about anywhere?).
So does this mean there would be multiple copies of the bitmap, or are
you simply choosing one disk to which the bitmap is written?
> - hot-add bitmap (I've started on this)
I assume this means the ability to add a bitmap to an active array that
has no bitmap? Might it also include the ability to modify the bitmap
(from userland) while the array is active, as this functionality is
desirable to have (perhaps you followed the thread where this was
discussed?).
> - support for raid5/6 and hopefully raid10. (still to come)
That would be nice too.
> and I would really like
> only-kick-drives-on-write-errors-not-read-errors
Perhaps I can help out with this. I've seen the recent
discussions/patches for this, but so far it doesn't look like we have a
completely up-to-date/working/tested patch against the latest kernel,
do we? So maybe I could do this (unless someone else is already)?
> to be the next priority. After that, I will definitely look at async
> writes.
Great. Thanks.
--
Paul
^ permalink raw reply [flat|nested] 71+ messages in thread
* Re: [PATCH 0/3] md bitmap-based asynchronous writes
2005-03-21 16:31 ` Paul Clements
@ 2005-03-21 22:09 ` Neil Brown
2005-03-22 8:35 ` Peter T. Breuer
0 siblings, 1 reply; 71+ messages in thread
From: Neil Brown @ 2005-03-21 22:09 UTC (permalink / raw)
To: Paul Clements; +Cc: linux-raid
On Monday March 21, paul.clements@steeleye.com wrote:
>
> > However I would like to leave them until I'm completely happy with the
> > bitmap resync, including
> > - storing bitmap near superblock (I have this working)
>
> Great. I assume you're using the 64k after the superblock for this
Yes. There is guaranteed to be at least 60K after the 4K superblock.
The default is to choose a bitmap_chunk size such that the bitmap uses
between 30K and 60K. My test machine with 35Gig drives uses a 128KB
chunk size.
> (well, with v1 supers, I guess the bitmap could be just about
> anywhere?).
Yes, which makes hot-adding a bit awkward... I think the default
--create should leave a bit of space just in case.
> So does this mean there would be multiple copies of the
> bitmap, or are you simply choosing one disk to which the bitmap is written?
Multiple copies, on all active drives (not on spares).
>
> > - hot-add bitmap (I've started on this)
>
> I assume this means the ability to add a bitmap to an active array that
> has no bitmap? Might it also include the ability to modify the bitmap
> (from userland) while the array is active, as this functionality is
> desirable to have (perhaps you followed the thread where this was
> discussed?).
It would definitely include the ability to remove a bitmap and then add
a new one, which would nearly achieve the same thing (just a small
window of no bitmap). I guess for file-backed bitmaps, an atomic switch
should be possible and desirable. I'll see what I can do.
>
> > - support for raid5/6 and hopefully raid10. (still to come)
>
> That would be nice too.
>
> > and I would really like
> > only-kick-drives-on-write-errors-not-read-errors
>
> Perhaps I can help out with this. I've seen the recent
> discussions/patches for this, but so far it doesn't look like we have a
> completely up-to-date/working/tested patch against the latest kernel, do
> we? So maybe I could do this (unless someone else is already)?
I cannot remember what I thought of those patches. I would definitely
go back and review them before starting on this functionality for raid1.
However I want to do raid5 first. I think it would be much easier
because of the stripe cache. Any 'stripe' with a bad read would be
flagged, kept in the cache and just get processed a different way to
other stripes. For raid1, you need some extra data structure to keep
track of which blocks have seen bad reads, so I would rather leave that
until I have familiarity with other issues through raid5.
NeilBrown
^ permalink raw reply [flat|nested] 71+ messages in thread
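To make that sizing concrete (back-of-the-envelope, assuming roughly 60K of bitmap space and one bit per chunk): a 35 Gig device is about 36,700,160 KB, and 60 KB of bitmap holds 491,520 bits, so each bit has to cover at least ~75 KB; rounding up to the next power of two gives the 128 KB chunk quoted above, and the resulting bitmap then occupies about 35 KB, inside the 30K-60K window. A toy calculation of that rule follows; it is illustrative only and is not the in-kernel sizing code.

#include <stdio.h>

/* Toy illustration: pick the smallest power-of-two chunk size (in KB)
 * such that one bit per chunk lets the whole device fit into
 * 'space_bytes' of bitmap.  Not the actual kernel sizing logic. */
static unsigned long pick_chunk_kb(unsigned long long device_kb,
                                   unsigned long space_bytes)
{
        unsigned long long bits = (unsigned long long)space_bytes * 8;
        unsigned long chunk_kb = 1;

        while ((device_kb + chunk_kb - 1) / chunk_kb > bits)
                chunk_kb *= 2;
        return chunk_kb;
}

int main(void)
{
        /* ~35 Gig device, 60 KB of bitmap space after the superblock */
        printf("%lu KB\n", pick_chunk_kb(35ULL * 1024 * 1024, 60 * 1024));
        return 0;
}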
* Re: [PATCH 0/3] md bitmap-based asynchronous writes
2005-03-21 22:09 ` Neil Brown
@ 2005-03-22 8:35 ` Peter T. Breuer
0 siblings, 0 replies; 71+ messages in thread
From: Peter T. Breuer @ 2005-03-22 8:35 UTC (permalink / raw)
To: linux-raid
Neil Brown <neilb@cse.unsw.edu.au> wrote:
> However I want to do raid5 first. I think it would be much easier
> because of the stripe cache. Any 'stripe' with a bad read would be
There's the FR5 patch (fr5.sf.net) which adds a bitmap to raid5. It
doesn't do "robust read" for raid5, however.
> flagged, kept in the cache and just get processed a different way to
> other stripes. For raid1, you need some extra data structure to keep
> track of which blocks have seen bad reads, so I would rather leave
One can simply count the untried disks using the "remaining" field,
which is otherwise unused on the read side. The count will be accurate
unless one changes the number of disks _during_ the read retry, and then
the worst that can happen is that one misses out on trying to retry the
read from the new disk, which surely will not yet be up to date anyway.
Or one can always try to read "one more for luck".
> that until I have familiarity with other issues through raid5.
I would love to know of _any_ starting point you see to doing it in
raid5. It was not clear to me how to.
Peter
^ permalink raw reply [flat|nested] 71+ messages in thread
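The retry scheme being proposed here, namely reusing a per-request count of mirrors not yet tried and re-issuing the read until one succeeds or the count runs out, can be illustrated outside the kernel like this. The sketch is purely conceptual: the mirror numbering and the simulated read_from() are invented for the example, and this is not an actual raid1 patch.

#include <stdio.h>

/* Rough user-space sketch of the "robust read" idea: 'remaining'
 * counts the mirrors not yet tried for one read; each failed attempt
 * moves on to another mirror until one succeeds or none are left. */
#define NR_MIRRORS 3

static int read_from(int disk, int good_disk)
{
        return disk == good_disk ? 0 : -1;  /* pretend one disk reads OK */
}

static int robust_read(int first_disk, int good_disk)
{
        int remaining = NR_MIRRORS;         /* disks not yet tried */
        int disk = first_disk;

        while (remaining-- > 0) {
                if (read_from(disk, good_disk) == 0)
                        return disk;        /* success: data came from here */
                disk = (disk + 1) % NR_MIRRORS;  /* try the next mirror */
        }
        return -1;                          /* every mirror failed */
}

int main(void)
{
        printf("read satisfied by mirror %d\n", robust_read(0, 2));
        return 0;
}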