From: Namhyung Kim <namhyung@gmail.com>
To: NeilBrown <neilb@suse.de>
Cc: linux-raid@vger.kernel.org
Subject: Re: [md PATCH 04/36] md: load/store badblock list from v1.x metadata
Date: Sat, 23 Jul 2011 01:34:47 +0900 [thread overview]
Message-ID: <87vcuu46l4.fsf@gmail.com> (raw)
In-Reply-To: <20110721025847.8422.27295.stgit@notabene.brown> (NeilBrown's message of "Thu, 21 Jul 2011 12:58:47 +1000")
NeilBrown <neilb@suse.de> writes:
> Space must have been allocated when array was created.
> A feature flag is set when the badblock list is non-empty, to
> ensure old kernels don't load and trust the whole device.
>
> We only update the on-disk badblocklist when it has changed.
> If the badblocklist (or other metadata) is stored on a bad block, we
> don't cope very well.
>
> If metadata has no room for bad block, flag bad-blocks as disabled,
> and do the same for 0.90 metadata.
>
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>
> drivers/md/md.c | 111 +++++++++++++++++++++++++++++++++++++++++++--
> drivers/md/md.h | 5 ++
> include/linux/raid/md_p.h | 14 ++++--
> 3 files changed, 119 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 9324635..18c3aab 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -757,6 +757,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)
> rdev->sb_start = 0;
> rdev->sectors = 0;
> }
> + if (rdev->bb_page) {
> + put_page(rdev->bb_page);
> + rdev->bb_page = NULL;
> + }
> }
>
>
> @@ -1395,6 +1399,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
> return cpu_to_le32(csum);
> }
>
> +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
> + int acknowledged);
> static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
> {
> struct mdp_superblock_1 *sb;
> @@ -1473,6 +1479,47 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
> else
> rdev->desc_nr = le32_to_cpu(sb->dev_number);
>
> + if (!rdev->bb_page) {
> + rdev->bb_page = alloc_page(GFP_KERNEL);
> + if (!rdev->bb_page)
> + return -ENOMEM;
> + }
This will allocate a ->bb_page even for arrays that don't support a
bad block log. Checking ->bblog_offset here first might be helpful.
> + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
> + rdev->badblocks.count == 0) {
> + /* need to load the bad block list.
> + * Currently we limit it to one page.
> + */
> + s32 offset;
> + sector_t bb_sector;
> + u64 *bbp;
> + int i;
> + int sectors = le16_to_cpu(sb->bblog_size);
> + if (sectors > (PAGE_SIZE / 512))
> + return -EINVAL;
> + offset = le32_to_cpu(sb->bblog_offset);
> + if (offset == 0)
> + return -EINVAL;
> + bb_sector = (long long)offset;
> + if (!sync_page_io(rdev, bb_sector, sectors << 9,
> + rdev->bb_page, READ, true))
> + return -EIO;
> + bbp = (u64 *)page_address(rdev->bb_page);
Unnecessary cast.
> + rdev->badblocks.shift = sb->bblog_shift;
> + for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
> + u64 bb = le64_to_cpu(*bbp);
> + int count = bb & (0x3ff);
> + u64 sector = bb >> 10;
> + sector <<= sb->bblog_shift;
> + count <<= sb->bblog_shift;
> + if (bb + 1 == 0)
> + break;
This code probably needs a comment.
> + if (md_set_badblocks(&rdev->badblocks,
> + sector, count, 1) == 0)
> + return -EINVAL;
> + }
> + } else if (sb->bblog_offset == 0)
> + rdev->badblocks.shift = -1;
->badblocks.page can be freed as well.
> +
> if (!refdev) {
> ret = 1;
> } else {
> @@ -1624,7 +1671,6 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
> sb->pad0 = 0;
> sb->recovery_offset = cpu_to_le64(0);
> memset(sb->pad1, 0, sizeof(sb->pad1));
> - memset(sb->pad2, 0, sizeof(sb->pad2));
> memset(sb->pad3, 0, sizeof(sb->pad3));
>
> sb->utime = cpu_to_le64((__u64)mddev->utime);
> @@ -1664,6 +1710,43 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
> sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
> }
>
> + if (rdev->badblocks.count == 0)
> + /* Nothing to do for bad blocks*/ ;
> + else if (sb->bblog_offset == 0)
> + /* Cannot record bad blocks on this device */
> + md_error(mddev, rdev);
> + else {
> + int havelock = 0;
> + struct badblocks *bb = &rdev->badblocks;
> + u64 *bbp = (u64 *)page_address(rdev->bb_page);
Unnecessary cast too.
> + u64 *p;
> + sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
> + if (bb->changed) {
> + memset(bbp, 0xff, PAGE_SIZE);
> +
> + rcu_read_lock();
> + p = rcu_dereference(bb->active_page);
> + if (!p) {
> + spin_lock_irq(&bb->lock);
> + p = bb->page;
> + havelock = 1;
> + }
> + for (i = 0 ; i < bb->count ; i++) {
> + u64 internal_bb = *p++;
> + u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
> + | BB_LEN(internal_bb));
> + *bbp++ = cpu_to_le64(store_bb);
> + }
> + bb->sector = (rdev->sb_start +
> + (int)le32_to_cpu(sb->bblog_offset));
> + bb->size = le16_to_cpu(sb->bblog_size);
> + bb->changed = 0;
> + if (havelock)
> + spin_unlock_irq(&bb->lock);
> + rcu_read_unlock();
> + }
> + }
> +
> max_dev = 0;
> list_for_each_entry(rdev2, &mddev->disks, same_set)
> if (rdev2->desc_nr+1 > max_dev)
> @@ -2197,6 +2280,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
> mdk_rdev_t *rdev;
> int sync_req;
> int nospares = 0;
> + int any_badblocks_changed = 0;
>
> repeat:
> /* First make sure individual recovery_offsets are correct */
> @@ -2268,6 +2352,11 @@ repeat:
> MD_BUG();
> mddev->events --;
> }
> +
> + list_for_each_entry(rdev, &mddev->disks, same_set)
> + if (rdev->badblocks.changed)
> + any_badblocks_changed++;
> +
> sync_sbs(mddev, nospares);
> spin_unlock_irq(&mddev->write_lock);
>
> @@ -2293,6 +2382,13 @@ repeat:
> bdevname(rdev->bdev,b),
> (unsigned long long)rdev->sb_start);
> rdev->sb_events = mddev->events;
> + if (rdev->badblocks.size) {
> + md_super_write(mddev, rdev,
> + rdev->badblocks.sector,
> + rdev->badblocks.size << 9,
> + rdev->bb_page);
> + rdev->badblocks.size = 0;
> + }
>
> } else
> dprintk(")\n");
> @@ -2316,6 +2412,9 @@ repeat:
> if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
> sysfs_notify(&mddev->kobj, NULL, "sync_completed");
>
> + if (any_badblocks_changed)
> + list_for_each_entry(rdev, &mddev->disks, same_set)
> + md_ack_all_badblocks(&rdev->badblocks);
> }
>
> /* words written to sysfs files may, or may not, be \n terminated.
> @@ -2823,6 +2922,8 @@ int md_rdev_init(mdk_rdev_t *rdev)
> rdev->sb_events = 0;
> rdev->last_read_error.tv_sec = 0;
> rdev->last_read_error.tv_nsec = 0;
> + rdev->sb_loaded = 0;
> + rdev->bb_page = NULL;
> atomic_set(&rdev->nr_pending, 0);
> atomic_set(&rdev->read_errors, 0);
> atomic_set(&rdev->corrected_errors, 0);
> @@ -2912,11 +3013,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
> return rdev;
>
> abort_free:
> - if (rdev->sb_page) {
> - if (rdev->bdev)
> - unlock_rdev(rdev);
> - free_disk_sb(rdev);
> - }
> + if (rdev->bdev)
> + unlock_rdev(rdev);
> + free_disk_sb(rdev);
> kfree(rdev);
> return ERR_PTR(err);
> }
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index d327734..834e46b 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -55,7 +55,7 @@ struct mdk_rdev_s
> struct block_device *meta_bdev;
> struct block_device *bdev; /* block device handle */
>
> - struct page *sb_page;
> + struct page *sb_page, *bb_page;
> int sb_loaded;
> __u64 sb_events;
> sector_t data_offset; /* start of data in array */
> @@ -128,6 +128,9 @@ struct mdk_rdev_s
> u64 *active_page; /* either 'page' or 'NULL' */
> int changed;
> spinlock_t lock;
> +
> + sector_t sector;
> + sector_t size; /* in sectors */
Looks like 'int' is sufficient for 'size'; md_super_write() treats it
as an int anyway.
> } badblocks;
> };
>
> diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
> index 75cbf4f..9e65d9e 100644
> --- a/include/linux/raid/md_p.h
> +++ b/include/linux/raid/md_p.h
> @@ -245,10 +245,16 @@ struct mdp_superblock_1 {
> __u8 device_uuid[16]; /* user-space setable, ignored by kernel */
> __u8 devflags; /* per-device flags. Only one defined...*/
> #define WriteMostly1 1 /* mask for writemostly flag in above */
> - __u8 pad2[64-57]; /* set to 0 when writing */
> + /* Bad block log. If there are any bad blocks the feature flag is set.
> + * If offset and size are non-zero, that space is reserved and available
> + */
> + __u8 bblog_shift; /* shift from sectors to block size */
> + __le16 bblog_size; /* number of sectors reserved for list */
> + __le32 bblog_offset; /* sector offset from superblock to bblog,
> + * signed - not unsigned */
>
> /* array state information - 64 bytes */
> - __le64 utime; /* 40 bits second, 24 btes microseconds */
> + __le64 utime; /* 40 bits second, 24 bits microseconds */
> __le64 events; /* incremented when superblock updated */
> __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */
> __le32 sb_csum; /* checksum up to devs[max_dev] */
> @@ -270,8 +276,8 @@ struct mdp_superblock_1 {
> * must be honoured
> */
> #define MD_FEATURE_RESHAPE_ACTIVE 4
> +#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
>
> -#define MD_FEATURE_ALL (1|2|4)
> +#define MD_FEATURE_ALL (1|2|4|8)
>
> #endif
> -
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2011-07-22 16:34 UTC|newest]
Thread overview: 65+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-07-21 2:58 [md PATCH 00/36] md patches for 3.1 - part 2: bad block logs NeilBrown
2011-07-21 2:58 ` [md PATCH 04/36] md: load/store badblock list from v1.x metadata NeilBrown
2011-07-22 16:34 ` Namhyung Kim [this message]
2011-07-21 2:58 ` [md PATCH 03/36] md: don't allow arrays to contain devices with bad blocks NeilBrown
2011-07-22 15:47 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 02/36] md/bad-block-log: add sysfs interface for accessing bad-block-log NeilBrown
2011-07-22 15:43 ` Namhyung Kim
2011-07-26 2:29 ` NeilBrown
2011-07-26 5:17 ` Namhyung Kim
2011-07-26 8:48 ` Namhyung Kim
2011-07-26 15:03 ` [PATCH v2] md: add documentation for bad block log Namhyung Kim
2011-07-27 1:05 ` [md PATCH 02/36] md/bad-block-log: add sysfs interface for accessing bad-block-log NeilBrown
2011-07-21 2:58 ` [md PATCH 01/36] md: beginnings of bad block management NeilBrown
2011-07-22 15:03 ` Namhyung Kim
2011-07-26 2:26 ` NeilBrown
2011-07-26 5:17 ` Namhyung Kim
2011-07-22 16:52 ` Namhyung Kim
2011-07-26 3:20 ` NeilBrown
2011-07-21 2:58 ` [md PATCH 06/36] md/raid1: avoid reading from known bad blocks NeilBrown
2011-07-26 14:06 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 11/36] md/raid1: clear bad-block record when write succeeds NeilBrown
2011-07-27 5:05 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 12/36] md/raid1: store behind-write pages in bi_vecs NeilBrown
2011-07-27 15:16 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 09/36] md: make it easier to wait for bad blocks to be acknowledged NeilBrown
2011-07-26 16:04 ` Namhyung Kim
2011-07-27 1:18 ` NeilBrown
2011-07-21 2:58 ` [md PATCH 10/36] md/raid1: avoid writing to known-bad blocks on known-bad drives NeilBrown
2011-07-27 4:09 ` Namhyung Kim
2011-07-27 4:19 ` NeilBrown
2011-07-21 2:58 ` [md PATCH 05/36] md: Disable bad blocks and v0.90 metadata NeilBrown
2011-07-22 17:02 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 14/36] md/raid1: record badblocks found during resync etc NeilBrown
2011-07-27 15:39 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 07/36] md/raid1: avoid reading known bad blocks during resync NeilBrown
2011-07-26 14:25 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 13/36] md/raid1: Handle write errors by updating badblock log NeilBrown
2011-07-27 15:28 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 08/36] md: add 'write_error' flag to component devices NeilBrown
2011-07-26 15:22 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 17/36] md/raid5: avoid reading from known bad blocks NeilBrown
2011-07-21 2:58 ` [md PATCH 15/36] md/raid1: improve handling of read failure during recovery NeilBrown
2011-07-27 15:45 ` Namhyung Kim
2011-07-21 2:58 ` [md PATCH 21/36] md/raid5: Clear bad blocks on successful write NeilBrown
2011-07-21 2:58 ` [md PATCH 20/36] md/raid5. Don't write to known bad block on doubtful devices NeilBrown
2011-07-21 2:58 ` [md PATCH 22/36] md/raid10: simplify/reindent some loops NeilBrown
2011-07-21 2:58 ` [md PATCH 18/36] md/raid5: use bad-block log to improve handling of uncorrectable read errors NeilBrown
2011-07-21 2:58 ` [md PATCH 23/36] md/raid10: Split handle_read_error out from raid10d NeilBrown
2011-07-21 2:58 ` [md PATCH 19/36] md/raid5: write errors should be recorded as bad blocks if possible NeilBrown
2011-07-21 2:58 ` [md PATCH 16/36] md/raid1: factor several functions out or raid1d() NeilBrown
2011-07-27 15:55 ` Namhyung Kim
2011-07-28 1:39 ` NeilBrown
2011-07-21 2:58 ` [md PATCH 24/36] md/raid10: avoid reading from known bad blocks - part 1 NeilBrown
2011-07-21 2:58 ` [md PATCH 30/36] md/raid10: clear bad-block record when write succeeds NeilBrown
2011-07-21 2:58 ` [md PATCH 34/36] md/raid10: simplify read error handling during recovery NeilBrown
2011-07-21 2:58 ` [md PATCH 25/36] md/raid10: avoid reading from known bad blocks - part 2 NeilBrown
2011-07-21 2:58 ` [md PATCH 28/36] md/raid10 record bad blocks as needed during recovery NeilBrown
2011-07-21 2:58 ` [md PATCH 31/36] md/raid10: Handle write errors by updating badblock log NeilBrown
2011-07-21 2:58 ` [md PATCH 26/36] md/raid10 - avoid reading from known bad blocks - part 3 NeilBrown
2011-07-21 2:58 ` [md PATCH 32/36] md/raid10: attempt to fix read errors during resync/check NeilBrown
2011-07-21 2:58 ` [md PATCH 27/36] md/raid10: avoid reading known bad blocks during resync/recovery NeilBrown
2011-07-21 2:58 ` [md PATCH 29/36] md/raid10: avoid writing to known bad blocks on known bad drives NeilBrown
2011-07-21 2:58 ` [md PATCH 33/36] md/raid10: record bad blocks due to write errors during resync/recovery NeilBrown
2011-07-21 2:58 ` [md PATCH 35/36] md/raid10: Handle read errors during recovery better NeilBrown
2011-07-21 2:58 ` [md PATCH 36/36] md/raid10: handle further errors during fix_read_error better NeilBrown
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87vcuu46l4.fsf@gmail.com \
--to=namhyung@gmail.com \
--cc=linux-raid@vger.kernel.org \
--cc=neilb@suse.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).