From: NeilBrown <neilb@suse.de>
To: Jonathan Brassow <jbrassow@redhat.com>
Cc: linux-raid@vger.kernel.org
Subject: Re: [PATCH 7 of 9] MD: new sb type
Date: Wed, 25 May 2011 14:16:45 +1000 [thread overview]
Message-ID: <20110525141645.24e8fe29@notabene.brown> (raw)
In-Reply-To: <201105240307.p4O374rN029659@f14.redhat.com>
On Mon, 23 May 2011 22:07:04 -0500 Jonathan Brassow <jbrassow@f14.redhat.com>
wrote:
> Patch name: md-new-sb-type.patch
>
> A new MD superblock that is device-mapper specific.
>
> The new superblock is not read or written from userspace and is not exported.
> It contains information to track resync, recovery, and reshaping progress. It
> also maintains information on the health of the devices in the array.
>
> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
>
> Index: linux-2.6/drivers/md/md.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/md.c
> +++ linux-2.6/drivers/md/md.c
> @@ -1731,6 +1731,305 @@ super_1_rdev_size_change(mdk_rdev_t *rde
> return num_sectors;
> }
>
> +/*
> + * This structure is never used by userspace. It is only ever
> + * used in these particular super block accessing functions.
> + * Therefore, we don't put it in any .h file.
> + *
> + * It makes sense to define a new magic number here. This way,
> + * no userspace application will confuse the device as a device
> + * that is accessible through MD operations. Devices with this
> + * superblock should only ever be accessed via device-mapper.
> + */
> +#define MD_DM_SB_MAGIC 0x426E6F4A
> +struct mdp_superblock_2 {
> + __le32 magic;
> + __le32 flags; /* Used to indicate possible future changes */
> +
> + __le64 events;
> +
> + /*
> + * The following offset variables are used to indicate:
> + * reshape_offset: If the RAID level or layout of an array is
> + * being updated, this offset keeps track of the
> + * progress.
> + * disk_recovery_offset: If drives are being repaired/replaced on
> + * an individual basis, this offset tracks
> + * that progress. This might happen when a
> + * drive fails and is replaced.
> + * array_resync_offset: When the array is constructed for the first
> + * time, all the devices must be made coherent.
> + * This offset tracks that progress.
> + */
> + __le64 reshape_offset;
> + __le64 disk_recovery_offset;
> + __le64 array_resync_offset;
> +
> + /*
> + * The following variable pairs reflect things
> + * that can be changed during an array reshape.
> + */
> + __le32 level;
> + __le32 new_level;
> +
> + __le32 layout;
> + __le32 new_layout;
> +
> + __le32 stripe_sectors;
> + __le32 new_stripe_sectors;
> +
> + __le32 num_devices; /* Number of devs in RAID, Max = 64 */
> + __le32 new_num_devices;
Presumably the dm table knows all this info as well and it is just here for
error checking - yes?
> +
> + __le64 failed_devices; /* bitmap of devs, used to indicate a failure */
> + __u8 pad[432]; /* Round out the struct to 512 bytes */
> +};
> +
> +static void super_2_sync(mddev_t *mddev, mdk_rdev_t *rdev)
> +{
> + mdk_rdev_t *r, *t;
> + uint64_t failed_devices;
> + struct mdp_superblock_2 *sb;
> +
> + sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
> + failed_devices = le32_to_cpu(sb->failed_devices);
failed_devices is 64 bit, so you want le64_to_cpu
> +
> + rdev_for_each(r, t, mddev)
> + if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
> + failed_devices |= (1 << r->raid_disk);
And this should be (1ULL << r->raid_disk) so that the shift doesn't overflow a 32-bit int.
> +
> + memset(sb, 0, sizeof(*sb));
> +
> + sb->magic = cpu_to_le32(MD_DM_SB_MAGIC);
> + sb->flags = cpu_to_le32(0); /* No flags yet */
> +
> + sb->events = cpu_to_le64(mddev->events);
> +
> + sb->reshape_offset = cpu_to_le64(mddev->reshape_position);
> + sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
> + sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
> +
> + sb->level = cpu_to_le32(mddev->level);
> + sb->layout = cpu_to_le32(mddev->layout);
> + sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
> + sb->num_devices = cpu_to_le32(mddev->raid_disks);
> +
> + if (mddev->reshape_position != MaxSector) {
> + sb->new_level = cpu_to_le32(mddev->new_level);
> + sb->new_layout = cpu_to_le32(mddev->new_layout);
> + sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
> + sb->new_num_devices = cpu_to_le32(mddev->delta_disks);
> + } else {
> + sb->new_level = 0;
> + sb->new_layout = 0;
> + sb->new_stripe_sectors = 0;
> + sb->new_num_devices = 0;
> + }
As these values are meaningless when reshape_position is MaxSector, and as
the structure has already been zeroed, setting them to zero again looks wrong.
> +
> + sb->failed_devices = cpu_to_le32(failed_devices);
Again, cpu_to_le64
I haven't thought through the FirstUse and MD_SYNC_STATE_FORCED flags yet. When I
have, I might have more to say - or I might not.
Thanks,
NeilBrown
> +}
> +
> +/*
> + * super_2_load
> + *
> + * This function creates a superblock if one is not found on the device
> + * and will indicate the more appropriate device whose superblock should
> + * be used, if given two.
> + *
> + * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
> + */
> +static int super_2_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
> +{
> + int r;
> + uint64_t ev1, ev2;
> + struct mdp_superblock_2 *sb;
> + struct mdp_superblock_2 *refsb;
> +
> + if (sizeof(*sb) & (sizeof(*sb) - 1)) {
> + printk(KERN_ERR "Programmer error: Bad sized superblock (%lu)\n",
> + sizeof(*sb));
> + return -EIO;
> + }
> +
> + rdev->sb_start = 0;
> + rdev->sb_size = sizeof(*sb);
> + r = read_disk_sb(rdev, rdev->sb_size);
> + if (r)
> + return r;
> +
> + sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
> + if (sb->magic != cpu_to_le32(MD_DM_SB_MAGIC)) {
> + super_2_sync(rdev->mddev, rdev);
> +
> + set_bit(FirstUse, &rdev->flags);
> +
> + /* Force new superblocks to disk */
> + set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
> +
> + /* Any superblock is better than none, choose that if given */
> + return refdev ? 0 : 1;
> + }
> +
> + if (!refdev)
> + return 1;
> +
> + ev1 = le64_to_cpu(sb->events);
> + refsb = (struct mdp_superblock_2 *)page_address(refdev->sb_page);
> + ev2 = le64_to_cpu(refsb->events);
> +
> + return (ev1 > ev2) ? 1 : 0;
> +}
> +
> +static int super_2_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
> +{
> + uint64_t ev1;
> + uint32_t failed_devices;
> + struct mdp_superblock_2 *sb;
> + uint32_t new_devs = 0;
> + uint32_t rebuilds = 0;
> + mdk_rdev_t *r, *t;
> + struct mdp_superblock_2 *sb2;
> +
> + sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
> + ev1 = le64_to_cpu(sb->events);
> + failed_devices = le32_to_cpu(sb->failed_devices);
> +
> + mddev->events = ev1 ? ev1 : 1;
> +
> + /* Reshaping is not currently allowed */
> + if ((le32_to_cpu(sb->level) != mddev->level) ||
> + (le32_to_cpu(sb->layout) != mddev->layout) ||
> + (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) ||
> + (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
> + printk(KERN_ERR
> + "md: %s: Reshaping arrays not yet supported.\n",
> + mdname(mddev));
> + return -EINVAL;
> + }
> +
> + if (!test_and_clear_bit(MD_SYNC_STATE_FORCED, &mddev->flags))
> + mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
> +
> + /*
> + * During load, we set FirstUse if a new superblock was written.
> + * There are two reasons we might not have a superblock:
> + * 1) The array is brand new - in which case, all of the
> + * devices must have their In_sync bit set. Also,
> + * recovery_cp must be 0, unless forced.
> + * 2) This is a new device being added to an old array
> + * and the new device needs to be rebuilt - in which
> + * case the In_sync bit will /not/ be set and
> + * recovery_cp must be MaxSector.
> + */
> + rdev_for_each(r, t, mddev) {
> + if (!test_bit(In_sync, &r->flags)) {
> + if (!test_bit(FirstUse, &r->flags))
> + printk(KERN_ERR "md: %s: Superblock area of "
> + "rebuild device %d should have been "
> + "cleared.\n", mdname(mddev),
> + r->raid_disk);
> + set_bit(FirstUse, &r->flags);
> + rebuilds++;
> + } else if (test_bit(FirstUse, &r->flags))
> + new_devs++;
> + }
> +
> + if (!rebuilds) {
> + if (new_devs == mddev->raid_disks) {
> + printk(KERN_INFO "md: %s: Superblocks created for new array\n", mdname(mddev));
> + } else if (new_devs) {
> + printk(KERN_ERR "md: %s: New device injected "
> + "into existing array without 'rebuild' "
> + "parameter specified\n", mdname(mddev));
> + return -EINVAL;
> + }
> + } else if (new_devs) {
> + printk(KERN_ERR "md: %s: 'rebuild' devices cannot be "
> + "injected into an array with other "
> + "first-time devices\n", mdname(mddev));
> + return -EINVAL;
> + } else if (mddev->recovery_cp != MaxSector) {
> + printk(KERN_ERR "md: %s: 'rebuild' specified while "
> + "array is not in-sync\n",
> + mdname(mddev));
> + return -EINVAL;
> + }
> +
> + /*
> + * Now we set the Faulty bit for those devices that are
> + * recorded in the superblock as failed.
> + */
> + rdev_for_each(r, t, mddev) {
> + if (!r->sb_page)
> + continue;
> + sb2 = (struct mdp_superblock_2 *)
> + page_address(r->sb_page);
> + sb2->failed_devices = 0;
> +
> + if ((r->raid_disk >= 0) &&
> + (failed_devices & (1 << r->raid_disk))) {
> + if (test_bit(FirstUse, &r->flags)) {
> + char b[BDEVNAME_SIZE];
> + printk(KERN_INFO
> + "md: %s: Starting complete rebuild of "
> + "previously failed device, %s\n",
> + mdname(mddev), bdevname(rdev->bdev, b));
> + } else {
> + set_bit(Faulty, &r->flags);
> + }
> + }
> + }
> +
> + return 0;
> +}
> +
> +static int super_2_validate(mddev_t *mddev, mdk_rdev_t *rdev)
> +{
> + struct mdp_superblock_2 *sb;
> +
> + sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
> +
> + /*
> + * mddev->events is set during the first call to super_2_validate,
> + * so we use that knowledge to kick off some global sanity checks
> + * on the first call.
> + */
> + if (!mddev->events && super_2_init_validation(mddev, rdev))
> + return -EINVAL;
> +
> + rdev->mddev->bitmap_info.offset = 0; /* disable bitmap creation */
> + rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
> + if (!test_bit(FirstUse, &rdev->flags)) {
> + rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
> + if (rdev->recovery_offset != MaxSector)
> + clear_bit(In_sync, &rdev->flags);
> + }
> +
> + if (test_bit(Faulty, &rdev->flags)) {
> + clear_bit(Faulty, &rdev->flags);
> + clear_bit(In_sync, &rdev->flags);
> + rdev->recovery_offset = 0;
> + printk(KERN_INFO "md: %s: Dev #%d previously marked as failed\n",
> + mdname(mddev), rdev->raid_disk);
> + }
> +
> + clear_bit(FirstUse, &rdev->flags);
> + return 0;
> +}
> +
> +static unsigned long long
> +super_2_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
> +{
> + /*
> + * Arrays built through device-mapper must use device-mapper
> + * tables to change the size. A call to this function is
> + * invalid for this array.
> + */
> + printk(KERN_ERR "md: %s: Invalid device size change request.\n",
> + mdname(rdev->mddev));
> + return 0;
> +}
> +
> static struct super_type super_types[] = {
> [0] = {
> .name = "0.90.0",
> @@ -1748,6 +2047,14 @@ static struct super_type super_types[] =
> .sync_super = super_1_sync,
> .rdev_size_change = super_1_rdev_size_change,
> },
> + [2] = {
> + .name = "dm",
> + .owner = THIS_MODULE,
> + .load_super = super_2_load,
> + .validate_super = super_2_validate,
> + .sync_super = super_2_sync,
> + .rdev_size_change = super_2_rdev_size_change,
> + },
> };
>
> static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
> Index: linux-2.6/drivers/md/md.h
> ===================================================================
> --- linux-2.6.orig/drivers/md/md.h
> +++ linux-2.6/drivers/md/md.h
> @@ -77,6 +77,8 @@ struct mdk_rdev_s
> #define Blocked 8 /* An error occurred on an externally
> * managed array, don't allow writes
> * until it is cleared */
> +#define FirstUse 9 /* Used by device-mapper interface when
> + * initializing first-time devices. */
> wait_queue_head_t blocked_wait;
>
> int desc_nr; /* descriptor index in the superblock */
> @@ -124,6 +126,7 @@ struct mddev_s
> #define MD_CHANGE_DEVS 0 /* Some device status has changed */
> #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
> #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */
> +#define MD_SYNC_STATE_FORCED 3 /* recovery_cp is set and must be honored */
>
> int suspended;
> atomic_t active_io;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2011-05-25 4:16 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-05-24 3:07 [PATCH 7 of 9] MD: new sb type Jonathan Brassow
2011-05-25 4:16 ` NeilBrown [this message]
2011-05-25 14:40 ` Jonathan Brassow
2011-05-26 0:34 ` NeilBrown
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20110525141645.24e8fe29@notabene.brown \
--to=neilb@suse.de \
--cc=jbrassow@redhat.com \
--cc=linux-raid@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).