From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Price Date: Thu, 06 Jun 2013 13:19:49 +0100 Subject: [Cluster-devel] [PATCH 2/4] mkfs.gfs2: Align resource groups to RAID stripes In-Reply-To: <1370520368.2744.7.camel@menhir> References: <1370520213-29676-1-git-send-email-anprice@redhat.com> <1370520213-29676-2-git-send-email-anprice@redhat.com> <1370520368.2744.7.camel@menhir> Message-ID: <51B07E65.8070701@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit On 06/06/13 13:06, Steven Whitehouse wrote: > Hi, > > On Thu, 2013-06-06 at 13:03 +0100, Andrew Price wrote: >> This patch uses the values provided by libblkid to align resource groups >> to RAID stripes. The strategy we're using here is to give the start of >> each rgrp an alignment to the stripe width and add an offset of one >> stripe unit for the next rgrp and so on. This should ensure that the >> rgrp headers are spread evenly over the array to minimise contention on >> the bitmap blocks. >> >> One challenge here was to avoid creating large gaps between rgrps and at >> the end of the device due to the alignment padding. We get around this >> by calculating the start and length of the next rgrp before fixing the >> length of the current rgrp and extending it (or shrinking the final one) >> as appropriate. >> >> In order for this to work some relationships between block and stripe >> sizes have been enforced: the stripe width must be a multiple of the >> stripe unit and the stripe unit must be a multiple of the block size. >> >> With this patch, specifying an rg size on the command line still gives >> aligned rgrps but gaps will still be present. >> >> Signed-off-by: Andrew Price >> --- >> gfs2/mkfs/main_mkfs.c | 136 ++++++++++++++++++++++++++++++++++---------------- >> 1 file changed, 92 insertions(+), 44 deletions(-) >> >> diff --git a/gfs2/mkfs/main_mkfs.c b/gfs2/mkfs/main_mkfs.c >> index 12a259f..058e4fa 100644 >> --- a/gfs2/mkfs/main_mkfs.c >> +++ b/gfs2/mkfs/main_mkfs.c >> @@ -580,26 +580,62 @@ static int writerg(int fd, const struct rgrp_tree *rgt, const unsigned bsize) >> return 0; >> } >> >> -static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts) >> +static uint64_t align_block(const uint64_t base, const uint64_t align, const uint32_t bsize) >> +{ >> + if ((align > 0) && ((base % align) > 0)) >> + return (base - (base % align)) + align; >> + return base; >> +} >> + > This doesn't appear to use bsize. Is align always going to be a power of > two? Hrm good catch, I was sure I had removed the bsize parameter when I switched it from a previous version which converted to bytes and back again. Anyway, align is always going to be a multiple of bsize due to the constraints on swidth, but there's no requirement for it to be a power of two, I don't think. Andy > > Steve. > >> +static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts, const struct mkfs_dev *dev) >> { >> struct rgrp_tree *rgt = NULL; >> uint64_t rgaddr = 0; >> - unsigned int i = 0; >> + uint64_t nextaddr = 0; >> + uint64_t rglen = (sdp->rgsize << 20) / sdp->bsize; >> + const uint64_t maxrgsz = (GFS2_MAX_RGSIZE << 20) / sdp->bsize; >> + const uint64_t minrgsz = (GFS2_MIN_RGSIZE << 20) / sdp->bsize; >> + unsigned sunit_blocks = opts->sunit / sdp->bsize; >> + unsigned swidth_blocks = opts->swidth / opts->bsize; >> + unsigned stripe_offset = 0; >> int err = 0; >> >> - sdp->device.length -= sdp->sb_addr + 1; >> - sdp->new_rgrps = how_many_rgrps(sdp, &sdp->device, opts->got_rgsize); >> - rgaddr = sdp->sb_addr + 1; >> + sdp->new_rgrps = 0; >> + rgaddr = align_block(sdp->sb_addr + 1, swidth_blocks, sdp->bsize); >> >> - for (i = 0; i < sdp->new_rgrps; i++) { >> - /* TODO: align to RAID stripes, etc. */ >> + while (rgaddr > 0) { >> rgt = rgrp_insert(&sdp->rgtree, rgaddr); >> if (rgt == NULL) >> return -1; >> - if (i == 0) >> - rgt->length = sdp->device.length - ((sdp->new_rgrps - 1) * (sdp->device.length / sdp->new_rgrps)); >> + >> + stripe_offset += sunit_blocks; >> + if (stripe_offset >= swidth_blocks) >> + stripe_offset = 0; >> + >> + /* The next rg might not fit into the remaining space so calculate it now >> + in order to make decisions about the current rg */ >> + nextaddr = align_block(rgaddr + rglen, swidth_blocks, sdp->bsize) + stripe_offset; >> + if (!opts->got_rgsize && (nextaddr - rgaddr) <= maxrgsz) >> + /* Use up gap left by alignment if possible */ >> + rgt->length = nextaddr - rgaddr; >> else >> - rgt->length = sdp->device.length / sdp->new_rgrps; >> + rgt->length = rglen; >> + >> + /* If the next rg would overflow the device, either shrink it or expand >> + the current rg to use the remaining space */ >> + if (nextaddr + rglen > sdp->device.length) { >> + /* Squeeze the last 1 or 2 rgs into the remaining space */ >> + if ((nextaddr < sdp->device.length) && (sdp->device.length - nextaddr >= minrgsz)) { >> + rglen = sdp->device.length - nextaddr; >> + } else { >> + if (sdp->device.length - rgaddr <= maxrgsz) >> + rgt->length = sdp->device.length - rgaddr; >> + else >> + rgt->length = maxrgsz; >> + /* This is the last rg */ >> + nextaddr = 0; >> + } >> + } >> >> /* Build the rindex entry */ >> rgt->ri.ri_length = rgblocks2bitblocks(sdp->bsize, rgt->length, &rgt->ri.ri_data); >> @@ -614,6 +650,11 @@ static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts) >> rgt->rg.rg_header.mh_format = GFS2_FORMAT_RG; >> rgt->rg.rg_free = rgt->ri.ri_data; >> >> + if (opts->debug) { >> + gfs2_rindex_print(&rgt->ri); >> + printf(" stripe_offset: %u\n", stripe_offset); >> + } >> + >> /* TODO: This call allocates buffer heads and bitmap pointers >> * in rgt. We really shouldn't need to do that. */ >> err = gfs2_compute_bitstructs(sdp->bsize, rgt); >> @@ -628,8 +669,9 @@ static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts) >> perror(_("Failed to write resource group")); >> return -1; >> } >> + sdp->new_rgrps++; >> sdp->blks_total += rgt->ri.ri_data; >> - rgaddr += rgt->length; >> + rgaddr = nextaddr; >> } >> >> sdp->rgrps = sdp->new_rgrps; >> @@ -637,7 +679,7 @@ static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts) >> return 0; >> } >> >> -static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, struct mkfs_dev *dev) >> +static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, struct mkfs_dev *dev, unsigned bsize) >> { >> memset(sdp, 0, sizeof(struct gfs2_sbd)); >> sdp->time = time(NULL); >> @@ -647,7 +689,7 @@ static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, struct mkfs_d >> sdp->jsize = opts->jsize; >> sdp->md.journals = opts->journals; >> sdp->device_fd = dev->fd; >> - sdp->bsize = choose_blocksize(opts, dev); >> + sdp->bsize = bsize; >> >> if (compute_constants(sdp)) { >> perror(_("Failed to compute file system constants")); >> @@ -666,19 +708,6 @@ static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, struct mkfs_d >> } >> strcpy(sdp->lockproto, opts->lockproto); >> strcpy(sdp->locktable, opts->locktable); >> - if (opts->debug) { >> - printf(_("Calculated file system options:\n")); >> - printf(" bsize = %u\n", sdp->bsize); >> - printf(" qcsize = %u\n", sdp->qcsize); >> - printf(" jsize = %u\n", sdp->jsize); >> - printf(" journals = %u\n", sdp->md.journals); >> - printf(" proto = %s\n", sdp->lockproto); >> - printf(" rgsize = %u\n", sdp->rgsize); >> - printf(" table = %s\n", sdp->locktable); >> - printf(" fssize = %"PRIu64"\n", opts->fssize); >> - printf(" sunit = %lu\n", opts->sunit); >> - printf(" swidth = %lu\n", opts->swidth); >> - } >> } >> >> static int probe_contents(struct mkfs_dev *dev) >> @@ -764,6 +793,24 @@ static void open_dev(const char *path, struct mkfs_dev *dev) >> exit(1); >> } >> >> +static void opts_set_stripe(struct mkfs_opts *opts, const struct mkfs_dev *dev, unsigned bsize) >> +{ >> + if (!opts->got_swidth && dev->optimal_io_size > dev->physical_sector_size) { >> + opts->swidth = dev->optimal_io_size; >> + opts->got_swidth = 1; >> + } >> + >> + if (!opts->got_sunit && dev->minimum_io_size > dev->physical_sector_size) { >> + opts->sunit = dev->minimum_io_size; >> + opts->got_sunit = 1; >> + } >> + >> + if (opts->got_sunit && (opts->sunit % bsize) != 0) { >> + fprintf(stderr, "Stripe unit (%lu) is not a multiple of the block size (%u)\n", opts->sunit, bsize); >> + exit(1); >> + } >> +} >> + >> void main_mkfs(int argc, char *argv[]) >> { >> struct gfs2_sbd sbd; >> @@ -771,28 +818,15 @@ void main_mkfs(int argc, char *argv[]) >> struct mkfs_dev dev; >> int error; >> unsigned char uuid[16]; >> + unsigned bsize; >> >> opts_init(&opts); >> opts_get(argc, argv, &opts); >> opts_check(&opts); >> >> open_dev(opts.device, &dev); >> - if (!opts.got_swidth) { >> - if (dev.optimal_io_size > 0) >> - opts.swidth = dev.optimal_io_size; >> - else >> - opts.swidth = dev.logical_sector_size; >> - } >> - >> - if (!opts.got_sunit) { >> - if (dev.minimum_io_size > 0) >> - opts.sunit = dev.minimum_io_size; >> - else >> - opts.sunit = dev.logical_sector_size; >> - } >> - >> - if (opts.debug) >> - printf("Resource group alignment: %"PRIu64" bytes\n", opts.swidth); >> + bsize = choose_blocksize(&opts, &dev); >> + opts_set_stripe(&opts, &dev, bsize); >> >> if (S_ISREG(dev.stat.st_mode)) { >> opts.got_bsize = 1; /* Use default block size for regular files */ >> @@ -800,7 +834,21 @@ void main_mkfs(int argc, char *argv[]) >> >> warn_of_destruction(opts.device); >> >> - sbd_init(&sbd, &opts, &dev); >> + sbd_init(&sbd, &opts, &dev, bsize); >> + if (opts.debug) { >> + printf(_("Calculated file system options:\n")); >> + printf(" bsize = %u\n", sbd.bsize); >> + printf(" qcsize = %u\n", sbd.qcsize); >> + printf(" jsize = %u\n", sbd.jsize); >> + printf(" journals = %u\n", sbd.md.journals); >> + printf(" proto = %s\n", sbd.lockproto); >> + printf(" rgsize = %u\n", sbd.rgsize); >> + printf(" table = %s\n", sbd.locktable); >> + printf(" fssize = %"PRIu64"\n", opts.fssize); >> + printf(" sunit = %lu\n", opts.sunit); >> + printf(" swidth = %lu\n", opts.swidth); >> + printf(" rgrp align = %lu+%lu blocks\n", opts.swidth/sbd.bsize, opts.sunit/sbd.bsize); >> + } >> >> if (opts.confirm && !opts.override) >> are_you_sure(); >> @@ -808,7 +856,7 @@ void main_mkfs(int argc, char *argv[]) >> if (!S_ISREG(dev.stat.st_mode) && opts.discard) >> discard_blocks(dev.fd, sbd.bsize * sbd.device.length, opts.debug); >> >> - error = place_rgrps(&sbd, &opts); >> + error = place_rgrps(&sbd, &opts, &dev); >> if (error) { >> fprintf(stderr, _("Failed to build resource groups\n")); >> exit(1); > >