cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
From: Andrew Price <anprice@redhat.com>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] [PATCH 2/4] mkfs.gfs2: Align resource groups to RAID stripes
Date: Thu,  6 Jun 2013 13:03:31 +0100	[thread overview]
Message-ID: <1370520213-29676-2-git-send-email-anprice@redhat.com> (raw)
In-Reply-To: <1370520213-29676-1-git-send-email-anprice@redhat.com>

This patch uses the values provided by libblkid to align resource groups
to RAID stripes. The strategy we're using here is to give the start of
each rgrp an alignment to the stripe width and add an offset of one
stripe unit for the next rgrp and so on. This should ensure that the
rgrp headers are spread evenly over the array to minimise contention on
the bitmap blocks.

One challenge here was to avoid creating large gaps between rgrps and at
the end of the device due to the alignment padding. We get around this
by calculating the start and length of the next rgrp before fixing the
length of the current rgrp and extending it (or shrinking the final one)
as appropriate.

In order for this to work some relationships between block and stripe
sizes have been enforced: the stripe width must be a multiple of the
stripe unit and the stripe unit must be a multiple of the block size.

With this patch, specifying an rg size on the command line still gives
aligned rgrps, but the rgrps are not extended to absorb the alignment
padding, so gaps between them will still be present.

Signed-off-by: Andrew Price <anprice@redhat.com>
---
 gfs2/mkfs/main_mkfs.c | 136 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 92 insertions(+), 44 deletions(-)

diff --git a/gfs2/mkfs/main_mkfs.c b/gfs2/mkfs/main_mkfs.c
index 12a259f..058e4fa 100644
--- a/gfs2/mkfs/main_mkfs.c
+++ b/gfs2/mkfs/main_mkfs.c
@@ -580,26 +580,62 @@ static int writerg(int fd, const struct rgrp_tree *rgt, const unsigned bsize)
 	return 0;
 }
 
-static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts)
+static uint64_t align_block(const uint64_t base, const uint64_t align, const uint32_t bsize)
+{
+	if ((align > 0) && ((base % align) > 0))
+		return (base - (base % align)) + align;
+	return base;
+}
+
+static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts, const struct mkfs_dev *dev)
 {
 	struct rgrp_tree *rgt = NULL;
 	uint64_t rgaddr = 0;
-	unsigned int i = 0;
+	uint64_t nextaddr = 0;
+	uint64_t rglen = (sdp->rgsize << 20) / sdp->bsize;
+	const uint64_t maxrgsz = (GFS2_MAX_RGSIZE << 20) / sdp->bsize;
+	const uint64_t minrgsz = (GFS2_MIN_RGSIZE << 20) / sdp->bsize;
+	unsigned sunit_blocks = opts->sunit / sdp->bsize;
+	unsigned swidth_blocks = opts->swidth / sdp->bsize;
+	unsigned stripe_offset = 0;
 	int err = 0;
 
-	sdp->device.length -= sdp->sb_addr + 1;
-	sdp->new_rgrps = how_many_rgrps(sdp, &sdp->device, opts->got_rgsize);
-	rgaddr = sdp->sb_addr + 1;
+	sdp->new_rgrps = 0;
+	rgaddr = align_block(sdp->sb_addr + 1, swidth_blocks, sdp->bsize);
 
-	for (i = 0; i < sdp->new_rgrps; i++) {
-		/* TODO: align to RAID stripes, etc. */
+	while (rgaddr > 0) {
 		rgt = rgrp_insert(&sdp->rgtree, rgaddr);
 		if (rgt == NULL)
 			return -1;
-		if (i == 0)
-			rgt->length = sdp->device.length - ((sdp->new_rgrps - 1) * (sdp->device.length / sdp->new_rgrps));
+
+		stripe_offset += sunit_blocks;
+		if (stripe_offset >= swidth_blocks)
+			stripe_offset = 0;
+
+		/* The next rg might not fit into the remaining space so calculate it now
+		   in order to make decisions about the current rg */
+		nextaddr = align_block(rgaddr + rglen, swidth_blocks, sdp->bsize) + stripe_offset;
+		if (!opts->got_rgsize && (nextaddr - rgaddr) <= maxrgsz)
+			/* Use up gap left by alignment if possible */
+			rgt->length = nextaddr - rgaddr;
 		else
-			rgt->length = sdp->device.length / sdp->new_rgrps;
+			rgt->length = rglen;
+
+		/* If the next rg would overflow the device, either shrink it or expand
+		   the current rg to use the remaining space */
+		if (nextaddr + rglen > sdp->device.length) {
+			/* Squeeze the last 1 or 2 rgs into the remaining space */
+			if ((nextaddr < sdp->device.length) && (sdp->device.length - nextaddr >= minrgsz)) {
+				rglen = sdp->device.length - nextaddr;
+			} else {
+				if (sdp->device.length - rgaddr <= maxrgsz)
+					rgt->length = sdp->device.length - rgaddr;
+				else
+					rgt->length = maxrgsz;
+				/* This is the last rg */
+				nextaddr = 0;
+			}
+		}
 
 		/* Build the rindex entry */
 		rgt->ri.ri_length = rgblocks2bitblocks(sdp->bsize, rgt->length, &rgt->ri.ri_data);
@@ -614,6 +650,11 @@ static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts)
 		rgt->rg.rg_header.mh_format = GFS2_FORMAT_RG;
 		rgt->rg.rg_free = rgt->ri.ri_data;
 
+		if (opts->debug) {
+			gfs2_rindex_print(&rgt->ri);
+			printf(" stripe_offset: %u\n", stripe_offset);
+		}
+
 		/* TODO: This call allocates buffer heads and bitmap pointers
 		 * in rgt. We really shouldn't need to do that. */
 		err = gfs2_compute_bitstructs(sdp->bsize, rgt);
@@ -628,8 +669,9 @@ static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts)
 			perror(_("Failed to write resource group"));
 			return -1;
 		}
+		sdp->new_rgrps++;
 		sdp->blks_total += rgt->ri.ri_data;
-		rgaddr += rgt->length;
+		rgaddr = nextaddr;
 	}
 
 	sdp->rgrps = sdp->new_rgrps;
@@ -637,7 +679,7 @@ static int place_rgrps(struct gfs2_sbd *sdp, const struct mkfs_opts *opts)
 	return 0;
 }
 
-static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, struct mkfs_dev *dev)
+static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, struct mkfs_dev *dev, unsigned bsize)
 {
 	memset(sdp, 0, sizeof(struct gfs2_sbd));
 	sdp->time = time(NULL);
@@ -647,7 +689,7 @@ static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, struct mkfs_d
 	sdp->jsize = opts->jsize;
 	sdp->md.journals = opts->journals;
 	sdp->device_fd = dev->fd;
-	sdp->bsize = choose_blocksize(opts, dev);
+	sdp->bsize = bsize;
 
 	if (compute_constants(sdp)) {
 		perror(_("Failed to compute file system constants"));
@@ -666,19 +708,6 @@ static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, struct mkfs_d
 	}
 	strcpy(sdp->lockproto, opts->lockproto);
 	strcpy(sdp->locktable, opts->locktable);
-	if (opts->debug) {
-		printf(_("Calculated file system options:\n"));
-		printf("  bsize = %u\n", sdp->bsize);
-		printf("  qcsize = %u\n", sdp->qcsize);
-		printf("  jsize = %u\n", sdp->jsize);
-		printf("  journals = %u\n", sdp->md.journals);
-		printf("  proto = %s\n", sdp->lockproto);
-		printf("  rgsize = %u\n", sdp->rgsize);
-		printf("  table = %s\n", sdp->locktable);
-		printf("  fssize = %"PRIu64"\n", opts->fssize);
-		printf("  sunit = %lu\n", opts->sunit);
-		printf("  swidth = %lu\n", opts->swidth);
-	}
 }
 
 static int probe_contents(struct mkfs_dev *dev)
@@ -764,6 +793,24 @@ static void open_dev(const char *path, struct mkfs_dev *dev)
 		exit(1);
 }
 
+static void opts_set_stripe(struct mkfs_opts *opts, const struct mkfs_dev *dev, unsigned bsize)
+{
+	if (!opts->got_swidth && dev->optimal_io_size > dev->physical_sector_size) {
+		opts->swidth = dev->optimal_io_size;
+		opts->got_swidth = 1;
+	}
+
+	if (!opts->got_sunit && dev->minimum_io_size > dev->physical_sector_size) {
+		opts->sunit = dev->minimum_io_size;
+		opts->got_sunit = 1;
+	}
+
+	if (opts->got_sunit && (opts->sunit % bsize) != 0) {
+		fprintf(stderr, "Stripe unit (%lu) is not a multiple of the block size (%u)\n", opts->sunit, bsize);
+		exit(1);
+	}
+}
+
 void main_mkfs(int argc, char *argv[])
 {
 	struct gfs2_sbd sbd;
@@ -771,28 +818,15 @@ void main_mkfs(int argc, char *argv[])
 	struct mkfs_dev dev;
 	int error;
 	unsigned char uuid[16];
+	unsigned bsize;
 
 	opts_init(&opts);
 	opts_get(argc, argv, &opts);
 	opts_check(&opts);
 
 	open_dev(opts.device, &dev);
-	if (!opts.got_swidth) {
-		if (dev.optimal_io_size > 0)
-			opts.swidth = dev.optimal_io_size;
-		else
-			opts.swidth = dev.logical_sector_size;
-	}
-
-	if (!opts.got_sunit) {
-		if (dev.minimum_io_size > 0)
-			opts.sunit = dev.minimum_io_size;
-		else
-			opts.sunit = dev.logical_sector_size;
-	}
-
-	if (opts.debug)
-		printf("Resource group alignment: %"PRIu64" bytes\n", opts.swidth);
+	bsize = choose_blocksize(&opts, &dev);
+	opts_set_stripe(&opts, &dev, bsize);
 
 	if (S_ISREG(dev.stat.st_mode)) {
 		opts.got_bsize = 1; /* Use default block size for regular files */
@@ -800,7 +834,21 @@ void main_mkfs(int argc, char *argv[])
 
 	warn_of_destruction(opts.device);
 
-	sbd_init(&sbd, &opts, &dev);
+	sbd_init(&sbd, &opts, &dev, bsize);
+	if (opts.debug) {
+		printf(_("Calculated file system options:\n"));
+		printf("  bsize = %u\n", sbd.bsize);
+		printf("  qcsize = %u\n", sbd.qcsize);
+		printf("  jsize = %u\n", sbd.jsize);
+		printf("  journals = %u\n", sbd.md.journals);
+		printf("  proto = %s\n", sbd.lockproto);
+		printf("  rgsize = %u\n", sbd.rgsize);
+		printf("  table = %s\n", sbd.locktable);
+		printf("  fssize = %"PRIu64"\n", opts.fssize);
+		printf("  sunit = %lu\n", opts.sunit);
+		printf("  swidth = %lu\n", opts.swidth);
+		printf("  rgrp align = %lu+%lu blocks\n", opts.swidth/sbd.bsize, opts.sunit/sbd.bsize);
+	}
 
 	if (opts.confirm && !opts.override)
 		are_you_sure();
@@ -808,7 +856,7 @@ void main_mkfs(int argc, char *argv[])
 	if (!S_ISREG(dev.stat.st_mode) && opts.discard)
 		discard_blocks(dev.fd, sbd.bsize * sbd.device.length, opts.debug);
 
-	error = place_rgrps(&sbd, &opts);
+	error = place_rgrps(&sbd, &opts, &dev);
 	if (error) {
 		fprintf(stderr, _("Failed to build resource groups\n"));
 		exit(1);
-- 
1.8.1.4



  reply	other threads:[~2013-06-06 12:03 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-06-06 12:03 [Cluster-devel] [PATCH 1/4] mkfs.gfs2: Set sunit and swidth from probed io limits Andrew Price
2013-06-06 12:03 ` Andrew Price [this message]
2013-06-06 12:06   ` [Cluster-devel] [PATCH 2/4] mkfs.gfs2: Align resource groups to RAID stripes Steven Whitehouse
2013-06-06 12:19     ` Andrew Price
2013-06-06 12:57   ` Bob Peterson
2013-06-06 13:04     ` Andrew Price
2013-06-06 13:17       ` Bob Peterson
2013-06-06 13:11     ` Steven Whitehouse
2013-06-06 13:30       ` Bob Peterson
2013-06-06 15:17       ` Andrew Price
2013-06-06 12:03 ` [Cluster-devel] [PATCH 3/4] mkfs.gfs2: Create new resource groups on-demand Andrew Price
2013-06-06 13:07   ` Bob Peterson
2013-06-06 13:50     ` Andrew Price
2013-06-06 12:03 ` [Cluster-devel] [PATCH 4/4] mkfs.gfs2: Add align option and update docs Andrew Price
2013-06-06 12:15   ` Steven Whitehouse
2013-06-06 12:45     ` Andrew Price
2013-06-06 12:53       ` Steven Whitehouse
2013-06-06 13:13   ` Bob Peterson
2013-06-06 13:53     ` Andrew Price

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1370520213-29676-2-git-send-email-anprice@redhat.com \
    --to=anprice@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).