[PATCH] Implement shrink of empty AGs

public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed

From: Iustin Pop <iusty@k1024.org>
To: xfs@oss.sgi.com
Subject: [PATCH] Implement shrink of empty AGs
Date: Sun, 10 Jun 2007 18:40:14 +0200	[thread overview]
Message-ID: <20070610164014.GA10936@teal.hq.k1024.org> (raw)


[-- Attachment #1.1: Type: text/plain, Size: 1412 bytes --]

The attached patch implements shrinking of completely empty allocation
groups. The patch is against current CVS and modifies two files:
  - xfs_trans.c to remove two asserts which prevent lowering the
    number of AGs or filesystem blocks;
  - xfs_fsops.c where it does:
    - modify xfs_growfs_data() to branch to either
      xfs_growfs_data_private or xfs_shrinkfs_data_private depending on
      the new size of the fs
    - abstract the last part of xfs_growfs_data_private (the update of
      the secondary superblocks) into a separate function, xfs_update_sb(),
      which is called both from shrink and grow
    - add the new xfs_shrinkfs_data_private function, mostly based on
      the growfs function

There are many printk()s left in the patch; I left them in because they
show where I compute some important values. There are also many FIXMEs in
the comments showing which parts I didn't understand or was not sure about
(not that these are the only ones...). Probably for a real patch,
xfs-specific debug hooks need to be added and the printk()s removed.

The patch works on UML and QEMU virtual machines, both in UP and SMP. I
just tested many shrink/grow operations and verified with xfs_repair
that the fs is not corrupted. The free space counters seem to be correct
after shrink.

Note that you also need to remove the check in xfs_growfs.c that
disallows shrinking the filesystem.

regards,
iustin

[-- Attachment #1.2: patch-nice-4 --]
[-- Type: text/plain, Size: 11582 bytes --]

diff -X ignore -urN linux-2.6-xfs.cvs-orig/fs/xfs/xfs_fsops.c linux-2.6-xfs.shrink/fs/xfs/xfs_fsops.c
--- linux-2.6-xfs.cvs-orig/fs/xfs/xfs_fsops.c	2007-06-09 18:56:21.509308225 +0200
+++ linux-2.6-xfs.shrink/fs/xfs/xfs_fsops.c	2007-06-10 18:32:36.074856477 +0200
@@ -112,6 +112,53 @@
 	return 0;
 }
 
+static void xfs_update_sb(	/* refresh mount limits, then rewrite the superblocks of AGs 1..nagcount-1 */
+	xfs_mount_t		*mp,		/* mount point for filesystem */
+	xfs_agnumber_t		nagimax,	/* new m_maxagi; 0 leaves it unchanged */
+	xfs_agnumber_t		nagcount)       /* new number of a.g. */
+{
+	xfs_agnumber_t		agno;
+	xfs_buf_t		*bp;
+	xfs_sb_t		*sbp;
+	int			error;		/* only logged, never returned (void function) */
+
+	/* AG count change (grow or shrink) is done, so update mount struct */
+	if (nagimax)
+		mp->m_maxagi = nagimax;
+	if (mp->m_sb.sb_imax_pct) {	/* m_maxicount = imax_pct% of dblocks, shifted by inopblog */
+		__uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+		do_div(icount, 100);
+		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+	} else
+		mp->m_maxicount = 0;
+	for (agno = 1; agno < nagcount; agno++) {	/* starts at 1: AG 0 is not rewritten here */
+		error = xfs_read_buf(mp, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  XFS_FSS_TO_BB(mp, 1), 0, &bp);
+		if (error) {
+			xfs_fs_cmn_err(CE_WARN, mp,
+			"error %d reading secondary superblock for ag %d",
+				error, agno);
+			break;	/* remaining AGs are left untouched */
+		}
+		sbp = XFS_BUF_TO_SBP(bp);
+		xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS);	/* presumably in-core sb -> buffer; confirm dir */
+		/*
+		 * If we get an error writing out the alternate superblocks,
+		 * just issue a warning and stop without failing the resize.
+		 * The real work is already done and committed.
+		 */
+		if (!(error = xfs_bwrite(mp, bp))) {
+			continue;
+		} else {
+			xfs_fs_cmn_err(CE_WARN, mp,
+		"write error %d updating secondary superblock for ag %d",
+				error, agno);
+			break; /* no point in continuing */
+		}
+	}
+}
+
 static int
 xfs_growfs_data_private(
 	xfs_mount_t		*mp,		/* mount point for filesystem */
@@ -135,7 +182,6 @@
 	xfs_rfsblock_t		nfree;
 	xfs_agnumber_t		oagcount;
 	int			pct;
-	xfs_sb_t		*sbp;
 	xfs_trans_t		*tp;
 
 	nb = in->newblocks;
@@ -356,44 +402,228 @@
 	if (error) {
 		return error;
 	}
-	/* New allocation groups fully initialized, so update mount struct */
-	if (nagimax)
-		mp->m_maxagi = nagimax;
-	if (mp->m_sb.sb_imax_pct) {
-		__uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
-		do_div(icount, 100);
-		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
-	} else
-		mp->m_maxicount = 0;
-	for (agno = 1; agno < nagcount; agno++) {
-		error = xfs_read_buf(mp, mp->m_ddev_targp,
-				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0, &bp);
+	xfs_update_sb(mp, nagimax, nagcount);
+	return 0;
+
+ error0:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	return error;
+}
+
+static int
+xfs_shrinkfs_data_private(	/* shrink the data section by dropping trailing AGs that are completely empty */
+			  xfs_mount_t		*mp,		/* mount point for filesystem */
+			  xfs_growfs_data_t	*in)		/* growfs data input struct */
+{
+	xfs_agf_t		*agf;
+	xfs_agnumber_t		agno;
+	xfs_buf_t		*bp;
+	int			dpct;		/* imaxpct delta: requested - current */
+	int			error;
+	xfs_agnumber_t		nagcount;       /* new AG count */
+	xfs_agnumber_t		oagcount;       /* old AG count */
+	xfs_agnumber_t		nagimax = 0;
+	xfs_rfsblock_t		nb, nb_mod;	/* requested size; remainder of nb / sb_agblocks */
+	xfs_rfsblock_t		dbdelta;        /* will be used as a
+						   check that we
+						   shrink the fs by
+						   the correct number
+						   of blocks */
+	xfs_rfsblock_t		fdbdelta;       /* will keep track of
+						   how many ag blocks
+						   we need to
+						   remove */
+	int			pct;
+	xfs_trans_t		*tp;
+
+	nb = in->newblocks;
+	pct = in->imaxpct;
+	if (nb >= mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+		return XFS_ERROR(EINVAL);
+	dpct = pct - mp->m_sb.sb_imax_pct;
+	error = xfs_read_buf(mp, mp->m_ddev_targp,
+			     XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+			     XFS_FSS_TO_BB(mp, 1), 0, &bp);
+	if (error)
+		return error;
+	ASSERT(bp);
+	/* FIXME: we release the buffer here manually because we are
+	 * outside of a transaction? The other buffers read using the
+	 * functions which take a tp parameter are not released in
+	 * growfs
+	 */
+	xfs_buf_relse(bp);
+
+	/* Do basic checks (at the fs level) */
+	oagcount = mp->m_sb.sb_agcount;
+	nagcount = nb;	/* NOTE(review): may truncate if xfs_agnumber_t is narrower than nb; do_div() dividend width - confirm */
+	nb_mod = do_div(nagcount, mp->m_sb.sb_agblocks);
+	if(nb_mod) {
+		printk("not shrinking on an AG boundary (diff=%d)\n", nb_mod);
+		return XFS_ERROR(ENOSPC);
+	}
+	if(nagcount < 2) {
+		printk("refusing to shrink below 2 AGs\n");
+		return XFS_ERROR(ENOSPC);
+	}
+	if(nagcount >= oagcount) {
+		printk("number of AGs will not decrease\n");
+		return XFS_ERROR(EINVAL);
+	}
+	printk("Cur ag=%d, cur blocks=%llu\n",
+	       mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks);
+	printk("New ag=%d, new blocks=%d\n", nagcount, nb);	/* NOTE(review): nb printed with %d, sb_dblocks with %llu - confirm types */
+
+	printk("Will resize from %llu to %d, delta is %llu\n",
+               mp->m_sb.sb_dblocks, nb, mp->m_sb.sb_dblocks - nb);
+	/* Check to see if we trip over the log section */
+	printk("logstart=%llu logblocks=%u\n",
+               mp->m_sb.sb_logstart, mp->m_sb.sb_logblocks);
+	if (nb < mp->m_sb.sb_logstart + mp->m_sb.sb_logblocks)
+		return XFS_ERROR(EINVAL);
+	/* dbdelta starts at the diff and must become zero */
+	dbdelta = mp->m_sb.sb_dblocks - nb;
+	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
+	printk("reserving %d\n", XFS_GROWFS_SPACE_RES(mp) + dbdelta);
+	if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp) + dbdelta,
+				       XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	fdbdelta = 0;
+
+	/* Per-AG checks */
+	/* FIXME: do we need to hold m_peraglock while doing this? */
+	/* I think that since we do read and write to the m_perag
+	 * stuff, we should be holding the lock for the entire walk &
+	 * modify of the fs
+	 */
+	/* Note that because we hold the lock, on any error+early
+	 *  return, we must either release manually and return, or
+	 *  jump to error0
+	 */
+	down_write(&mp->m_peraglock);
+	for(agno = oagcount - 1; agno >= nagcount; agno--) {	/* walk the AGs being removed, last one first */
+		xfs_extlen_t	usedblks; /* total used blocks in this a.g. */
+		xfs_extlen_t	freeblks; /* free blocks in this a.g. */
+		xfs_agblock_t	aglen;    /* this ag's len */
+		struct xfs_perag *pag;  /* the m_perag structure */
+
+		printk("doing agno=%d\n", agno);
+
+		pag = &mp->m_perag[agno];
+
+		error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
 		if (error) {
-			xfs_fs_cmn_err(CE_WARN, mp,
-			"error %d reading secondary superblock for ag %d",
-				error, agno);
-			break;
+			goto error0;
 		}
-		sbp = XFS_BUF_TO_SBP(bp);
-		xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS);
+		ASSERT(bp);
+		agf = XFS_BUF_TO_AGF(bp);
+		aglen = INT_GET(agf->agf_length, ARCH_CONVERT);
+
+		/* read the pagf/pagi if not already initialized */
+		/* agf should be initialized because of the above read_agf */
+		ASSERT(pag->pagf_init);
+		if (!pag->pagi_init) {
+			if ((error = xfs_ialloc_read_agi(mp, tp, agno, &bp)))
+				goto error0;
+			ASSERT(pag->pagi_init);
+		}
+
 		/*
-		 * If we get an error writing out the alternate superblocks,
-		 * just issue a warning and continue.  The real work is
-		 * already done and committed.
+		 * Check the inodes: as long as we have pagi_count ==
+		 * pagi_freecount == 0, then: a) we don't have to
+		 * update any global inode counters, and b) there are
+		 * no extra blocks in inode btrees
 		 */
-		if (!(error = xfs_bwrite(mp, bp))) {
-			continue;
-		} else {
-			xfs_fs_cmn_err(CE_WARN, mp,
-		"write error %d updating secondary superblock for ag %d",
-				error, agno);
-			break; /* no point in continuing */
+		if(pag->pagi_count > 0 ||
+		   pag->pagi_freecount > 0) {
+			printk("agi %d has %d inodes in total and %d free\n",
+			       agno, pag->pagi_count, pag->pagi_freecount);
+			error = XFS_ERROR(ENOSPC);
+			goto error0;
+		}
+
+		/* Check the AGF: if levels[] == 1, then there should
+		 *  be no extra blocks in the btrees beyond the ones
+		 *  at the beginning of the AG
+		 */
+		if(pag->pagf_levels[XFS_BTNUM_BNOi] > 1 ||
+		   pag->pagf_levels[XFS_BTNUM_CNTi] > 1) {
+			printk("agf %d has level %d bt and %d cnt\n",
+			       agno,
+			       pag->pagf_levels[XFS_BTNUM_BNOi],
+			       pag->pagf_levels[XFS_BTNUM_CNTi]);
+			error = XFS_ERROR(ENOSPC);
+			goto error0;
 		}
+
+		freeblks = pag->pagf_freeblks;
+		printk("Usage: %d prealloc, %d flcount\n",
+		       XFS_PREALLOC_BLOCKS(mp), pag->pagf_flcount);
+
+		/* Done gathering data, check sizes */
+		usedblks = XFS_PREALLOC_BLOCKS(mp) + pag->pagf_flcount;
+		printk("agno=%d agf_length=%d computed used=%d"
+		       " known free=%d\n", agno, aglen, usedblks, freeblks);
+
+		if(usedblks + freeblks != aglen) {	/* anything beyond prealloc + freelist + free counts as allocated */
+			printk("agno %d is not free (%d blocks allocated)\n",
+			       agno, aglen-usedblks-freeblks);
+			error = XFS_ERROR(ENOSPC);
+			goto error0;
+		}
+		dbdelta -= aglen;
+		printk("will lower with %d\n",
+		       aglen - XFS_PREALLOC_BLOCKS(mp));
+		fdbdelta += aglen - XFS_PREALLOC_BLOCKS(mp);
+	}
+	/*
+	 * Check that we removed all blocks
+	 */
+	ASSERT(!dbdelta);
+	ASSERT(nagcount < oagcount);
+
+	printk("to free: %d, oagcount=%d, nagcount=%d\n",
+	       fdbdelta, oagcount, nagcount);
+
+	xfs_trans_agblocks_delta(tp, -((long)fdbdelta));
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, nb - mp->m_sb.sb_dblocks);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, -((int64_t)fdbdelta));
+
+	if (dpct)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+	error = xfs_trans_commit(tp, 0);
+	if (error) {
+		up_write(&mp->m_peraglock);
+		return error;
 	}
+	/* Free memory as the number of AGs has changed */
+	for (agno = nagcount; agno < oagcount; agno++)
+		if (mp->m_perag[agno].pagb_list)
+			kmem_free(mp->m_perag[agno].pagb_list,
+				  sizeof(xfs_perag_busy_t) *
+				  XFS_PAGB_NUM_SLOTS);
+
+	mp->m_perag = kmem_realloc(mp->m_perag,
+				   sizeof(xfs_perag_t) * nagcount,
+				   sizeof(xfs_perag_t) * oagcount,
+				   KM_SLEEP);
+	/* FIXME: here we could instead just lower
+	 * nagimax to nagcount; is it better this way?
+	 */
+	/* FIXME: why is this flag unconditionally set in growfs? */
+	mp->m_flags |= XFS_MOUNT_32BITINODES;
+	nagimax = xfs_initialize_perag(XFS_MTOVFS(mp), mp, nagcount);
+	up_write(&mp->m_peraglock);
+
+	xfs_update_sb(mp, nagimax, nagcount);
 	return 0;
 
  error0:
+	up_write(&mp->m_peraglock);	/* every goto error0 above is taken with m_peraglock held */
 	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
 	return error;
 }
@@ -435,7 +665,10 @@
 	int error;
 	if (!cpsema(&mp->m_growlock))
 		return XFS_ERROR(EWOULDBLOCK);
-	error = xfs_growfs_data_private(mp, in);
+	if(in->newblocks < mp->m_sb.sb_dblocks)
+		error = xfs_shrinkfs_data_private(mp, in);
+	else
+		error = xfs_growfs_data_private(mp, in);
 	vsema(&mp->m_growlock);
 	return error;
 }
@@ -633,7 +866,7 @@
 			xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
 			thaw_bdev(sb->s_bdev, sb);
 		}
-	
+
 		break;
 	}
 	case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
diff -X ignore -urN linux-2.6-xfs.cvs-orig/fs/xfs/xfs_trans.c linux-2.6-xfs.shrink/fs/xfs/xfs_trans.c
--- linux-2.6-xfs.cvs-orig/fs/xfs/xfs_trans.c	2007-06-05 17:40:51.000000000 +0200
+++ linux-2.6-xfs.shrink/fs/xfs/xfs_trans.c	2007-06-07 23:01:03.000000000 +0200
@@ -503,11 +503,9 @@
 		tp->t_res_frextents_delta += delta;
 		break;
 	case XFS_TRANS_SB_DBLOCKS:
-		ASSERT(delta > 0);
 		tp->t_dblocks_delta += delta;
 		break;
 	case XFS_TRANS_SB_AGCOUNT:
-		ASSERT(delta > 0);
 		tp->t_agcount_delta += delta;
 		break;
 	case XFS_TRANS_SB_IMAXPCT:

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

next             reply	other threads:[~2007-06-10 16:40 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-06-10 16:40 Iustin Pop [this message]
2007-06-12  2:40 ` [PATCH] Implement shrink of empty AGs David Chinner
2007-06-12  4:25   ` Eric Sandeen
2007-06-14  6:01   ` Iustin Pop
2007-06-14  9:00     ` David Chinner
2007-06-14 20:55       ` Iustin Pop
2007-06-14 22:16         ` David Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070610164014.GA10936@teal.hq.k1024.org \
    --to=iusty@k1024.org \
    --cc=xfs@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox