From: Iustin Pop <iusty@k1024.org>
To: xfs@oss.sgi.com
Subject: [PATCH] Implement shrink of empty AGs
Date: Sun, 10 Jun 2007 18:40:14 +0200 [thread overview]
Message-ID: <20070610164014.GA10936@teal.hq.k1024.org> (raw)
[-- Attachment #1.1: Type: text/plain, Size: 1412 bytes --]
The attached patch implements shrinking of completely empty allocation
groups. The patch is against current CVS and modifies two files:
- xfs_trans.c, to remove two asserts which prevent lowering the
number of AGs or filesystem blocks;
- xfs_fsops.c where it does:
- modify xfs_growfs_data() to branch to either
xfs_growfs_data_private or xfs_shrinkfs_data_private depending on
the new size of the fs
- abstract the last part of xfs_growfs_data_private (the modification
of all the superblocks) into a separate function, xfs_update_sb(),
which is called both from shrink and grow
- add the new xfs_shrinkfs_data_private function, mostly based on
the growfs function
There are many printk()s left in the patch; I left them in because they
show where I compute some important values. There are also many FIXMEs in
the comments showing which parts I didn't understand or was not sure about
(not that these are the only ones...). Probably for a real patch,
xfs-specific debug hooks need to be added and the printk()s removed.
The patch works on UML and QEMU virtual machines, both in UP and SMP. I
have tested many shrink/grow operations and verified with xfs_repair
that the fs is not corrupted. The free space counters seem to be correct
after a shrink.
Note that you also need to remove the check in xfs_growfs.c that
disallows shrinking the filesystem.
regards,
iustin
[-- Attachment #1.2: patch-nice-4 --]
[-- Type: text/plain, Size: 11582 bytes --]
diff -X ignore -urN linux-2.6-xfs.cvs-orig/fs/xfs/xfs_fsops.c linux-2.6-xfs.shrink/fs/xfs/xfs_fsops.c
--- linux-2.6-xfs.cvs-orig/fs/xfs/xfs_fsops.c 2007-06-09 18:56:21.509308225 +0200
+++ linux-2.6-xfs.shrink/fs/xfs/xfs_fsops.c 2007-06-10 18:32:36.074856477 +0200
@@ -112,6 +112,53 @@
return 0;
}
+static void xfs_update_sb(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_agnumber_t nagimax,
+ xfs_agnumber_t nagcount) /* new number of a.g. */
+{
+ xfs_agnumber_t agno;
+ xfs_buf_t *bp;
+ xfs_sb_t *sbp;
+ int error;
+
+ /* New allocation groups fully initialized, so update mount struct */
+ if (nagimax)
+ mp->m_maxagi = nagimax;
+ if (mp->m_sb.sb_imax_pct) {
+ __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+ do_div(icount, 100);
+ mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+ } else
+ mp->m_maxicount = 0;
+ for (agno = 1; agno < nagcount; agno++) {
+ error = xfs_read_buf(mp, mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ if (error) {
+ xfs_fs_cmn_err(CE_WARN, mp,
+ "error %d reading secondary superblock for ag %d",
+ error, agno);
+ break;
+ }
+ sbp = XFS_BUF_TO_SBP(bp);
+ xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS);
+ /*
+ * If we get an error writing out the alternate superblocks,
+ * just issue a warning and continue. The real work is
+ * already done and committed.
+ */
+ if (!(error = xfs_bwrite(mp, bp))) {
+ continue;
+ } else {
+ xfs_fs_cmn_err(CE_WARN, mp,
+ "write error %d updating secondary superblock for ag %d",
+ error, agno);
+ break; /* no point in continuing */
+ }
+ }
+}
+
static int
xfs_growfs_data_private(
xfs_mount_t *mp, /* mount point for filesystem */
@@ -135,7 +182,6 @@
xfs_rfsblock_t nfree;
xfs_agnumber_t oagcount;
int pct;
- xfs_sb_t *sbp;
xfs_trans_t *tp;
nb = in->newblocks;
@@ -356,44 +402,228 @@
if (error) {
return error;
}
- /* New allocation groups fully initialized, so update mount struct */
- if (nagimax)
- mp->m_maxagi = nagimax;
- if (mp->m_sb.sb_imax_pct) {
- __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
- do_div(icount, 100);
- mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
- } else
- mp->m_maxicount = 0;
- for (agno = 1; agno < nagcount; agno++) {
- error = xfs_read_buf(mp, mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ xfs_update_sb(mp, nagimax, nagcount);
+ return 0;
+
+ error0:
+ xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ return error;
+}
+
+static int
+xfs_shrinkfs_data_private(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_growfs_data_t *in) /* growfs data input struct */
+{
+ xfs_agf_t *agf;
+ xfs_agnumber_t agno;
+ xfs_buf_t *bp;
+ int dpct;
+ int error;
+ xfs_agnumber_t nagcount; /* new AG count */
+ xfs_agnumber_t oagcount; /* old AG count */
+ xfs_agnumber_t nagimax = 0;
+ xfs_rfsblock_t nb, nb_mod;
+ xfs_rfsblock_t dbdelta; /* will be used as a
+ check that we
+ shrink the fs by
+ the correct number
+ of blocks */
+ xfs_rfsblock_t fdbdelta; /* will keep track of
+ how many ag blocks
+ we need to
+ remove */
+ int pct;
+ xfs_trans_t *tp;
+
+ nb = in->newblocks;
+ pct = in->imaxpct;
+ if (nb >= mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+ return XFS_ERROR(EINVAL);
+ dpct = pct - mp->m_sb.sb_imax_pct;
+ error = xfs_read_buf(mp, mp->m_ddev_targp,
+ XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ if (error)
+ return error;
+ ASSERT(bp);
+ /* FIXME: we release the buffer here manually because we are
+ * outside of a transaction? The other buffers read using the
+ * functions which take a tp parameter are not released in
+ * growfs
+ */
+ xfs_buf_relse(bp);
+
+ /* Do basic checks (at the fs level) */
+ oagcount = mp->m_sb.sb_agcount;
+ nagcount = nb;
+ nb_mod = do_div(nagcount, mp->m_sb.sb_agblocks);
+ if(nb_mod) {
+ printk("not shrinking on an AG boundary (diff=%d)\n", nb_mod);
+ return XFS_ERROR(ENOSPC);
+ }
+ if(nagcount < 2) {
+ printk("refusing to shrink below 2 AGs\n");
+ return XFS_ERROR(ENOSPC);
+ }
+ if(nagcount >= oagcount) {
+ printk("number of AGs will not decrease\n");
+ return XFS_ERROR(EINVAL);
+ }
+ printk("Cur ag=%d, cur blocks=%llu\n",
+ mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks);
+ printk("New ag=%d, new blocks=%d\n", nagcount, nb);
+
+ printk("Will resize from %llu to %d, delta is %llu\n",
+ mp->m_sb.sb_dblocks, nb, mp->m_sb.sb_dblocks - nb);
+ /* Check to see if we trip over the log section */
+ printk("logstart=%llu logblocks=%u\n",
+ mp->m_sb.sb_logstart, mp->m_sb.sb_logblocks);
+ if (nb < mp->m_sb.sb_logstart + mp->m_sb.sb_logblocks)
+ return XFS_ERROR(EINVAL);
+ /* dbdelta starts at the diff and must become zero */
+ dbdelta = mp->m_sb.sb_dblocks - nb;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
+ printk("reserving %d\n", XFS_GROWFS_SPACE_RES(mp) + dbdelta);
+ if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp) + dbdelta,
+ XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ fdbdelta = 0;
+
+ /* Per-AG checks */
+ /* FIXME: do we need to hold m_peraglock while doing this? */
+ /* I think that since we do read and write to the m_perag
+ * stuff, we should be holding the lock for the entire walk &
+ * modify of the fs
+ */
+ /* Note that because we hold the lock, on any error+early
+ * return, we must either release manually and return, or
+ * jump to error0
+ */
+ down_write(&mp->m_peraglock);
+ for(agno = oagcount - 1; agno >= nagcount; agno--) {
+ xfs_extlen_t usedblks; /* total used blocks in this a.g. */
+ xfs_extlen_t freeblks; /* free blocks in this a.g. */
+ xfs_agblock_t aglen; /* this ag's len */
+ struct xfs_perag *pag; /* the m_perag structure */
+
+ printk("doing agno=%d\n", agno);
+
+ pag = &mp->m_perag[agno];
+
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
if (error) {
- xfs_fs_cmn_err(CE_WARN, mp,
- "error %d reading secondary superblock for ag %d",
- error, agno);
- break;
+ goto error0;
}
- sbp = XFS_BUF_TO_SBP(bp);
- xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS);
+ ASSERT(bp);
+ agf = XFS_BUF_TO_AGF(bp);
+ aglen = INT_GET(agf->agf_length, ARCH_CONVERT);
+
+ /* read the pagf/pagi if not already initialized */
+ /* agf should be initialized because of the above read_agf */
+ ASSERT(pag->pagf_init);
+ if (!pag->pagi_init) {
+ if ((error = xfs_ialloc_read_agi(mp, tp, agno, &bp)))
+ goto error0;
+ ASSERT(pag->pagi_init);
+ }
+
/*
- * If we get an error writing out the alternate superblocks,
- * just issue a warning and continue. The real work is
- * already done and committed.
+ * Check the inodes: as long as we have pagi_count ==
+ * pagi_freecount == 0, then: a) we don't have to
+ * update any global inode counters, and b) there are
+ * no extra blocks in inode btrees
*/
- if (!(error = xfs_bwrite(mp, bp))) {
- continue;
- } else {
- xfs_fs_cmn_err(CE_WARN, mp,
- "write error %d updating secondary superblock for ag %d",
- error, agno);
- break; /* no point in continuing */
+ if(pag->pagi_count > 0 ||
+ pag->pagi_freecount > 0) {
+ printk("agi %d has %d inodes in total and %d free\n",
+ agno, pag->pagi_count, pag->pagi_freecount);
+ error = XFS_ERROR(ENOSPC);
+ goto error0;
+ }
+
+ /* Check the AGF: if levels[] == 1, then there should
+ * be no extra blocks in the btrees beyond the ones
+ * at the beginning of the AG
+ */
+ if(pag->pagf_levels[XFS_BTNUM_BNOi] > 1 ||
+ pag->pagf_levels[XFS_BTNUM_CNTi] > 1) {
+ printk("agf %d has level %d bt and %d cnt\n",
+ agno,
+ pag->pagf_levels[XFS_BTNUM_BNOi],
+ pag->pagf_levels[XFS_BTNUM_CNTi]);
+ error = XFS_ERROR(ENOSPC);
+ goto error0;
}
+
+ freeblks = pag->pagf_freeblks;
+ printk("Usage: %d prealloc, %d flcount\n",
+ XFS_PREALLOC_BLOCKS(mp), pag->pagf_flcount);
+
+ /* Done gathering data, check sizes */
+ usedblks = XFS_PREALLOC_BLOCKS(mp) + pag->pagf_flcount;
+ printk("agno=%d agf_length=%d computed used=%d"
+ " known free=%d\n", agno, aglen, usedblks, freeblks);
+
+ if(usedblks + freeblks != aglen) {
+ printk("agno %d is not free (%d blocks allocated)\n",
+ agno, aglen-usedblks-freeblks);
+ error = XFS_ERROR(ENOSPC);
+ goto error0;
+ }
+ dbdelta -= aglen;
+ printk("will lower with %d\n",
+ aglen - XFS_PREALLOC_BLOCKS(mp));
+ fdbdelta += aglen - XFS_PREALLOC_BLOCKS(mp);
+ }
+ /*
+ * Check that we removed all blocks
+ */
+ ASSERT(!dbdelta);
+ ASSERT(nagcount < oagcount);
+
+ printk("to free: %d, oagcount=%d, nagcount=%d\n",
+ fdbdelta, oagcount, nagcount);
+
+ xfs_trans_agblocks_delta(tp, -((long)fdbdelta));
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, nb - mp->m_sb.sb_dblocks);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, -((int64_t)fdbdelta));
+
+ if (dpct)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+ error = xfs_trans_commit(tp, 0);
+ if (error) {
+ up_write(&mp->m_peraglock);
+ return error;
}
+ /* Free memory as the number of AG has changed */
+ for (agno = nagcount; agno < oagcount; agno++)
+ if (mp->m_perag[agno].pagb_list)
+ kmem_free(mp->m_perag[agno].pagb_list,
+ sizeof(xfs_perag_busy_t) *
+ XFS_PAGB_NUM_SLOTS);
+
+ mp->m_perag = kmem_realloc(mp->m_perag,
+ sizeof(xfs_perag_t) * nagcount,
+ sizeof(xfs_perag_t) * oagcount,
+ KM_SLEEP);
+ /* FIXME: here we could instead just lower
+ * nagimax to nagcount; is it better this way?
+ */
+ /* FIXME: why is this flag unconditionally set in growfs? */
+ mp->m_flags |= XFS_MOUNT_32BITINODES;
+ nagimax = xfs_initialize_perag(XFS_MTOVFS(mp), mp, nagcount);
+ up_write(&mp->m_peraglock);
+
+ xfs_update_sb(mp, nagimax, nagcount);
return 0;
error0:
+ up_write(&mp->m_peraglock);
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
return error;
}
@@ -435,7 +665,10 @@
int error;
if (!cpsema(&mp->m_growlock))
return XFS_ERROR(EWOULDBLOCK);
- error = xfs_growfs_data_private(mp, in);
+ if(in->newblocks < mp->m_sb.sb_dblocks)
+ error = xfs_shrinkfs_data_private(mp, in);
+ else
+ error = xfs_growfs_data_private(mp, in);
vsema(&mp->m_growlock);
return error;
}
@@ -633,7 +866,7 @@
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
thaw_bdev(sb->s_bdev, sb);
}
-
+
break;
}
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
diff -X ignore -urN linux-2.6-xfs.cvs-orig/fs/xfs/xfs_trans.c linux-2.6-xfs.shrink/fs/xfs/xfs_trans.c
--- linux-2.6-xfs.cvs-orig/fs/xfs/xfs_trans.c 2007-06-05 17:40:51.000000000 +0200
+++ linux-2.6-xfs.shrink/fs/xfs/xfs_trans.c 2007-06-07 23:01:03.000000000 +0200
@@ -503,11 +503,9 @@
tp->t_res_frextents_delta += delta;
break;
case XFS_TRANS_SB_DBLOCKS:
- ASSERT(delta > 0);
tp->t_dblocks_delta += delta;
break;
case XFS_TRANS_SB_AGCOUNT:
- ASSERT(delta > 0);
tp->t_agcount_delta += delta;
break;
case XFS_TRANS_SB_IMAXPCT:
[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
next reply other threads:[~2007-06-10 16:40 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-06-10 16:40 Iustin Pop [this message]
2007-06-12 2:40 ` [PATCH] Implement shrink of empty AGs David Chinner
2007-06-12 4:25 ` Eric Sandeen
2007-06-14 6:01 ` Iustin Pop
2007-06-14 9:00 ` David Chinner
2007-06-14 20:55 ` Iustin Pop
2007-06-14 22:16 ` David Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070610164014.GA10936@teal.hq.k1024.org \
--to=iusty@k1024.org \
--cc=xfs@oss.sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox