* [PATCH] xfs_repair: multithread phase 2
@ 2011-01-10 0:44 Dave Chinner
2011-01-10 7:57 ` Michael Monnerie
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Dave Chinner @ 2011-01-10 0:44 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
Running some recent repair tests on broken filesystem meant running
phase 1 and 2 repeatedly to reproduce an issue at the start of phase
3. Phase 2 was taking approximately 10 minutes to run as it
processes each AG serially.
Phase 2 can be trivially parallelised - it is simply scanning the
per AG trees to calculate free block counts and free and used inodes
counts. This can be done safely in parallel by giving each AG it's
own structure to aggregate counts into, then once the AG scan is
complete adding them all together.
This patch uses 32-way threading which results in no noticable
slowdown on single SATA drives with NCQ, but results in ~10x
reduction in runtime on a 12 disk RAID-0 array.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
repair/phase2.c | 20 +---
repair/scan.c | 294 +++++++++++++++++++++++++++++----------------------
repair/scan.h | 39 +------
repair/xfs_repair.c | 10 ++-
4 files changed, 182 insertions(+), 181 deletions(-)
diff --git a/repair/phase2.c b/repair/phase2.c
index e81ebf0..1e9377e 100644
--- a/repair/phase2.c
+++ b/repair/phase2.c
@@ -24,10 +24,9 @@
#include "err_protos.h"
#include "incore.h"
#include "progress.h"
+#include "scan.h"
void set_mp(xfs_mount_t *mpp);
-void scan_ag(xfs_agnumber_t agno);
-void validate_sb(struct xfs_sb *sb);
/* workaround craziness in the xlog routines */
int xlog_recover_do_trans(xlog_t *log, xlog_recover_t *t, int p) { return 0; }
@@ -107,9 +106,10 @@ zero_log(xfs_mount_t *mp)
*/
void
-phase2(xfs_mount_t *mp)
+phase2(
+ struct xfs_mount *mp,
+ int scan_threads)
{
- xfs_agnumber_t i;
int j;
ino_tree_node_t *ino_rec;
@@ -138,17 +138,7 @@ phase2(xfs_mount_t *mp)
set_progress_msg(PROG_FMT_SCAN_AG, (__uint64_t) glob_agcount);
- for (i = 0; i < mp->m_sb.sb_agcount; i++) {
- scan_ag(i);
-#ifdef XR_INODE_TRACE
- print_inode_list(i);
-#endif
- }
-
- /*
- * Validate that our manual counts match the superblock.
- */
- validate_sb(&mp->m_sb);
+ scan_ags(mp, scan_threads);
print_final_rpt();
diff --git a/repair/scan.c b/repair/scan.c
index 85017ff..afed693 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -28,6 +28,7 @@
#include "versions.h"
#include "bmap.h"
#include "progress.h"
+#include "threads.h"
extern int verify_set_agheader(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb,
xfs_agf_t *agf, xfs_agi_t *agi, xfs_agnumber_t i);
@@ -35,27 +36,31 @@ extern int verify_set_agheader(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb,
static xfs_mount_t *mp = NULL;
/*
- * Global variables to validate superblock values against the manual count
+ * Variables to validate AG header values against the manual count
* from the btree traversal.
- *
- * No locking for now as phase2 is not threaded.
*/
-static __uint64_t fdblocks;
-static __uint64_t icount;
-static __uint64_t ifreecount;
+struct aghdr_cnts {
+ xfs_agnumber_t agno;
+ xfs_extlen_t agffreeblks;
+ xfs_extlen_t agflongest;
+ __uint64_t agfbtreeblks;
+ __uint32_t agicount;
+ __uint32_t agifreecount;
+ __uint64_t fdblocks;
+ __uint64_t icount;
+ __uint64_t ifreecount;
+};
-/*
- * Global variables to validate AG header values against the manual count
- * from the btree traversal.
- *
- * Note: these values must be reset when processing a new AG, and for now
- * forces the AG scanning in phase2 to not be threaded.
- */
-static xfs_extlen_t agffreeblks;
-static xfs_extlen_t agflongest;
-static __uint64_t agfbtreeblks;
-static __uint32_t agicount;
-static __uint32_t agifreecount;
+static void
+scanfunc_allocbt(
+ struct xfs_btree_block *block,
+ int level,
+ xfs_agblock_t bno,
+ xfs_agnumber_t agno,
+ int suspect,
+ int isroot,
+ __uint32_t magic,
+ struct aghdr_cnts *agcnts);
void
set_mp(xfs_mount_t *mpp)
@@ -75,8 +80,10 @@ scan_sbtree(
xfs_agblock_t bno,
xfs_agnumber_t agno,
int suspect,
- int isroot),
- int isroot)
+ int isroot,
+ void *priv),
+ int isroot,
+ void *priv)
{
xfs_buf_t *bp;
@@ -86,7 +93,8 @@ scan_sbtree(
do_error(_("can't read btree block %d/%d\n"), agno, root);
return;
}
- (*func)(XFS_BUF_TO_BLOCK(bp), nlevels - 1, root, agno, suspect, isroot);
+ (*func)(XFS_BUF_TO_BLOCK(bp), nlevels - 1, root, agno, suspect,
+ isroot, priv);
libxfs_putbuf(bp);
}
@@ -468,7 +476,35 @@ _("out-of-order bmap key (file offset) in inode %llu, %s fork, fsbno %llu\n"),
return(0);
}
-void
+static void
+scanfunc_bno(
+ struct xfs_btree_block *block,
+ int level,
+ xfs_agblock_t bno,
+ xfs_agnumber_t agno,
+ int suspect,
+ int isroot,
+ void *agcnts)
+{
+ return scanfunc_allocbt(block, level, bno, agno,
+ suspect, isroot, XFS_ABTB_MAGIC, agcnts);
+}
+
+static void
+scanfunc_cnt(
+ struct xfs_btree_block *block,
+ int level,
+ xfs_agblock_t bno,
+ xfs_agnumber_t agno,
+ int suspect,
+ int isroot,
+ void *agcnts)
+{
+ return scanfunc_allocbt(block, level, bno, agno,
+ suspect, isroot, XFS_ABTC_MAGIC, agcnts);
+}
+
+static void
scanfunc_allocbt(
struct xfs_btree_block *block,
int level,
@@ -476,7 +512,8 @@ scanfunc_allocbt(
xfs_agnumber_t agno,
int suspect,
int isroot,
- __uint32_t magic)
+ __uint32_t magic,
+ struct aghdr_cnts *agcnts)
{
const char *name;
int i;
@@ -506,8 +543,8 @@ scanfunc_allocbt(
* free data block counter.
*/
if (!isroot) {
- agfbtreeblks++;
- fdblocks++;
+ agcnts->agfbtreeblks++;
+ agcnts->fdblocks++;
}
if (be16_to_cpu(block->bb_level) != level) {
@@ -583,10 +620,10 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
lastblock = b;
}
} else {
- fdblocks += len;
- agffreeblks += len;
- if (len > agflongest)
- agflongest = len;
+ agcnts->fdblocks += len;
+ agcnts->agffreeblks += len;
+ if (len > agcnts->agflongest)
+ agcnts->agflongest = len;
if (len < lastcount) {
do_warn(_(
"out-of-order cnt btree record %d (%u %u) block %u/%u\n"),
@@ -670,38 +707,12 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
if (bno != 0 && verify_agbno(mp, agno, bno)) {
scan_sbtree(bno, level, agno, suspect,
(magic == XFS_ABTB_MAGIC) ?
- scanfunc_bno : scanfunc_cnt, 0);
+ scanfunc_bno : scanfunc_cnt, 0,
+ (void *)agcnts);
}
}
}
-void
-scanfunc_bno(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot)
-{
- return scanfunc_allocbt(block, level, bno, agno,
- suspect, isroot, XFS_ABTB_MAGIC);
-}
-
-void
-scanfunc_cnt(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot
- )
-{
- return scanfunc_allocbt(block, level, bno, agno,
- suspect, isroot, XFS_ABTC_MAGIC);
-}
-
static int
scan_single_ino_chunk(
xfs_agnumber_t agno,
@@ -879,16 +890,17 @@ _("inode rec for ino %llu (%d/%d) overlaps existing rec (start %d/%d)\n"),
* get the start and alignment of the inode chunks right. Those chunks
* that we aren't sure about go into the uncertain list.
*/
-void
+static void
scanfunc_ino(
struct xfs_btree_block *block,
int level,
xfs_agblock_t bno,
xfs_agnumber_t agno,
int suspect,
- int isroot
- )
+ int isroot,
+ void *priv)
{
+ struct aghdr_cnts *agcnts = priv;
int i;
int numrecs;
int state;
@@ -968,10 +980,10 @@ _("inode btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
* the block. skip processing of bogus records.
*/
for (i = 0; i < numrecs; i++) {
- agicount += XFS_INODES_PER_CHUNK;
- icount += XFS_INODES_PER_CHUNK;
- agifreecount += be32_to_cpu(rp[i].ir_freecount);
- ifreecount += be32_to_cpu(rp[i].ir_freecount);
+ agcnts->agicount += XFS_INODES_PER_CHUNK;
+ agcnts->icount += XFS_INODES_PER_CHUNK;
+ agcnts->agifreecount += be32_to_cpu(rp[i].ir_freecount);
+ agcnts->ifreecount += be32_to_cpu(rp[i].ir_freecount);
suspect = scan_single_ino_chunk(agno, &rp[i], suspect);
}
@@ -1015,13 +1027,14 @@ _("inode btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
if (be32_to_cpu(pp[i]) != 0 && verify_agbno(mp, agno,
be32_to_cpu(pp[i])))
scan_sbtree(be32_to_cpu(pp[i]), level, agno,
- suspect, scanfunc_ino, 0);
+ suspect, scanfunc_ino, 0, priv);
}
}
-void
+static void
scan_freelist(
- xfs_agf_t *agf)
+ xfs_agf_t *agf,
+ struct aghdr_cnts *agcnts)
{
xfs_agfl_t *agfl;
xfs_buf_t *agflbuf;
@@ -1068,7 +1081,7 @@ scan_freelist(
be32_to_cpu(agf->agf_flcount), agno);
}
- fdblocks += count;
+ agcnts->fdblocks += count;
libxfs_putbuf(agflbuf);
}
@@ -1076,14 +1089,15 @@ scan_freelist(
static void
validate_agf(
struct xfs_agf *agf,
- xfs_agnumber_t agno)
+ xfs_agnumber_t agno,
+ struct aghdr_cnts *agcnts)
{
xfs_agblock_t bno;
bno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
if (bno != 0 && verify_agbno(mp, agno, bno)) {
scan_sbtree(bno, be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
- agno, 0, scanfunc_bno, 1);
+ agno, 0, scanfunc_bno, 1, agcnts);
} else {
do_warn(_("bad agbno %u for btbno root, agno %d\n"),
bno, agno);
@@ -1092,33 +1106,34 @@ validate_agf(
bno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
if (bno != 0 && verify_agbno(mp, agno, bno)) {
scan_sbtree(bno, be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
- agno, 0, scanfunc_cnt, 1);
+ agno, 0, scanfunc_cnt, 1, agcnts);
} else {
do_warn(_("bad agbno %u for btbcnt root, agno %d\n"),
bno, agno);
}
- if (be32_to_cpu(agf->agf_freeblks) != agffreeblks) {
+ if (be32_to_cpu(agf->agf_freeblks) != agcnts->agffreeblks) {
do_warn(_("agf_freeblks %u, counted %u in ag %u\n"),
- be32_to_cpu(agf->agf_freeblks), agffreeblks, agno);
+ be32_to_cpu(agf->agf_freeblks), agcnts->agffreeblks, agno);
}
- if (be32_to_cpu(agf->agf_longest) != agflongest) {
+ if (be32_to_cpu(agf->agf_longest) != agcnts->agflongest) {
do_warn(_("agf_longest %u, counted %u in ag %u\n"),
- be32_to_cpu(agf->agf_longest), agflongest, agno);
+ be32_to_cpu(agf->agf_longest), agcnts->agflongest, agno);
}
if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
- be32_to_cpu(agf->agf_btreeblks) != agfbtreeblks) {
+ be32_to_cpu(agf->agf_btreeblks) != agcnts->agfbtreeblks) {
do_warn(_("agf_btreeblks %u, counted %u in ag %u\n"),
- be32_to_cpu(agf->agf_btreeblks), agfbtreeblks, agno);
+ be32_to_cpu(agf->agf_btreeblks), agcnts->agfbtreeblks, agno);
}
}
static void
validate_agi(
struct xfs_agi *agi,
- xfs_agnumber_t agno)
+ xfs_agnumber_t agno,
+ struct aghdr_cnts *agcnts)
{
xfs_agblock_t bno;
int i;
@@ -1126,20 +1141,20 @@ validate_agi(
bno = be32_to_cpu(agi->agi_root);
if (bno != 0 && verify_agbno(mp, agno, bno)) {
scan_sbtree(bno, be32_to_cpu(agi->agi_level),
- agno, 0, scanfunc_ino, 1);
+ agno, 0, scanfunc_ino, 1, agcnts);
} else {
do_warn(_("bad agbno %u for inobt root, agno %d\n"),
be32_to_cpu(agi->agi_root), agno);
}
- if (be32_to_cpu(agi->agi_count) != agicount) {
+ if (be32_to_cpu(agi->agi_count) != agcnts->agicount) {
do_warn(_("agi_count %u, counted %u in ag %u\n"),
- be32_to_cpu(agi->agi_count), agicount, agno);
+ be32_to_cpu(agi->agi_count), agcnts->agicount, agno);
}
- if (be32_to_cpu(agi->agi_freecount) != agifreecount) {
+ if (be32_to_cpu(agi->agi_freecount) != agcnts->agifreecount) {
do_warn(_("agi_freecount %u, counted %u in ag %u\n"),
- be32_to_cpu(agi->agi_freecount), agifreecount, agno);
+ be32_to_cpu(agi->agi_freecount), agcnts->agifreecount, agno);
}
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
@@ -1155,42 +1170,15 @@ validate_agi(
}
/*
- * Validate block/inode counts in the superblock.
- *
- * Note: needs to be called after scan_ag() has been called for all
- * allocation groups.
- */
-void
-validate_sb(
- struct xfs_sb *sb)
-{
- if (sb->sb_icount != icount) {
- do_warn(_("sb_icount %lld, counted %lld\n"),
- sb->sb_icount, icount);
- }
-
- if (sb->sb_ifree != ifreecount) {
- do_warn(_("sb_ifree %lld, counted %lld\n"),
- sb->sb_ifree, ifreecount);
- }
-
- if (sb->sb_fdblocks != fdblocks) {
- do_warn(_("sb_fdblocks %lld, counted %lld\n"),
- sb->sb_fdblocks, fdblocks);
- }
-
- /* XXX: check sb_frextents */
-}
-
-/*
* Scan an AG for obvious corruption.
- *
- * Note: This code is not reentrant due to the use of global variables.
*/
-void
+static void
scan_ag(
- xfs_agnumber_t agno)
+ work_queue_t *wq,
+ xfs_agnumber_t agno,
+ void *arg)
{
+ struct aghdr_cnts *agcnts = arg;
xfs_agf_t *agf;
xfs_buf_t *agfbuf;
int agf_dirty = 0;
@@ -1202,16 +1190,6 @@ scan_ag(
int sb_dirty = 0;
int status;
- /*
- * Reset the global variables to track the AG header validity.
- *
- * Because we use global variable but can get called multiple times
- * we have to make sure to always reset these variables.
- */
- agicount = agifreecount = 0;
- agffreeblks = agfbtreeblks = 0;
- agflongest = 0;
-
sbbuf = libxfs_readbuf(mp->m_dev, XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
XFS_FSS_TO_BB(mp, 1), 0);
if (!sbbuf) {
@@ -1301,10 +1279,10 @@ scan_ag(
return;
}
- scan_freelist(agf);
+ scan_freelist(agf, agcnts);
- validate_agf(agf, agno);
- validate_agi(agi, agno);
+ validate_agf(agf, agno, agcnts);
+ validate_agi(agi, agno, agcnts);
ASSERT(agi_dirty == 0 || (agi_dirty && !no_modify));
@@ -1331,4 +1309,64 @@ scan_ag(
libxfs_putbuf(sbbuf);
free(sb);
PROG_RPT_INC(prog_rpt_done[agno], 1);
+
+#ifdef XR_INODE_TRACE
+ print_inode_list(i);
+#endif
+ return;
}
+
+#define SCAN_THREADS 32
+
+void
+scan_ags(
+ struct xfs_mount *mp,
+ int scan_threads)
+{
+ struct aghdr_cnts *agcnts;
+ __uint64_t fdblocks = 0;
+ __uint64_t icount = 0;
+ __uint64_t ifreecount = 0;
+ xfs_agnumber_t i;
+ work_queue_t wq;
+
+ agcnts = malloc(mp->m_sb.sb_agcount * sizeof(*agcnts));
+ if (!agcnts) {
+ do_abort(_("no memory for ag header counts\n"));
+ return;
+ }
+ memset(agcnts, 0, mp->m_sb.sb_agcount * sizeof(*agcnts));
+
+ create_work_queue(&wq, mp, scan_threads);
+
+ for (i = 0; i < mp->m_sb.sb_agcount; i++)
+ queue_work(&wq, scan_ag, i, &agcnts[i]);
+
+ destroy_work_queue(&wq);
+
+ /* tally up the counts */
+ for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+ fdblocks += agcnts[i].fdblocks;
+ icount += agcnts[i].icount;
+ ifreecount += agcnts[i].ifreecount;
+ }
+
+ /*
+ * Validate that our manual counts match the superblock.
+ */
+ if (mp->m_sb.sb_icount != icount) {
+ do_warn(_("sb_icount %lld, counted %lld\n"),
+ mp->m_sb.sb_icount, icount);
+ }
+
+ if (mp->m_sb.sb_ifree != ifreecount) {
+ do_warn(_("sb_ifree %lld, counted %lld\n"),
+ mp->m_sb.sb_ifree, ifreecount);
+ }
+
+ if (mp->m_sb.sb_fdblocks != fdblocks) {
+ do_warn(_("sb_fdblocks %lld, counted %lld\n"),
+ mp->m_sb.sb_fdblocks, fdblocks);
+ }
+}
+
diff --git a/repair/scan.h b/repair/scan.h
index 20567fb..9f945cf 100644
--- a/repair/scan.h
+++ b/repair/scan.h
@@ -20,19 +20,6 @@
struct blkmap;
-void scan_sbtree(
- xfs_agblock_t root,
- int nlevels,
- xfs_agnumber_t agno,
- int suspect,
- void (*func)(struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot),
- int isroot);
-
int scan_lbtree(
xfs_dfsbno_t root,
int nlevels,
@@ -74,29 +61,9 @@ int scanfunc_bmap(
int check_dups,
int *dirty);
-void scanfunc_bno(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot);
-
-void scanfunc_cnt(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot);
-
void
-scanfunc_ino(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot);
+scan_ags(
+ struct xfs_mount *mp,
+ int scan_threads);
#endif /* _XR_SCAN_H */
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index 1d9ad46..4707b83 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -33,7 +33,7 @@
#define rounddown(x, y) (((x)/(y))*(y))
extern void phase1(xfs_mount_t *);
-extern void phase2(xfs_mount_t *);
+extern void phase2(xfs_mount_t *, int);
extern void phase3(xfs_mount_t *);
extern void phase4(xfs_mount_t *);
extern void phase5(xfs_mount_t *);
@@ -63,6 +63,8 @@ char *o_opts[] = {
"ag_stride",
#define FORCE_GEO 5
"force_geometry",
+#define PHASE2_THREADS 6
+ "phase2_threads",
NULL
};
@@ -80,6 +82,7 @@ char *c_opts[] = {
static int ihash_option_used;
static int bhash_option_used;
static long max_mem_specified; /* in megabytes */
+static int phase2_threads = 32;
static void
usage(void)
@@ -266,6 +269,9 @@ process_args(int argc, char **argv)
respec('o', o_opts, FORCE_GEO);
force_geo = 1;
break;
+ case PHASE2_THREADS:
+ phase2_threads = (int)strtol(val, NULL, 0);
+ break;
default:
unknown('o', val);
break;
@@ -709,7 +715,7 @@ main(int argc, char **argv)
}
/* make sure the per-ag freespace maps are ok so we can mount the fs */
- phase2(mp);
+ phase2(mp, phase2_threads);
timestamp(PHASE_END, 2, NULL);
if (do_prefetch)
--
1.7.2.3
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-10 0:44 [PATCH] xfs_repair: multithread phase 2 Dave Chinner
@ 2011-01-10 7:57 ` Michael Monnerie
2011-01-10 8:41 ` Dave Chinner
2011-01-10 18:55 ` Christoph Hellwig
2011-02-01 23:39 ` Alex Elder
2 siblings, 1 reply; 13+ messages in thread
From: Michael Monnerie @ 2011-01-10 7:57 UTC (permalink / raw)
To: xfs
[-- Attachment #1.1: Type: Text/Plain, Size: 714 bytes --]
On Montag, 10. Januar 2011 Dave Chinner wrote:
> This patch uses 32-way threading which results in no noticable
> slowdown on single SATA drives with NCQ, but results in ~10x
> reduction in runtime on a 12 disk RAID-0 array.
Is the fixed 32-way number reasonable, or shouldn't that be "number of
available cpu cores"-way? Why threading when you have a single core cpu?
--
mit freundlichen Grüssen,
Michael Monnerie, Ing. BSc
it-management Internet Services: Protéger
http://proteger.at [gesprochen: Prot-e-schee]
Tel: +43 660 / 415 6531
// ****** Radiointerview zum Thema Spam ******
// http://www.it-podcast.at/archiv.html#podcast-100716
//
// Haus zu verkaufen: http://zmi.at/langegg/
[-- Attachment #1.2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 198 bytes --]
[-- Attachment #2: Type: text/plain, Size: 121 bytes --]
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-10 7:57 ` Michael Monnerie
@ 2011-01-10 8:41 ` Dave Chinner
2011-01-10 13:25 ` Michael Monnerie
2011-01-10 19:17 ` Christoph Hellwig
0 siblings, 2 replies; 13+ messages in thread
From: Dave Chinner @ 2011-01-10 8:41 UTC (permalink / raw)
To: Michael Monnerie; +Cc: xfs
On Mon, Jan 10, 2011 at 08:57:52AM +0100, Michael Monnerie wrote:
> On Montag, 10. Januar 2011 Dave Chinner wrote:
> > This patch uses 32-way threading which results in no noticable
> > slowdown on single SATA drives with NCQ, but results in ~10x
> > reduction in runtime on a 12 disk RAID-0 array.
>
> Is the fixed 32-way number reasonable, or shouldn't that be "number of
> available cpu cores"-way? Why threading when you have a single core cpu?
Sure, 32-way is reasonable on a single disk and CPU. Pretty much
every sata disk supports NCQ these days, and default to a depth of
32, which means we can have 32 concurrent reads in progress at once.
Phase 2 is all synchronous IO, so the only way to hide the IO
latency is to queue work to multiple threads and switch between the
threadsto work on another queue when the current one blocks waiting
for IO.
Basically the threading being used to drive IO level concurrency,
not to drive CPU level concurrency - the total CPU usage of phase 2
is less than what even a slow CPU can provide, so to keep it busy we
need lots of concurrent IO streams in progress at once...
And if you want to change it, there's a command line option for it.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-10 8:41 ` Dave Chinner
@ 2011-01-10 13:25 ` Michael Monnerie
2011-01-10 19:18 ` Christoph Hellwig
2011-01-10 19:17 ` Christoph Hellwig
1 sibling, 1 reply; 13+ messages in thread
From: Michael Monnerie @ 2011-01-10 13:25 UTC (permalink / raw)
To: xfs
[-- Attachment #1.1: Type: Text/Plain, Size: 1293 bytes --]
On Montag, 10. Januar 2011 Dave Chinner wrote:
> Pretty much
> every sata disk supports NCQ these days, and default to a depth of
> 32, which means we can have 32 concurrent reads in progress at once.
> Phase 2 is all synchronous IO, so the only way to hide the IO
> latency is to queue work to multiple threads and switch between the
> threadsto work on another queue when the current one blocks waiting
> for IO.
This is interesting. Did you measure this with a rotating single disk?
Is the idle time between two synchronous reads bigger than the time
needed to move the disk head to another cylinder and read a sector? That
takes ~15ms on a normal disk, incredibly long compared to cpu speed.
Even with NCQ, the disk would have to swing the head a lot, and just
from thinking about it I wouldn't believe that it's faster like this.
But I'm sure you tested it so I take it as given that it's like that.
Cool improvement, btw :-)
--
mit freundlichen Grüssen,
Michael Monnerie, Ing. BSc
it-management Internet Services: Protéger
http://proteger.at [gesprochen: Prot-e-schee]
Tel: +43 660 / 415 6531
// ****** Radiointerview zum Thema Spam ******
// http://www.it-podcast.at/archiv.html#podcast-100716
//
// Haus zu verkaufen: http://zmi.at/langegg/
[-- Attachment #1.2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 198 bytes --]
[-- Attachment #2: Type: text/plain, Size: 121 bytes --]
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-10 13:25 ` Michael Monnerie
@ 2011-01-10 19:18 ` Christoph Hellwig
0 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2011-01-10 19:18 UTC (permalink / raw)
To: Michael Monnerie; +Cc: xfs
On Mon, Jan 10, 2011 at 02:25:47PM +0100, Michael Monnerie wrote:
> On Montag, 10. Januar 2011 Dave Chinner wrote:
> > Pretty much
> > every sata disk supports NCQ these days, and default to a depth of
> > 32, which means we can have 32 concurrent reads in progress at once.
> > Phase 2 is all synchronous IO, so the only way to hide the IO
> > latency is to queue work to multiple threads and switch between the
> > threadsto work on another queue when the current one blocks waiting
> > for IO.
>
> This is interesting. Did you measure this with a rotating single disk?
Take a look at the patch description.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-10 8:41 ` Dave Chinner
2011-01-10 13:25 ` Michael Monnerie
@ 2011-01-10 19:17 ` Christoph Hellwig
2011-01-10 21:53 ` Matthias Schniedermeyer
1 sibling, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2011-01-10 19:17 UTC (permalink / raw)
To: Dave Chinner; +Cc: Michael Monnerie, xfs
On Mon, Jan 10, 2011 at 07:41:22PM +1100, Dave Chinner wrote:
> On Mon, Jan 10, 2011 at 08:57:52AM +0100, Michael Monnerie wrote:
> > On Montag, 10. Januar 2011 Dave Chinner wrote:
> > > This patch uses 32-way threading which results in no noticable
> > > slowdown on single SATA drives with NCQ, but results in ~10x
> > > reduction in runtime on a 12 disk RAID-0 array.
> >
> > Is the fixed 32-way number reasonable, or shouldn't that be "number of
> > available cpu cores"-way? Why threading when you have a single core cpu?
>
> Sure, 32-way is reasonable on a single disk and CPU. Pretty much
> every sata disk supports NCQ these days, and default to a depth of
> 32, which means we can have 32 concurrent reads in progress at once.
> Phase 2 is all synchronous IO, so the only way to hide the IO
> latency is to queue work to multiple threads and switch between the
> threadsto work on another queue when the current one blocks waiting
> for IO.
The default queue depth for ATA NCQ actually is 31, not 32 for some odd
reason.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-10 19:17 ` Christoph Hellwig
@ 2011-01-10 21:53 ` Matthias Schniedermeyer
0 siblings, 0 replies; 13+ messages in thread
From: Matthias Schniedermeyer @ 2011-01-10 21:53 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Michael Monnerie, xfs
On 10.01.2011 14:17, Christoph Hellwig wrote:
> On Mon, Jan 10, 2011 at 07:41:22PM +1100, Dave Chinner wrote:
> > On Mon, Jan 10, 2011 at 08:57:52AM +0100, Michael Monnerie wrote:
> > > On Montag, 10. Januar 2011 Dave Chinner wrote:
> > > > This patch uses 32-way threading which results in no noticable
> > > > slowdown on single SATA drives with NCQ, but results in ~10x
> > > > reduction in runtime on a 12 disk RAID-0 array.
> > >
> > > Is the fixed 32-way number reasonable, or shouldn't that be "number of
> > > available cpu cores"-way? Why threading when you have a single core cpu?
> >
> > Sure, 32-way is reasonable on a single disk and CPU. Pretty much
> > every sata disk supports NCQ these days, and default to a depth of
> > 32, which means we can have 32 concurrent reads in progress at once.
> > Phase 2 is all synchronous IO, so the only way to hide the IO
> > latency is to queue work to multiple threads and switch between the
> > threadsto work on another queue when the current one blocks waiting
> > for IO.
>
> The default queue depth for ATA NCQ actually is 31, not 32 for some odd
> reason.
AFAIR the original discussion it's because Depth 0 and 32 use the same
value. And the knowledge that (HDD-)firmware tends to be buggy, so it
was decided to stay on the safe side and not use "0/32".
Bis denn
--
Real Programmers consider "what you see is what you get" to be just as
bad a concept in Text Editors as it is in women. No, the Real Programmer
wants a "you asked for it, you got it" text editor -- complicated,
cryptic, powerful, unforgiving, dangerous.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-10 0:44 [PATCH] xfs_repair: multithread phase 2 Dave Chinner
2011-01-10 7:57 ` Michael Monnerie
@ 2011-01-10 18:55 ` Christoph Hellwig
2011-02-01 23:39 ` Alex Elder
2 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2011-01-10 18:55 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
Looks good except for some trivial nitpicks below,
Reviewed-by: Christoph Hellwig <hch@lst.de>
Btw, your previous patch used just
"repair:" as the Subject prefix, while this one uses xfs_repair. I
don't really care about, but we should standardize on one. The more
recent usage seems to include the xfs_ prefix.
> + scanfunc_bno : scanfunc_cnt, 0,
> + (void *)agcnts);
no need for the void cast.
> +#define SCAN_THREADS 32
this is unused now.
> + agcnts = malloc(mp->m_sb.sb_agcount * sizeof(*agcnts));
> + if (!agcnts) {
> + do_abort(_("no memory for ag header counts\n"));
> + return;
> + }
> + memset(agcnts, 0, mp->m_sb.sb_agcount * sizeof(*agcnts));
this could use a calloc.
> break;
> + case PHASE2_THREADS:
> + phase2_threads = (int)strtol(val, NULL, 0);
> + break;
This option also needs to be documented in the man page. Also shouldn't
we try to handle errors from strtol? Also maybe strtoul would be a
better choice as we certainly don't want a negative number of threads.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-10 0:44 [PATCH] xfs_repair: multithread phase 2 Dave Chinner
2011-01-10 7:57 ` Michael Monnerie
2011-01-10 18:55 ` Christoph Hellwig
@ 2011-02-01 23:39 ` Alex Elder
2 siblings, 0 replies; 13+ messages in thread
From: Alex Elder @ 2011-02-01 23:39 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Mon, 2011-01-10 at 11:44 +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Running some recent repair tests on broken filesystem meant running
> phase 1 and 2 repeatedly to reproduce an issue at the start of phase
> 3. Phase 2 was taking approximately 10 minutes to run as it
> processes each AG serially.
>
> Phase 2 can be trivially parallelised - it is simply scanning the
> per AG trees to calculate free block counts and free and used inodes
> counts. This can be done safely in parallel by giving each AG it's
> own structure to aggregate counts into, then once the AG scan is
> complete adding them all together.
>
> This patch uses 32-way threading which results in no noticable
> slowdown on single SATA drives with NCQ, but results in ~10x
> reduction in runtime on a 12 disk RAID-0 array.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
This looks good. Sorry I didn't say so earlier (I signed
off on your first one).
Reviewed-by: Alex Elder <aelder@sgi.com>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH] xfs_repair: multithread phase 2
@ 2011-01-04 6:13 Dave Chinner
2011-01-04 10:02 ` Christoph Hellwig
2011-01-05 23:42 ` Alex Elder
0 siblings, 2 replies; 13+ messages in thread
From: Dave Chinner @ 2011-01-04 6:13 UTC (permalink / raw)
To: xfs
From: Dave Chinner <dchinner@redhat.com>
Running some recent repair tests on broken filesystem meant running
phase 1 and 2 repeatedly to reproduce an issue at the start of phase
3. Phase 2 was taking approximately 10 minutes to run as it
processes each AG serially.
Phase 2 can be trivially parallelised - it is simply scanning the
per AG trees to calculate free block counts and free and used inodes
counts. This can be done safely in parallel by giving each AG it's
own structure to aggregate counts into, then once the AG scan is
complete adding them all together.
This patch uses 32-way threading which results in no noticable
slowdown on single SATA drives with NCQ, but results in ~10x
reduction in runtime on a 12 disk RAID-0 array.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
repair/phase2.c | 16 +---
repair/scan.c | 303 +++++++++++++++++++++++++++++++-----------------------
repair/scan.h | 37 -------
3 files changed, 176 insertions(+), 180 deletions(-)
diff --git a/repair/phase2.c b/repair/phase2.c
index e81ebf0..2f22c51 100644
--- a/repair/phase2.c
+++ b/repair/phase2.c
@@ -26,8 +26,7 @@
#include "progress.h"
void set_mp(xfs_mount_t *mpp);
-void scan_ag(xfs_agnumber_t agno);
-void validate_sb(struct xfs_sb *sb);
+void scan_ags(struct xfs_mount *mp);
/* workaround craziness in the xlog routines */
int xlog_recover_do_trans(xlog_t *log, xlog_recover_t *t, int p) { return 0; }
@@ -109,7 +108,6 @@ zero_log(xfs_mount_t *mp)
void
phase2(xfs_mount_t *mp)
{
- xfs_agnumber_t i;
int j;
ino_tree_node_t *ino_rec;
@@ -138,17 +136,7 @@ phase2(xfs_mount_t *mp)
set_progress_msg(PROG_FMT_SCAN_AG, (__uint64_t) glob_agcount);
- for (i = 0; i < mp->m_sb.sb_agcount; i++) {
- scan_ag(i);
-#ifdef XR_INODE_TRACE
- print_inode_list(i);
-#endif
- }
-
- /*
- * Validate that our manual counts match the superblock.
- */
- validate_sb(&mp->m_sb);
+ scan_ags(mp);
print_final_rpt();
diff --git a/repair/scan.c b/repair/scan.c
index 85017ff..dd62776 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -35,27 +35,32 @@ extern int verify_set_agheader(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb,
static xfs_mount_t *mp = NULL;
/*
- * Global variables to validate superblock values against the manual count
+ * Variables to validate AG header values against the manual count
* from the btree traversal.
- *
- * No locking for now as phase2 is not threaded.
*/
-static __uint64_t fdblocks;
-static __uint64_t icount;
-static __uint64_t ifreecount;
+struct aghdr_cnts {
+ xfs_agnumber_t agno;
+ xfs_extlen_t agffreeblks;
+ xfs_extlen_t agflongest;
+ __uint64_t agfbtreeblks;
+ __uint32_t agicount;
+ __uint32_t agifreecount;
+ __uint64_t fdblocks;
+ __uint64_t icount;
+ __uint64_t ifreecount;
+};
+
+static void
+scanfunc_allocbt(
+ struct xfs_btree_block *block,
+ int level,
+ xfs_agblock_t bno,
+ xfs_agnumber_t agno,
+ int suspect,
+ int isroot,
+ __uint32_t magic,
+ struct aghdr_cnts *agcnts);
-/*
- * Global variables to validate AG header values against the manual count
- * from the btree traversal.
- *
- * Note: these values must be reset when processing a new AG, and for now
- * forces the AG scanning in phase2 to not be threaded.
- */
-static xfs_extlen_t agffreeblks;
-static xfs_extlen_t agflongest;
-static __uint64_t agfbtreeblks;
-static __uint32_t agicount;
-static __uint32_t agifreecount;
void
set_mp(xfs_mount_t *mpp)
@@ -75,8 +80,10 @@ scan_sbtree(
xfs_agblock_t bno,
xfs_agnumber_t agno,
int suspect,
- int isroot),
- int isroot)
+ int isroot,
+ struct aghdr_cnts *agcnts),
+ int isroot,
+ struct aghdr_cnts *agcnts)
{
xfs_buf_t *bp;
@@ -86,7 +93,8 @@ scan_sbtree(
do_error(_("can't read btree block %d/%d\n"), agno, root);
return;
}
- (*func)(XFS_BUF_TO_BLOCK(bp), nlevels - 1, root, agno, suspect, isroot);
+ (*func)(XFS_BUF_TO_BLOCK(bp), nlevels - 1, root, agno, suspect,
+ isroot, agcnts);
libxfs_putbuf(bp);
}
@@ -469,6 +477,34 @@ _("out-of-order bmap key (file offset) in inode %llu, %s fork, fsbno %llu\n"),
}
void
+scanfunc_bno(
+ struct xfs_btree_block *block,
+ int level,
+ xfs_agblock_t bno,
+ xfs_agnumber_t agno,
+ int suspect,
+ int isroot,
+ struct aghdr_cnts *agcnts)
+{
+ return scanfunc_allocbt(block, level, bno, agno,
+ suspect, isroot, XFS_ABTB_MAGIC, agcnts);
+}
+
+void
+scanfunc_cnt(
+ struct xfs_btree_block *block,
+ int level,
+ xfs_agblock_t bno,
+ xfs_agnumber_t agno,
+ int suspect,
+ int isroot,
+ struct aghdr_cnts *agcnts)
+{
+ return scanfunc_allocbt(block, level, bno, agno,
+ suspect, isroot, XFS_ABTC_MAGIC, agcnts);
+}
+
+void
scanfunc_allocbt(
struct xfs_btree_block *block,
int level,
@@ -476,7 +512,8 @@ scanfunc_allocbt(
xfs_agnumber_t agno,
int suspect,
int isroot,
- __uint32_t magic)
+ __uint32_t magic,
+ struct aghdr_cnts *agcnts)
{
const char *name;
int i;
@@ -506,8 +543,8 @@ scanfunc_allocbt(
* free data block counter.
*/
if (!isroot) {
- agfbtreeblks++;
- fdblocks++;
+ agcnts->agfbtreeblks++;
+ agcnts->fdblocks++;
}
if (be16_to_cpu(block->bb_level) != level) {
@@ -583,10 +620,10 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
lastblock = b;
}
} else {
- fdblocks += len;
- agffreeblks += len;
- if (len > agflongest)
- agflongest = len;
+ agcnts->fdblocks += len;
+ agcnts->agffreeblks += len;
+ if (len > agcnts->agflongest)
+ agcnts->agflongest = len;
if (len < lastcount) {
do_warn(_(
"out-of-order cnt btree record %d (%u %u) block %u/%u\n"),
@@ -670,38 +707,12 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
if (bno != 0 && verify_agbno(mp, agno, bno)) {
scan_sbtree(bno, level, agno, suspect,
(magic == XFS_ABTB_MAGIC) ?
- scanfunc_bno : scanfunc_cnt, 0);
+ scanfunc_bno : scanfunc_cnt, 0,
+ agcnts);
}
}
}
-void
-scanfunc_bno(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot)
-{
- return scanfunc_allocbt(block, level, bno, agno,
- suspect, isroot, XFS_ABTB_MAGIC);
-}
-
-void
-scanfunc_cnt(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot
- )
-{
- return scanfunc_allocbt(block, level, bno, agno,
- suspect, isroot, XFS_ABTC_MAGIC);
-}
-
static int
scan_single_ino_chunk(
xfs_agnumber_t agno,
@@ -886,8 +897,8 @@ scanfunc_ino(
xfs_agblock_t bno,
xfs_agnumber_t agno,
int suspect,
- int isroot
- )
+ int isroot,
+ struct aghdr_cnts *agcnts)
{
int i;
int numrecs;
@@ -968,10 +979,10 @@ _("inode btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
* the block. skip processing of bogus records.
*/
for (i = 0; i < numrecs; i++) {
- agicount += XFS_INODES_PER_CHUNK;
- icount += XFS_INODES_PER_CHUNK;
- agifreecount += be32_to_cpu(rp[i].ir_freecount);
- ifreecount += be32_to_cpu(rp[i].ir_freecount);
+ agcnts->agicount += XFS_INODES_PER_CHUNK;
+ agcnts->icount += XFS_INODES_PER_CHUNK;
+ agcnts->agifreecount += be32_to_cpu(rp[i].ir_freecount);
+ agcnts->ifreecount += be32_to_cpu(rp[i].ir_freecount);
suspect = scan_single_ino_chunk(agno, &rp[i], suspect);
}
@@ -1015,13 +1026,14 @@ _("inode btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
if (be32_to_cpu(pp[i]) != 0 && verify_agbno(mp, agno,
be32_to_cpu(pp[i])))
scan_sbtree(be32_to_cpu(pp[i]), level, agno,
- suspect, scanfunc_ino, 0);
+ suspect, scanfunc_ino, 0, agcnts);
}
}
void
scan_freelist(
- xfs_agf_t *agf)
+ xfs_agf_t *agf,
+ struct aghdr_cnts *agcnts)
{
xfs_agfl_t *agfl;
xfs_buf_t *agflbuf;
@@ -1068,7 +1080,7 @@ scan_freelist(
be32_to_cpu(agf->agf_flcount), agno);
}
- fdblocks += count;
+ agcnts->fdblocks += count;
libxfs_putbuf(agflbuf);
}
@@ -1076,14 +1088,15 @@ scan_freelist(
static void
validate_agf(
struct xfs_agf *agf,
- xfs_agnumber_t agno)
+ xfs_agnumber_t agno,
+ struct aghdr_cnts *agcnts)
{
xfs_agblock_t bno;
bno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
if (bno != 0 && verify_agbno(mp, agno, bno)) {
scan_sbtree(bno, be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
- agno, 0, scanfunc_bno, 1);
+ agno, 0, scanfunc_bno, 1, agcnts);
} else {
do_warn(_("bad agbno %u for btbno root, agno %d\n"),
bno, agno);
@@ -1092,33 +1105,34 @@ validate_agf(
bno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
if (bno != 0 && verify_agbno(mp, agno, bno)) {
scan_sbtree(bno, be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
- agno, 0, scanfunc_cnt, 1);
+ agno, 0, scanfunc_cnt, 1, agcnts);
} else {
do_warn(_("bad agbno %u for btbcnt root, agno %d\n"),
bno, agno);
}
- if (be32_to_cpu(agf->agf_freeblks) != agffreeblks) {
+ if (be32_to_cpu(agf->agf_freeblks) != agcnts->agffreeblks) {
do_warn(_("agf_freeblks %u, counted %u in ag %u\n"),
- be32_to_cpu(agf->agf_freeblks), agffreeblks, agno);
+ be32_to_cpu(agf->agf_freeblks), agcnts->agffreeblks, agno);
}
- if (be32_to_cpu(agf->agf_longest) != agflongest) {
+ if (be32_to_cpu(agf->agf_longest) != agcnts->agflongest) {
do_warn(_("agf_longest %u, counted %u in ag %u\n"),
- be32_to_cpu(agf->agf_longest), agflongest, agno);
+ be32_to_cpu(agf->agf_longest), agcnts->agflongest, agno);
}
if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
- be32_to_cpu(agf->agf_btreeblks) != agfbtreeblks) {
+ be32_to_cpu(agf->agf_btreeblks) != agcnts->agfbtreeblks) {
do_warn(_("agf_btreeblks %u, counted %u in ag %u\n"),
- be32_to_cpu(agf->agf_btreeblks), agfbtreeblks, agno);
+ be32_to_cpu(agf->agf_btreeblks), agcnts->agfbtreeblks, agno);
}
}
static void
validate_agi(
struct xfs_agi *agi,
- xfs_agnumber_t agno)
+ xfs_agnumber_t agno,
+ struct aghdr_cnts *agcnts)
{
xfs_agblock_t bno;
int i;
@@ -1126,20 +1140,20 @@ validate_agi(
bno = be32_to_cpu(agi->agi_root);
if (bno != 0 && verify_agbno(mp, agno, bno)) {
scan_sbtree(bno, be32_to_cpu(agi->agi_level),
- agno, 0, scanfunc_ino, 1);
+ agno, 0, scanfunc_ino, 1, agcnts);
} else {
do_warn(_("bad agbno %u for inobt root, agno %d\n"),
be32_to_cpu(agi->agi_root), agno);
}
- if (be32_to_cpu(agi->agi_count) != agicount) {
+ if (be32_to_cpu(agi->agi_count) != agcnts->agicount) {
do_warn(_("agi_count %u, counted %u in ag %u\n"),
- be32_to_cpu(agi->agi_count), agicount, agno);
+ be32_to_cpu(agi->agi_count), agcnts->agicount, agno);
}
- if (be32_to_cpu(agi->agi_freecount) != agifreecount) {
+ if (be32_to_cpu(agi->agi_freecount) != agcnts->agifreecount) {
do_warn(_("agi_freecount %u, counted %u in ag %u\n"),
- be32_to_cpu(agi->agi_freecount), agifreecount, agno);
+ be32_to_cpu(agi->agi_freecount), agcnts->agifreecount, agno);
}
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
@@ -1155,42 +1169,15 @@ validate_agi(
}
/*
- * Validate block/inode counts in the superblock.
- *
- * Note: needs to be called after scan_ag() has been called for all
- * allocation groups.
- */
-void
-validate_sb(
- struct xfs_sb *sb)
-{
- if (sb->sb_icount != icount) {
- do_warn(_("sb_icount %lld, counted %lld\n"),
- sb->sb_icount, icount);
- }
-
- if (sb->sb_ifree != ifreecount) {
- do_warn(_("sb_ifree %lld, counted %lld\n"),
- sb->sb_ifree, ifreecount);
- }
-
- if (sb->sb_fdblocks != fdblocks) {
- do_warn(_("sb_fdblocks %lld, counted %lld\n"),
- sb->sb_fdblocks, fdblocks);
- }
-
- /* XXX: check sb_frextents */
-}
-
-/*
* Scan an AG for obvious corruption.
*
* Note: This code is not reentrant due to the use of global variables.
*/
-void
-scan_ag(
- xfs_agnumber_t agno)
+void *
+scan_ag(void *args)
{
+ struct aghdr_cnts *agcnts = args;
+ xfs_agnumber_t agno = agcnts->agno;
xfs_agf_t *agf;
xfs_buf_t *agfbuf;
int agf_dirty = 0;
@@ -1202,28 +1189,18 @@ scan_ag(
int sb_dirty = 0;
int status;
- /*
- * Reset the global variables to track the AG header validity.
- *
- * Because we use global variable but can get called multiple times
- * we have to make sure to always reset these variables.
- */
- agicount = agifreecount = 0;
- agffreeblks = agfbtreeblks = 0;
- agflongest = 0;
-
sbbuf = libxfs_readbuf(mp->m_dev, XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
XFS_FSS_TO_BB(mp, 1), 0);
if (!sbbuf) {
do_error(_("can't get root superblock for ag %d\n"), agno);
- return;
+ return NULL;
}
sb = (xfs_sb_t *)calloc(BBSIZE, 1);
if (!sb) {
do_error(_("can't allocate memory for superblock\n"));
libxfs_putbuf(sbbuf);
- return;
+ return NULL;
}
libxfs_sb_from_disk(sb, XFS_BUF_TO_SBP(sbbuf));
@@ -1234,7 +1211,7 @@ scan_ag(
do_error(_("can't read agf block for ag %d\n"), agno);
libxfs_putbuf(sbbuf);
free(sb);
- return;
+ return NULL;
}
agf = XFS_BUF_TO_AGF(agfbuf);
@@ -1246,7 +1223,7 @@ scan_ag(
libxfs_putbuf(agfbuf);
libxfs_putbuf(sbbuf);
free(sb);
- return;
+ return NULL;
}
agi = XFS_BUF_TO_AGI(agibuf);
@@ -1298,13 +1275,13 @@ scan_ag(
do_warn(_("bad uncorrected agheader %d, skipping ag...\n"),
agno);
- return;
+ return NULL;
}
- scan_freelist(agf);
+ scan_freelist(agf, agcnts);
- validate_agf(agf, agno);
- validate_agi(agi, agno);
+ validate_agf(agf, agno, agcnts);
+ validate_agi(agi, agno, agcnts);
ASSERT(agi_dirty == 0 || (agi_dirty && !no_modify));
@@ -1331,4 +1308,72 @@ scan_ag(
libxfs_putbuf(sbbuf);
free(sb);
PROG_RPT_INC(prog_rpt_done[agno], 1);
+
+#ifdef XR_INODE_TRACE
+ print_inode_list(i);
+#endif
+ return NULL;
+}
+
+#define SCAN_THREADS 32
+
+void
+scan_ags(
+ struct xfs_mount *mp)
+{
+ struct aghdr_cnts agcnts[mp->m_sb.sb_agcount];
+ pthread_t thr[SCAN_THREADS];
+ __uint64_t fdblocks = 0;
+ __uint64_t icount = 0;
+ __uint64_t ifreecount = 0;
+ int i, j, err;
+
+ /*
+ * scan a few AGs in parallel. The scan is IO latency bound,
+ * so running a few at a time will speed it up significantly.
+ */
+ for (i = 0; i < mp->m_sb.sb_agcount; i += SCAN_THREADS) {
+ for (j = 0; j < SCAN_THREADS; j++) {
+ if (i + j >= mp->m_sb.sb_agcount)
+ break;
+ memset(&agcnts[i + j], 0, sizeof(agcnts[i]));
+ agcnts[i + j].agno = i + j;
+ err = pthread_create(&thr[j], NULL, scan_ag,
+ &agcnts[i + j]);
+ if (err)
+ do_abort(_("pthread_create failed in scan_ags\n"));
+ }
+ for (j = 0; j < SCAN_THREADS; j++) {
+ if (i + j >= mp->m_sb.sb_agcount)
+ break;
+ pthread_join(thr[j], NULL);
+ }
+ }
+
+ for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+ fdblocks += agcnts[i].fdblocks;
+ icount += agcnts[i].icount;
+ ifreecount += agcnts[i].ifreecount;
+ }
+
+ /*
+ * Validate that our manual counts match the superblock.
+ */
+ if (mp->m_sb.sb_icount != icount) {
+ do_warn(_("sb_icount %lld, counted %lld\n"),
+ mp->m_sb.sb_icount, icount);
+ }
+
+ if (mp->m_sb.sb_ifree != ifreecount) {
+ do_warn(_("sb_ifree %lld, counted %lld\n"),
+ mp->m_sb.sb_ifree, ifreecount);
+ }
+
+ if (mp->m_sb.sb_fdblocks != fdblocks) {
+ do_warn(_("sb_fdblocks %lld, counted %lld\n"),
+ mp->m_sb.sb_fdblocks, fdblocks);
+ }
+
+ /* XXX: check sb_frextents */
}
+
diff --git a/repair/scan.h b/repair/scan.h
index 20567fb..4de1cef 100644
--- a/repair/scan.h
+++ b/repair/scan.h
@@ -20,19 +20,6 @@
struct blkmap;
-void scan_sbtree(
- xfs_agblock_t root,
- int nlevels,
- xfs_agnumber_t agno,
- int suspect,
- void (*func)(struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot),
- int isroot);
-
int scan_lbtree(
xfs_dfsbno_t root,
int nlevels,
@@ -74,29 +61,5 @@ int scanfunc_bmap(
int check_dups,
int *dirty);
-void scanfunc_bno(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot);
-
-void scanfunc_cnt(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot);
-
-void
-scanfunc_ino(
- struct xfs_btree_block *block,
- int level,
- xfs_agblock_t bno,
- xfs_agnumber_t agno,
- int suspect,
- int isroot);
#endif /* _XR_SCAN_H */
--
1.7.2.3
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-04 6:13 Dave Chinner
@ 2011-01-04 10:02 ` Christoph Hellwig
2011-01-04 12:00 ` Dave Chinner
2011-01-05 23:42 ` Alex Elder
1 sibling, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2011-01-04 10:02 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
> This patch uses 32-way threading which results in no noticable
> slowdown on single SATA drives with NCQ, but results in ~10x
> reduction in runtime on a 12 disk RAID-0 array.
Shouldn't we have at least an option to allow tuning this value,
similar to the ag_stride? In fact I wonder why phase 3/4 should
use different values for it than phase2.
> @@ -75,8 +80,10 @@ scan_sbtree(
> xfs_agblock_t bno,
> xfs_agnumber_t agno,
> int suspect,
> - int isroot),
> - int isroot)
> + int isroot,
> + struct aghdr_cnts *agcnts),
> + int isroot,
> + struct aghdr_cnts *agcnts)
Please make this a
void *priv
to keep scan_sbtree generic.
> void
> +scanfunc_bno(
> + struct xfs_btree_block *block,
> + int level,
> + xfs_agblock_t bno,
> + xfs_agnumber_t agno,
> + int suspect,
> + int isroot,
> + struct aghdr_cnts *agcnts)
> +{
> + return scanfunc_allocbt(block, level, bno, agno,
> + suspect, isroot, XFS_ABTB_MAGIC, agcnts);
> +}
Now that we have private data bassed to the scanfuncs we could use that
to communicate if we're doing a bno or cnt scan. Maybe writing it
directly into struct aghdr_cnts is too ugly, in that case we can have
a scan_priv structure that contains the magic and the aghdr_cnts.
>
> void
> scan_freelist(
This could become static.
> * Scan an AG for obvious corruption.
> *
> * Note: This code is not reentrant due to the use of global variables.
That's not true any more I think.
> */
> -void
> -scan_ag(
> - xfs_agnumber_t agno)
> +void *
> +scan_ag(void *args)
Can be static.
> +#define SCAN_THREADS 32
> +
> +void
> +scan_ags(
> + struct xfs_mount *mp)
> +{
> + struct aghdr_cnts agcnts[mp->m_sb.sb_agcount];
> + pthread_t thr[SCAN_THREADS];
> + __uint64_t fdblocks = 0;
> + __uint64_t icount = 0;
> + __uint64_t ifreecount = 0;
> + int i, j, err;
> +
> + /*
> + * scan a few AGs in parallel. The scan is IO latency bound,
> + * so running a few at a time will speed it up significantly.
> + */
> + for (i = 0; i < mp->m_sb.sb_agcount; i += SCAN_THREADS) {
I think this should use the workqueues from repair/threads.c. Just
create a workqueue with 32 threads, and then enqueue all the AGs.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-04 10:02 ` Christoph Hellwig
@ 2011-01-04 12:00 ` Dave Chinner
0 siblings, 0 replies; 13+ messages in thread
From: Dave Chinner @ 2011-01-04 12:00 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: xfs
On Tue, Jan 04, 2011 at 05:02:40AM -0500, Christoph Hellwig wrote:
> > This patch uses 32-way threading which results in no noticable
> > slowdown on single SATA drives with NCQ, but results in ~10x
> > reduction in runtime on a 12 disk RAID-0 array.
>
> Shouldn't we have at least an option to allow tuning this value,
> similar to the ag_stride? In fact I wonder why phase 3/4 should
> use different values for it than phase2.
Phase 3/4/5 use agressive prefetch to try to maximise throughput,
while phase 2 has no prefetch and uses synchronous reads.
Effectively the use of lots of parallelism simply keeps multiple IOs
in flight rather than reading them one at a time, hence reducing the
effective IO latency.
>
> > @@ -75,8 +80,10 @@ scan_sbtree(
> > xfs_agblock_t bno,
> > xfs_agnumber_t agno,
> > int suspect,
> > - int isroot),
> > - int isroot)
> > + int isroot,
> > + struct aghdr_cnts *agcnts),
> > + int isroot,
> > + struct aghdr_cnts *agcnts)
>
> Please make this a
>
> void *priv
>
> to keep scan_sbtree generic.
OK.
> > * Scan an AG for obvious corruption.
> > *
> > * Note: This code is not reentrant due to the use of global variables.
>
> That's not true any more I think.
Good point.
> > +#define SCAN_THREADS 32
> > +
> > +void
> > +scan_ags(
> > + struct xfs_mount *mp)
> > +{
> > + struct aghdr_cnts agcnts[mp->m_sb.sb_agcount];
> > + pthread_t thr[SCAN_THREADS];
> > + __uint64_t fdblocks = 0;
> > + __uint64_t icount = 0;
> > + __uint64_t ifreecount = 0;
> > + int i, j, err;
> > +
> > + /*
> > + * scan a few AGs in parallel. The scan is IO latency bound,
> > + * so running a few at a time will speed it up significantly.
> > + */
> > + for (i = 0; i < mp->m_sb.sb_agcount; i += SCAN_THREADS) {
>
> I think this should use the workqueues from repair/threads.c. Just
> create a workqueue with 32 threads, and then enqueue all the AGs.
Ok. I just used an API I'm familiar with and didn't have to think
about.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] xfs_repair: multithread phase 2
2011-01-04 6:13 Dave Chinner
2011-01-04 10:02 ` Christoph Hellwig
@ 2011-01-05 23:42 ` Alex Elder
1 sibling, 0 replies; 13+ messages in thread
From: Alex Elder @ 2011-01-05 23:42 UTC (permalink / raw)
To: Dave Chinner; +Cc: xfs
On Tue, 2011-01-04 at 17:13 +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Running some recent repair tests on broken filesystem meant running
> phase 1 and 2 repeatedly to reproduce an issue at the start of phase
> 3. Phase 2 was taking approximately 10 minutes to run as it
> processes each AG serially.
>
> Phase 2 can be trivially parallelised - it is simply scanning the
> per AG trees to calculate free block counts and free and used inodes
> counts. This can be done safely in parallel by giving each AG it's
> own structure to aggregate counts into, then once the AG scan is
> complete adding them all together.
>
> This patch uses 32-way threading which results in no noticable
> slowdown on single SATA drives with NCQ, but results in ~10x
> reduction in runtime on a 12 disk RAID-0 array.
This is great. And evidently not very hard at all. It should
have been done a long time ago...
I had a few of the same comments Christoph had (though
I didn't know about the the workqueues). I'll reiterate
one, that SCAN_THREADS should be a command line option.
32 is a fine default, but there's no sense in restricting
it to that.
A few other things, below, but this looks good to me.
Reviewed-by: Alex Elder <aelder@sgi.com>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
> repair/phase2.c | 16 +---
> repair/scan.c | 303 +++++++++++++++++++++++++++++++-----------------------
> repair/scan.h | 37 -------
> 3 files changed, 176 insertions(+), 180 deletions(-)
. . .
> diff --git a/repair/scan.c b/repair/scan.c
> index 85017ff..dd62776 100644
> --- a/repair/scan.c
> +++ b/repair/scan.c
. . .
> @@ -469,6 +477,34 @@ _("out-of-order bmap key (file offset) in inode %llu, %s fork, fsbno %llu\n"),
> }
>
Can this (and scanfunc_cnt() and scanfunc_ino()) be given
static scope now?
> void
> +scanfunc_bno(
> + struct xfs_btree_block *block,
> + int level,
> + xfs_agblock_t bno,
> + xfs_agnumber_t agno,
> + int suspect,
> + int isroot,
> + struct aghdr_cnts *agcnts)
> +{
> + return scanfunc_allocbt(block, level, bno, agno,
> + suspect, isroot, XFS_ABTB_MAGIC, agcnts);
> +}
> +
. . .
> @@ -1155,42 +1169,15 @@ validate_agi(
> }
. . .
> -void
> -scan_ag(
> - xfs_agnumber_t agno)
> +void *
> +scan_ag(void *args)
Maybe arg (singular)
> {
> + struct aghdr_cnts *agcnts = args;
> + xfs_agnumber_t agno = agcnts->agno;
> xfs_agf_t *agf;
> xfs_buf_t *agfbuf;
> int agf_dirty = 0;
. . .
> @@ -1331,4 +1308,72 @@ scan_ag(
> libxfs_putbuf(sbbuf);
> free(sb);
> PROG_RPT_INC(prog_rpt_done[agno], 1);
> +
> +#ifdef XR_INODE_TRACE
> + print_inode_list(i);
I know this is only under XR_INODE_TRACE, but
now that you're multi-threading these, the
output can get interleaved and therefore
somewhat useless. Maybe you could adjust
print_inode_list() so it includes the AG
number with each line output rather than
just prior to printing all of them.
> +#endif
> + return NULL;
> +}
> +
> +#define SCAN_THREADS 32
Make this configurable at runtime.
> +
> +void
> +scan_ags(
> + struct xfs_mount *mp)
> +{
> + struct aghdr_cnts agcnts[mp->m_sb.sb_agcount];
There is some mention about the per-thread stack size
getting set at the time the program starts in the pthread
documentation. I don't expect this will be a problem in
practice, but maybe this should be allocated dynamically.
> + pthread_t thr[SCAN_THREADS];
> + __uint64_t fdblocks = 0;
> + __uint64_t icount = 0;
> + __uint64_t ifreecount = 0;
> + int i, j, err;
> +
> + /*
> + * scan a few AGs in parallel. The scan is IO latency bound,
> + * so running a few at a time will speed it up significantly.
> + */
> + for (i = 0; i < mp->m_sb.sb_agcount; i += SCAN_THREADS) {
> + for (j = 0; j < SCAN_THREADS; j++) {
xfs_agnumber_t agno = i + j;
> + if (i + j >= mp->m_sb.sb_agcount)
if (agno >= mp->m_sb.sg_agcount)
(and so on, throughout this section)
> + break;
> + memset(&agcnts[i + j], 0, sizeof(agcnts[i]));
agcnts[i + j]
> + agcnts[i + j].agno = i + j;
> + err = pthread_create(&thr[j], NULL, scan_ag,
> + &agcnts[i + j]);
> + if (err)
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2011-02-01 23:36 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-01-10 0:44 [PATCH] xfs_repair: multithread phase 2 Dave Chinner
2011-01-10 7:57 ` Michael Monnerie
2011-01-10 8:41 ` Dave Chinner
2011-01-10 13:25 ` Michael Monnerie
2011-01-10 19:18 ` Christoph Hellwig
2011-01-10 19:17 ` Christoph Hellwig
2011-01-10 21:53 ` Matthias Schniedermeyer
2011-01-10 18:55 ` Christoph Hellwig
2011-02-01 23:39 ` Alex Elder
-- strict thread matches above, loose matches on Subject: below --
2011-01-04 6:13 Dave Chinner
2011-01-04 10:02 ` Christoph Hellwig
2011-01-04 12:00 ` Dave Chinner
2011-01-05 23:42 ` Alex Elder
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox