* [Cluster-devel] fsck: Add readahead to inodes in pass1
@ 2013-02-19 10:12 Steven Whitehouse
0 siblings, 0 replies; only message in thread
From: Steven Whitehouse @ 2013-02-19 10:12 UTC (permalink / raw)
To: cluster-devel.redhat.com
Pass 1 in fsck reads all the inodes, in disk block order based upon the
rgrp bitmap. This patch adds a function to create a list of blocks of a
certain type from the rgrp bitmap. Using this list it is then possible
to do readahead on the inode blocks.
This gives me an improvement of around 25% in overall fsck time for my
500G test filesystem.
It is also a nice cleanup, since it splits the pass1 function into more
easily understood components.
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c
index 540f2a9..4c67a83 100644
--- a/gfs2/fsck/pass1.c
+++ b/gfs2/fsck/pass1.c
@@ -14,6 +14,7 @@
#include <unistd.h>
#include <string.h>
#include <time.h>
+#include <fcntl.h>
#include <sys/ioctl.h>
#include <inttypes.h>
#include <libintl.h>
@@ -1547,6 +1548,145 @@ static int check_system_inodes(struct gfs2_sbd *sdp)
return 0;
}
+#define RA_WINDOW 64
+
+/**
+ * pass1_readahead - advise the kernel that upcoming inode blocks will be read
+ * @sdp: file system descriptor; supplies device_fd and the block size
+ * @ibuf: block addresses, starting at the entry the caller is working on
+ * @n: number of entries in @ibuf
+ * @cur_window: index of the first entry to advise (the caller hands back the
+ *              value returned by its previous call)
+ *
+ * POSIX_FADV_WILLNEED is issued for entries @cur_window up to, but not
+ * including, the smaller of @n and RA_WINDOW. The advice is best effort: the
+ * result of posix_fadvise() is not examined.
+ *
+ * Returns: the index one past the last entry advised, or @cur_window
+ *          unchanged when no entry was advised.
+ */
+static unsigned pass1_readahead(struct gfs2_sbd *sdp, uint64_t *ibuf, unsigned n, unsigned cur_window)
+{
+	unsigned limit = RA_WINDOW;
+	unsigned idx = cur_window;
+
+	/* Never look beyond the end of the list */
+	if (n < limit)
+		limit = n;
+
+	while (idx < limit) {
+		posix_fadvise(sdp->device_fd, ibuf[idx]*sdp->bsize, sdp->bsize, POSIX_FADV_WILLNEED);
+		idx++;
+	}
+
+	return idx;
+}
+
+/**
+ * pass1_process_bitmap - check the blocks that one bitmap marks as dinodes
+ * @sdp: the file system being checked
+ * @rgd: resource group that owns the bitmap (not referenced here)
+ * @ibuf: block addresses to check, visited in array order
+ * @n: number of entries in @ibuf
+ *
+ * Readahead for upcoming entries is requested through pass1_readahead()
+ * while earlier entries are being read and checked.
+ *
+ * Returns: 0 when every entry was visited, FSCK_ERROR if the blockmap
+ *          update or handle_di() failed, or FSCK_OK if fsck_abort was set.
+ *          gfs1_rindex_blks is released before either early return.
+ */
+static int pass1_process_bitmap(struct gfs2_sbd *sdp, struct rgrp_tree *rgd, uint64_t *ibuf, unsigned n)
+{
+	struct gfs2_buffer_head *bh;
+	unsigned i;
+	/* Number of entries, counted from ibuf[i], with readahead issued */
+	unsigned ra_window = 0;
+	uint64_t block;
+
+	for (i = 0; i < n; i++) {
+		block = ibuf[i];
+
+		/* Refill once less than half a window is queued ahead */
+		if (ra_window < RA_WINDOW/2)
+			ra_window = pass1_readahead(sdp, ibuf + i, n - i, ra_window);
+		/* ibuf[i] is used up by this iteration whichever path it
+		   takes, so the window measured from ibuf[i + 1] is one
+		   entry shorter. Without this the count never falls below
+		   the refill threshold again once it has reached RA_WINDOW,
+		   and no readahead is issued for the rest of the list. */
+		if (ra_window)
+			ra_window--;
+
+		/* skip gfs1 rindex indirect blocks */
+		if (sdp->gfs1 && blockfind(&gfs1_rindex_blks, block)) {
+			log_debug(_("Skipping rindex indir block "
+				    "%lld (0x%llx)\n"),
+				  (unsigned long long)block,
+				  (unsigned long long)block);
+			continue;
+		}
+		warm_fuzzy_stuff(block);
+
+		/* NOTE(review): FSCK_OK is also what pass1() returns on
+		   success, so the caller may be unable to tell an abort
+		   from a finished bitmap - confirm that is intended. */
+		if (fsck_abort) { /* if asked to abort */
+			gfs2_special_free(&gfs1_rindex_blks);
+			return FSCK_OK;
+		}
+		if (skip_this_pass) {
+			printf( _("Skipping pass 1 is not a good idea.\n"));
+			skip_this_pass = FALSE;
+			fflush(stdout);
+		}
+		if (fsck_system_inode(sdp, block)) {
+			log_debug(_("Already processed system inode "
+				    "%lld (0x%llx)\n"),
+				  (unsigned long long)block,
+				  (unsigned long long)block);
+			continue;
+		}
+		bh = bread(sdp, block);
+
+		if (gfs2_check_meta(bh, GFS2_METATYPE_DI)) {
+			/* In gfs2, a bitmap mark of 2 means an inode,
+			   but in gfs1 it means any metadata.  So if
+			   this is gfs1 and not an inode, it may be
+			   okay.  If it's non-dinode metadata, it will
+			   be referenced by an inode, so we need to
+			   skip it here and it will be sorted out
+			   when the referencing inode is checked. */
+			if (sdp->gfs1) {
+				uint32_t check_magic;
+
+				check_magic = ((struct gfs2_meta_header *)
+					       (bh->b_data))->mh_magic;
+				if (be32_to_cpu(check_magic) == GFS2_MAGIC) {
+					log_debug( _("Deferring GFS1 "
+						     "metadata block #"
+						     "%" PRIu64" (0x%"
+						     PRIx64 ")\n"),
+						   block, block);
+					brelse(bh);
+					continue;
+				}
+			}
+			log_err( _("Found invalid inode at block #"
+				   "%llu (0x%llx)\n"),
+				 (unsigned long long)block,
+				 (unsigned long long)block);
+			if (gfs2_blockmap_set(bl, block, gfs2_block_free)) {
+				stack;
+				brelse(bh);
+				gfs2_special_free(&gfs1_rindex_blks);
+				return FSCK_ERROR;
+			}
+			check_n_fix_bitmap(sdp, block, gfs2_block_free);
+		} else if (handle_di(sdp, bh) < 0) {
+			stack;
+			brelse(bh);
+			gfs2_special_free(&gfs1_rindex_blks);
+			return FSCK_ERROR;
+		}
+		/* Ignore everything else - they should be hit by the
+		   handle_di step.  Don't check NONE either, because
+		   check_meta passes everything if GFS2_METATYPE_NONE
+		   is specified.  Hopefully, other metadata types such
+		   as indirect blocks will be handled when the inode
+		   itself is processed, and if it's not, it should be
+		   caught in pass5. */
+		brelse(bh);
+	}
+
+	return 0;
+}
+
+/**
+ * pass1_process_rgrp - run the pass1 checks over every bitmap of one rgrp
+ * @sdp: the file system being checked
+ * @rgd: the resource group to walk
+ *
+ * For each of the rgd->ri.ri_length bitmaps, the blocks in state
+ * GFS2_BLKST_DINODE are collected and handed to pass1_process_bitmap().
+ * On gfs1 the blocks in state GFS2_BLKST_UNLINKED are then marked as
+ * gfs2_freemeta in the blockmap.
+ *
+ * Returns: 0 on success, FSCK_ERROR if the scratch list cannot be
+ *          allocated, otherwise the first non-zero value returned by
+ *          pass1_process_bitmap().
+ */
+static int pass1_process_rgrp(struct gfs2_sbd *sdp, struct rgrp_tree *rgd)
+{
+	unsigned k, n, i;
+	/* Scratch list of sdp->bsize * GFS2_NBBY block addresses, reused
+	   for every scan below */
+	uint64_t *ibuf = malloc(sdp->bsize * GFS2_NBBY * sizeof(uint64_t));
+	int ret = 0;
+
+	if (ibuf == NULL) {
+		/* The other error returns in pass1 release this list first */
+		gfs2_special_free(&gfs1_rindex_blks);
+		return FSCK_ERROR;
+	}
+
+	for (k = 0; k < rgd->ri.ri_length; k++) {
+		n = lgfs2_bm_scan(rgd, k, ibuf, GFS2_BLKST_DINODE);
+
+		if (n) {
+			ret = pass1_process_bitmap(sdp, rgd, ibuf, n);
+			if (ret)
+				break; /* leave through free(ibuf) below */
+		}
+
+		/*
+		  For GFS1, we have to count the "free meta" blocks in the
+		  resource group and mark them specially so we can count them
+		  properly in pass5.
+		 */
+		if (!sdp->gfs1)
+			continue;
+
+		n = lgfs2_bm_scan(rgd, k, ibuf, GFS2_BLKST_UNLINKED);
+		for (i = 0; i < n; i++)
+			gfs2_blockmap_set(bl, ibuf[i], gfs2_freemeta);
+	}
+
+	free(ibuf);
+	return ret;
+}
+
/**
* pass1 - walk through inodes and check inode state
*
@@ -1563,12 +1703,10 @@ static int check_system_inodes(struct gfs2_sbd *sdp)
int pass1(struct gfs2_sbd *sdp)
{
struct osi_node *n, *next = NULL;
- struct gfs2_buffer_head *bh;
- uint64_t block = 0;
struct rgrp_tree *rgd;
- int first;
uint64_t i;
uint64_t rg_count = 0;
+ int ret;
osi_list_init(&gfs1_rindex_blks.list);
@@ -1611,115 +1749,10 @@ int pass1(struct gfs2_sbd *sdp)
gfs2_meta_rgrp);*/
}
- first = 1;
+ ret = pass1_process_rgrp(sdp, rgd);
+ if (ret)
+ return ret;
- while (1) {
- /* "block" is relative to the entire file system */
- /* Get the next dinode in the file system, according
- to the bitmap. This should ONLY be dinodes unless
- it's GFS1, in which case it can be any metadata. */
- if (gfs2_next_rg_meta(rgd, &block, first))
- break;
- /* skip gfs1 rindex indirect blocks */
- if (sdp->gfs1 && blockfind(&gfs1_rindex_blks, block)) {
- log_debug(_("Skipping rindex indir block "
- "%lld (0x%llx)\n"),
- (unsigned long long)block,
- (unsigned long long)block);
- first = 0;
- continue;
- }
- warm_fuzzy_stuff(block);
-
- if (fsck_abort) { /* if asked to abort */
- gfs2_special_free(&gfs1_rindex_blks);
- return FSCK_OK;
- }
- if (skip_this_pass) {
- printf( _("Skipping pass 1 is not a good idea.\n"));
- skip_this_pass = FALSE;
- fflush(stdout);
- }
- if (fsck_system_inode(sdp, block)) {
- log_debug(_("Already processed system inode "
- "%lld (0x%llx)\n"),
- (unsigned long long)block,
- (unsigned long long)block);
- first = 0;
- continue;
- }
- bh = bread(sdp, block);
-
- /*log_debug( _("Checking metadata block #%" PRIu64
- " (0x%" PRIx64 ")\n"), block, block);*/
-
- if (gfs2_check_meta(bh, GFS2_METATYPE_DI)) {
- /* In gfs2, a bitmap mark of 2 means an inode,
- but in gfs1 it means any metadata. So if
- this is gfs1 and not an inode, it may be
- okay. If it's non-dinode metadata, it will
- be referenced by an inode, so we need to
- skip it here and it will be sorted out
- when the referencing inode is checked. */
- if (sdp->gfs1) {
- uint32_t check_magic;
-
- check_magic = ((struct
- gfs2_meta_header *)
- (bh->b_data))->mh_magic;
- if (be32_to_cpu(check_magic) ==
- GFS2_MAGIC) {
- log_debug( _("Deferring GFS1 "
- "metadata block #"
- "%" PRIu64" (0x%"
- PRIx64 ")\n"),
- block, block);
- brelse(bh);
- first = 0;
- continue;
- }
- }
- log_err( _("Found invalid inode at block #"
- "%llu (0x%llx)\n"),
- (unsigned long long)block,
- (unsigned long long)block);
- if (gfs2_blockmap_set(bl, block,
- gfs2_block_free)) {
- stack;
- brelse(bh);
- gfs2_special_free(&gfs1_rindex_blks);
- return FSCK_ERROR;
- }
- check_n_fix_bitmap(sdp, block,
- gfs2_block_free);
- } else if (handle_di(sdp, bh) < 0) {
- stack;
- brelse(bh);
- gfs2_special_free(&gfs1_rindex_blks);
- return FSCK_ERROR;
- }
- /* Ignore everything else - they should be hit by the
- handle_di step. Don't check NONE either, because
- check_meta passes everything if GFS2_METATYPE_NONE
- is specified. Hopefully, other metadata types such
- as indirect blocks will be handled when the inode
- itself is processed, and if it's not, it should be
- caught in pass5. */
- brelse(bh);
- first = 0;
- }
- /*
- For GFS1, we have to count the "free meta" blocks in the
- resource group and mark them specially so we can count them
- properly in pass5.
- */
- if (!sdp->gfs1)
- continue;
- first = 1;
- while (gfs2_next_rg_freemeta(rgd, &block, first) == 0) {
- gfs2_blockmap_set(bl, block, gfs2_freemeta);
- first = 0;
- }
}
gfs2_special_free(&gfs1_rindex_blks);
return FSCK_OK;
diff --git a/gfs2/libgfs2/libgfs2.h b/gfs2/libgfs2/libgfs2.h
index 46d4d67..db31a6c 100644
--- a/gfs2/libgfs2/libgfs2.h
+++ b/gfs2/libgfs2/libgfs2.h
@@ -757,12 +757,12 @@ extern int build_root(struct gfs2_sbd *sdp);
extern int do_init_inum(struct gfs2_sbd *sdp);
extern int do_init_statfs(struct gfs2_sbd *sdp);
extern int gfs2_check_meta(struct gfs2_buffer_head *bh, int type);
+extern unsigned lgfs2_bm_scan(struct rgrp_tree *rgd, unsigned idx,
+ uint64_t *buf, uint8_t state);
extern int gfs2_next_rg_meta(struct rgrp_tree *rgd, uint64_t *block,
int first);
extern int gfs2_next_rg_metatype(struct gfs2_sbd *sdp, struct rgrp_tree *rgd,
uint64_t *block, uint32_t type, int first);
-extern int gfs2_next_rg_freemeta(struct rgrp_tree *rgd, uint64_t *block,
- int first);
/* super.c */
extern int check_sb(struct gfs2_sb *sb);
diff --git a/gfs2/libgfs2/structures.c b/gfs2/libgfs2/structures.c
index 645c45a..6981072 100644
--- a/gfs2/libgfs2/structures.c
+++ b/gfs2/libgfs2/structures.c
@@ -495,6 +495,24 @@ int gfs2_check_meta(struct gfs2_buffer_head *bh, int type)
return 0;
}
+/**
+ * lgfs2_bm_scan - list the blocks of one bitmap that are in a given state
+ * @rgd: resource group holding the bitmap
+ * @idx: index of the bitmap within @rgd (selects rgd->bits[] and rgd->bh[])
+ * @buf: output array; receives one block address per match
+ * @state: block state to search for
+ *
+ * Every position that gfs2_bitfit() reports is stored in @buf after adding
+ * the bitmap's starting position (bi_start * GFS2_NBBY) and the resource
+ * group's ri_data0. The caller must supply room for bi_len * GFS2_NBBY
+ * entries (presumably the most a single bitmap can yield - verify against
+ * gfs2_bitfit()).
+ *
+ * Returns: the number of entries written to @buf
+ */
+unsigned lgfs2_bm_scan(struct rgrp_tree *rgd, unsigned idx, uint64_t *buf, uint8_t state)
+{
+	struct gfs2_bitmap *bi = &rgd->bits[idx];
+	unsigned found = 0;
+	uint32_t pos;
+
+	/* The loop increment resumes the search just after the last match */
+	for (pos = 0; pos < (bi->bi_len * GFS2_NBBY); pos++) {
+		pos = gfs2_bitfit(rgd->bh[idx]->b_data + bi->bi_offset,
+				  bi->bi_len, pos, state);
+		if (pos == BFITNOENT)
+			break;
+		buf[found] = pos + (bi->bi_start * GFS2_NBBY) + rgd->ri.ri_data0;
+		found++;
+	}
+
+	return found;
+}
+
/**
* gfs2_next_rg_meta
* @rgd:
@@ -545,11 +563,6 @@ int gfs2_next_rg_meta(struct rgrp_tree *rgd, uint64_t *block, int first)
return __gfs2_next_rg_meta(rgd, block, first, GFS2_BLKST_DINODE);
}
-int gfs2_next_rg_freemeta(struct rgrp_tree *rgd, uint64_t *block, int first)
-{
- return __gfs2_next_rg_meta(rgd, block, first, GFS2_BLKST_UNLINKED);
-}
-
/**
* next_rg_metatype
* @rgd:
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2013-02-19 10:12 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-02-19 10:12 [Cluster-devel] fsck: Add readahead to inodes in pass1 Steven Whitehouse
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).