From: "Darrick J. Wong" <darrick.wong@oracle.com>
To: tytso@mit.edu, darrick.wong@oracle.com
Cc: linux-ext4@vger.kernel.org
Subject: [PATCH 37/49] e2fsck: read-ahead metadata during passes 1, 2, and 4
Date: Mon, 10 Mar 2014 23:57:57 -0700 [thread overview]
Message-ID: <20140311065757.30585.46729.stgit@birch.djwong.org> (raw)
In-Reply-To: <20140311065356.30585.47192.stgit@birch.djwong.org>
e2fsck pass1 is modified to use the block group data prefetch function
to try to fetch the inode tables into the pagecache before it is
needed. In order to avoid cache thrashing, we limit ourselves to
prefetching at most half the available memory.
pass2 is modified to use the dirblock prefetching function to prefetch
the list of directory blocks that are assembled in pass1. So long as
we don't anticipate rehashing the dirs (pass 3a), we can release the
dirblocks as soon as we're done checking them.
pass4 is modified to prefetch the block and inode bitmaps in
anticipation of pass 5, because pass4 is entirely CPU bound.
In general, these mechanisms can halve fsck time, if the host system
has sufficient memory and the storage system can provide a lot of
IOPS. SSDs and multi-spindle RAIDs see the most speedup; single disks
experience a modest speedup, and single-spindle USB mass storage
devices see hardly any benefit.
By default, readahead will try to fill half the physical memory in the
system. The -R option can be given to specify the amount of memory to
use for readahead, or zero to disable it entirely; or an option can be
given in e2fsck.conf.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
MCONFIG.in | 1
configure | 49 +++++++++++++++++
configure.in | 6 ++
e2fsck/Makefile.in | 4 +
e2fsck/e2fsck.8.in | 9 +++
e2fsck/e2fsck.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++
e2fsck/e2fsck.conf.5.in | 13 ++++
e2fsck/e2fsck.h | 25 +++++++++
e2fsck/pass1.c | 83 +++++++++++++++++++++++++++++
e2fsck/pass2.c | 96 +++++++++++++++++++++++++++++++++
e2fsck/pass4.c | 22 ++++++++
e2fsck/prof_err.et | 1
e2fsck/rehash.c | 10 +++
e2fsck/unix.c | 35 +++++++++++-
e2fsck/util.c | 51 ++++++++++++++++++
lib/config.h.in | 9 +++
16 files changed, 544 insertions(+), 6 deletions(-)
diff --git a/MCONFIG.in b/MCONFIG.in
index 9b411d6..6ee88db 100644
--- a/MCONFIG.in
+++ b/MCONFIG.in
@@ -116,6 +116,7 @@ LIBUUID = @LIBUUID@ @SOCKET_LIB@
LIBQUOTA = @STATIC_LIBQUOTA@
LIBBLKID = @LIBBLKID@ @PRIVATE_LIBS_CMT@ $(LIBUUID)
LIBINTL = @LIBINTL@
+LIBPTHREADS = @PTHREADS_LIB@
SYSLIBS = @LIBS@
DEPLIBSS = $(LIB)/libss@LIB_EXT@
DEPLIBCOM_ERR = $(LIB)/libcom_err@LIB_EXT@
diff --git a/configure b/configure
index 7b0a0d1..5b89229 100755
--- a/configure
+++ b/configure
@@ -639,6 +639,7 @@ CYGWIN_CMT
LINUX_CMT
UNI_DIFF_OPTS
SEM_INIT_LIB
+PTHREADS_LIB
SOCKET_LIB
SIZEOF_OFF_T
SIZEOF_LONG_LONG
@@ -10474,7 +10475,7 @@ fi
done
fi
-for ac_header in dirent.h errno.h execinfo.h getopt.h malloc.h mntent.h paths.h semaphore.h setjmp.h signal.h stdarg.h stdint.h stdlib.h termios.h termio.h unistd.h utime.h linux/falloc.h linux/fd.h linux/major.h linux/loop.h net/if_dl.h netinet/in.h sys/disklabel.h sys/file.h sys/ioctl.h sys/mkdev.h sys/mman.h sys/prctl.h sys/queue.h sys/resource.h sys/select.h sys/socket.h sys/sockio.h sys/stat.h sys/syscall.h sys/sysmacros.h sys/time.h sys/types.h sys/un.h sys/wait.h
+for ac_header in dirent.h errno.h execinfo.h getopt.h malloc.h mntent.h paths.h semaphore.h setjmp.h signal.h stdarg.h stdint.h stdlib.h termios.h termio.h unistd.h utime.h linux/falloc.h linux/fd.h linux/major.h linux/loop.h net/if_dl.h netinet/in.h sys/disklabel.h sys/file.h sys/ioctl.h sys/mkdev.h sys/mman.h sys/prctl.h sys/queue.h sys/resource.h sys/select.h sys/socket.h sys/sockio.h sys/stat.h sys/syscall.h sys/sysctl.h sys/sysmacros.h sys/time.h sys/types.h sys/un.h sys/wait.h
do :
as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
@@ -11235,6 +11236,52 @@ if test $ac_cv_have_optreset = yes; then
$as_echo "#define HAVE_OPTRESET 1" >>confdefs.h
fi
+PTHREADS_LIB='-lpthread'
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
+$as_echo_n "checking for pthread_create in -lpthread... " >&6; }
+if ${ac_cv_lib_pthread_pthread_create+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-lpthread $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char pthread_create ();
+int
+main ()
+{
+return pthread_create ();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ ac_cv_lib_pthread_pthread_create=yes
+else
+ ac_cv_lib_pthread_pthread_create=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5
+$as_echo "$ac_cv_lib_pthread_pthread_create" >&6; }
+if test "x$ac_cv_lib_pthread_pthread_create" = xyes; then :
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBPTHREAD 1
+_ACEOF
+
+ LIBS="-lpthread $LIBS"
+
+fi
+
SEM_INIT_LIB=''
ac_fn_c_check_func "$LINENO" "sem_init" "ac_cv_func_sem_init"
diff --git a/configure.in b/configure.in
index f28bd46..d2cfe41 100644
--- a/configure.in
+++ b/configure.in
@@ -961,6 +961,7 @@ AC_CHECK_HEADERS(m4_flatten([
sys/sockio.h
sys/stat.h
sys/syscall.h
+ sys/sysctl.h
sys/sysmacros.h
sys/time.h
sys/types.h
@@ -1173,6 +1174,11 @@ if test $ac_cv_have_optreset = yes; then
AC_DEFINE(HAVE_OPTRESET, 1, [Define to 1 if optreset for getopt is present])
fi
dnl
+dnl Test for pthread_create in -lpthread
+dnl
+PTHREADS_LIB='-lpthread'
+AC_CHECK_LIB(pthread, pthread_create, AC_SUBST(PTHREADS_LIB))
+dnl
dnl Test for sem_init, and which library it might require:
dnl
AH_TEMPLATE([HAVE_SEM_INIT], [Define to 1 if sem_init() exists])
diff --git a/e2fsck/Makefile.in b/e2fsck/Makefile.in
index 5c8ce39..7136f7f 100644
--- a/e2fsck/Makefile.in
+++ b/e2fsck/Makefile.in
@@ -16,13 +16,13 @@ MANPAGES= e2fsck.8
FMANPAGES= e2fsck.conf.5
LIBS= $(LIBQUOTA) $(LIBEXT2FS) $(LIBCOM_ERR) $(LIBBLKID) $(LIBUUID) \
- $(LIBINTL) $(LIBE2P) $(SYSLIBS)
+ $(LIBINTL) $(LIBE2P) $(SYSLIBS) $(LIBPTHREADS)
DEPLIBS= $(DEPLIBQUOTA) $(LIBEXT2FS) $(DEPLIBCOM_ERR) $(DEPLIBBLKID) \
$(DEPLIBUUID) $(DEPLIBE2P)
STATIC_LIBS= $(STATIC_LIBQUOTA) $(STATIC_LIBEXT2FS) $(STATIC_LIBCOM_ERR) \
$(STATIC_LIBBLKID) $(STATIC_LIBUUID) $(LIBINTL) $(STATIC_LIBE2P) \
- $(SYSLIBS)
+ $(SYSLIBS) $(LIBPTHREADS)
STATIC_DEPLIBS= $(DEPSTATIC_LIBQUOTA) $(STATIC_LIBEXT2FS) \
$(DEPSTATIC_LIBCOM_ERR) $(DEPSTATIC_LIBBLKID) \
$(DEPSTATIC_LIBUUID) $(DEPSTATIC_LIBE2P)
diff --git a/e2fsck/e2fsck.8.in b/e2fsck/e2fsck.8.in
index 43ee063..90eda4c 100644
--- a/e2fsck/e2fsck.8.in
+++ b/e2fsck/e2fsck.8.in
@@ -34,6 +34,10 @@ e2fsck \- check a Linux ext2/ext3/ext4 file system
.B \-E
.I extended_options
]
+[
+.B \-R
+.I readahead_mem_kb
+]
.I device
.SH DESCRIPTION
.B e2fsck
@@ -302,6 +306,11 @@ options.
This option does nothing at all; it is provided only for backwards
compatibility.
.TP
+.BI \-R " readahead_mem_kb"
+Use at most this many KiB of memory to pre-fetch metadata in the hopes of
+reducing e2fsck runtime. By default, this uses half the physical memory in
+the system; setting this value to zero disables readahead entirely.
+.TP
.B \-t
Print timing statistics for
.BR e2fsck .
diff --git a/e2fsck/e2fsck.c b/e2fsck/e2fsck.c
index 0ec1540..c5d823c 100644
--- a/e2fsck/e2fsck.c
+++ b/e2fsck/e2fsck.c
@@ -15,6 +15,10 @@
#include "e2fsck.h"
#include "problem.h"
+#ifdef HAVE_PTHREAD_H
+#include <pthread.h>
+#endif
+
/*
* This function allocates an e2fsck context
*/
@@ -44,6 +48,8 @@ errcode_t e2fsck_allocate_context(e2fsck_t *ret)
context->flags |= E2F_FLAG_TIME_INSANE;
}
+ e2fsck_init_thread(&context->ra_thread);
+
*ret = context;
return 0;
}
@@ -209,6 +215,7 @@ int e2fsck_run(e2fsck_t ctx)
{
int i;
pass_t e2fsck_pass;
+ errcode_t err;
#ifdef HAVE_SETJMP_H
if (setjmp(ctx->abort_loc)) {
@@ -226,6 +233,10 @@ int e2fsck_run(e2fsck_t ctx)
e2fsck_pass(ctx);
if (ctx->progress)
(void) (ctx->progress)(ctx, 0, 0, 0);
+ err = e2fsck_stop_thread(&ctx->ra_thread, NULL);
+ if (err)
+ com_err(ctx->program_name, err, "%s",
+ _("while stopping readahead"));
}
ctx->flags &= ~E2F_FLAG_SETJMP_OK;
@@ -233,3 +244,128 @@ int e2fsck_run(e2fsck_t ctx)
return (ctx->flags & E2F_FLAG_RUN_RETURN);
return 0;
}
+
+#ifdef HAVE_PTHREAD_H
+/*
+ * Bundle handed to run_threaded_helper(): the e2fsck_thread being driven,
+ * the function to run, an optional cleanup callback, and the single argument
+ * that is passed to both callbacks.
+ */
+struct run_threaded {
+ struct e2fsck_thread *thread;
+ void * (*func)(void *);
+ void (*cleanup)(void *);
+ void *arg;
+};
+
+/*
+ * Thread cleanup handler installed by run_threaded_helper().  Runs the
+ * caller-supplied cleanup callback (if any), clears thread->running while
+ * holding the thread lock, and frees the run_threaded bundle.
+ */
+static void run_threaded_cleanup(void *p)
+{
+ struct run_threaded *rt = p;
+
+ if (rt->cleanup)
+ rt->cleanup(rt->arg);
+ pthread_mutex_lock(&rt->thread->lock);
+ rt->thread->running = 0;
+ pthread_mutex_unlock(&rt->thread->lock);
+ ext2fs_free_mem(&rt);
+}
+
+/*
+ * pthread start routine.  Registers run_threaded_cleanup(), switches the
+ * thread to asynchronous cancellation while rt->func runs, then restores the
+ * previous cancel type, pops and executes the cleanup handler, and exits with
+ * the value func returned.
+ */
+static void *run_threaded_helper(void *p)
+{
+ int old;
+ struct run_threaded *rt = p;
+ void *ret;
+
+ pthread_cleanup_push(run_threaded_cleanup, rt);
+ pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
+ ret = rt->func(rt->arg);
+ pthread_setcanceltype(old, NULL);
+ pthread_cleanup_pop(1); /* non-zero argument: run the handler now */
+ pthread_exit(ret);
+ return NULL; /* not reached; pthread_exit() does not return */
+}
+#endif /* HAVE_PTHREAD_H */
+
+/*
+ * Prepare an e2fsck_thread for use: stamp the magic number and, when
+ * pthreads are available, initialize the mutex.
+ *
+ * Returns 0, or the error code from pthread_mutex_init().
+ *
+ * NOTE(review): running and tid are not initialized here; presumably the
+ * enclosing structure has been zeroed by the caller -- confirm.
+ */
+errcode_t e2fsck_init_thread(struct e2fsck_thread *thread)
+{
+ errcode_t err = 0;
+
+ thread->magic = E2FSCK_ET_MAGIC_RUN_THREAD;
+#ifdef HAVE_PTHREAD_H
+ err = pthread_mutex_init(&thread->lock, NULL);
+#endif /* HAVE_PTHREAD_H */
+
+ return err;
+}
+
+/*
+ * Run func(arg) through the given e2fsck_thread.
+ *
+ * With pthreads, a new thread is started and this function returns right
+ * away; the optional cleanup(arg) callback is run by that thread when func
+ * returns or when the thread is cancelled.  EAGAIN is returned if the
+ * previous job is still marked as running.  Without pthreads, func and then
+ * cleanup are called synchronously and func's result is kept in thread->ret.
+ *
+ * Returns 0 on success, otherwise a pthread or memory allocation error.  On
+ * error the cleanup callback has not been called.
+ */
+errcode_t e2fsck_run_thread(struct e2fsck_thread *thread,
+			    void * (*func)(void *), void (*cleanup)(void *),
+			    void *arg)
+{
+#ifdef HAVE_PTHREAD_H
+	struct run_threaded *rt;
+#endif
+	errcode_t err = 0;
+
+	EXT2_CHECK_MAGIC(thread, E2FSCK_ET_MAGIC_RUN_THREAD);
+#ifdef HAVE_PTHREAD_H
+	err = pthread_mutex_lock(&thread->lock);
+	if (err)
+		return err;
+
+	if (thread->running) {
+		err = EAGAIN;
+		goto out;
+	}
+
+	/*
+	 * Reap the previous, finished thread before thread->tid is reused.
+	 * NOTE(review): on the first call tid has never been filled in by
+	 * pthread_create(); this depends on pthread_join() answering ESRCH
+	 * for such a value -- confirm that holds on every target platform.
+	 */
+	err = pthread_join(thread->tid, NULL);
+	if (err && err != ESRCH)
+		goto out;
+
+	err = ext2fs_get_mem(sizeof(*rt), &rt);
+	if (err)
+		goto out;
+
+	rt->thread = thread;
+	rt->func = func;
+	rt->cleanup = cleanup;
+	rt->arg = arg;
+
+	/* rt is owned by the new thread once pthread_create() succeeds */
+	err = pthread_create(&thread->tid, NULL, run_threaded_helper, rt);
+	if (err)
+		ext2fs_free_mem(&rt);
+	else
+		thread->running = 1;
+out:
+	pthread_mutex_unlock(&thread->lock);
+#else
+	thread->ret = func(arg);
+	if (cleanup)
+		cleanup(arg);
+#endif /* HAVE_PTHREAD_H */
+
+	return err;
+}
+
+/*
+ * Stop the job associated with an e2fsck_thread and collect its result.
+ *
+ * With pthreads, the thread is cancelled if it is still marked running and
+ * is then joined; its exit value is delivered through ret by pthread_join().
+ * ESRCH from either call is treated as success.  Without pthreads, the value
+ * saved by e2fsck_run_thread() is copied to *ret when ret is non-NULL.
+ *
+ * Returns 0 on success or a pthread error code.
+ *
+ * NOTE(review): pthread_join() is reached even if no thread was ever started
+ * or the thread was already joined by an earlier call; this depends on
+ * pthread_join() reporting ESRCH in those cases -- confirm.
+ */
+errcode_t e2fsck_stop_thread(struct e2fsck_thread *thread, void **ret)
+{
+ errcode_t err = 0, err2;
+
+ EXT2_CHECK_MAGIC(thread, E2FSCK_ET_MAGIC_RUN_THREAD);
+
+#ifdef HAVE_PTHREAD_H
+ err = pthread_mutex_lock(&thread->lock);
+ if (err)
+ return err;
+ if (thread->running)
+ err = pthread_cancel(thread->tid);
+ if (err == ESRCH)
+ err = 0;
+ err2 = pthread_mutex_unlock(&thread->lock);
+ /* keep the first error; only report the unlock failure otherwise */
+ if (!err && err2)
+ err = err2;
+ if (!err)
+ err = pthread_join(thread->tid, ret);
+ if (err == ESRCH)
+ err = 0;
+#else
+ if (ret)
+ *ret = thread->ret;
+#endif
+ return err;
+}
diff --git a/e2fsck/e2fsck.conf.5.in b/e2fsck/e2fsck.conf.5.in
index a8219a8..fcda392 100644
--- a/e2fsck/e2fsck.conf.5.in
+++ b/e2fsck/e2fsck.conf.5.in
@@ -205,6 +205,19 @@ of that type are squelched. This can be useful if the console is slow
(i.e., connected to a serial port) and so a large amount of output could
end up delaying the boot process for a long time (potentially hours).
.TP
+.I readahead_mem_pct
+Use no more than this percentage of memory to try to read in metadata blocks
+ahead of the main e2fsck thread. This should reduce run times, depending on
+the speed of the underlying storage and the amount of free memory. By default,
+this is set to 50%.
+.TP
+.I readahead_mem_kb
+Use no more than this amount of memory to read in metadata blocks ahead of the
+main checking thread. Setting this value to zero disables readahead entirely.
+There is no default, but see
+.B readahead_mem_pct
+for more details.
+.TP
.I report_features
If this boolean relation is true, e2fsck will print the file system
features as part of its verbose reporting (i.e., if the
diff --git a/e2fsck/e2fsck.h b/e2fsck/e2fsck.h
index d7a7be9..8ceeff9 100644
--- a/e2fsck/e2fsck.h
+++ b/e2fsck/e2fsck.h
@@ -11,6 +11,7 @@
#include <stdio.h>
#include <string.h>
+#include <stdint.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
@@ -69,6 +70,24 @@
#include "quota/mkquota.h"
+/* Functions to run something asynchronously */
+struct e2fsck_thread {
+ int magic; /* set to E2FSCK_ET_MAGIC_RUN_THREAD by e2fsck_init_thread() */
+#ifdef HAVE_PTHREAD_H
+ int running; /* non-zero while a job is in flight; changed under lock */
+ pthread_t tid; /* thread created by e2fsck_run_thread() */
+ pthread_mutex_t lock; /* guards running */
+#else
+ void *ret; /* return value of the synchronously-run function */
+#endif /* HAVE_PTHREAD_H */
+};
+
+errcode_t e2fsck_init_thread(struct e2fsck_thread *thread);
+errcode_t e2fsck_run_thread(struct e2fsck_thread *thread,
+ void * (*func)(void *), void (*cleanup)(void *),
+ void *arg);
+errcode_t e2fsck_stop_thread(struct e2fsck_thread *thread, void **ret);
+
/*
* Exit codes used by fsck-type programs
*/
@@ -373,6 +392,10 @@ struct e2fsck_struct {
* e2fsck functions themselves.
*/
void *priv_data;
+
+ /* How much are we allowed to readahead? */
+ unsigned long long readahead_mem_kb;
+ struct e2fsck_thread ra_thread;
};
/* Used by the region allocation code */
@@ -495,6 +518,7 @@ void e2fsck_rehash_dir_later(e2fsck_t ctx, ext2_ino_t ino);
int e2fsck_dir_will_be_rehashed(e2fsck_t ctx, ext2_ino_t ino);
errcode_t e2fsck_rehash_dir(e2fsck_t ctx, ext2_ino_t ino);
void e2fsck_rehash_directories(e2fsck_t ctx);
+int e2fsck_will_rehash_dirs(e2fsck_t ctx);
/* sigcatcher.c */
void sigcatcher_setup(void);
@@ -573,6 +597,7 @@ extern errcode_t e2fsck_allocate_subcluster_bitmap(ext2_filsys fs,
int default_type,
const char *profile_name,
ext2fs_block_bitmap *ret);
+int64_t get_memory_size(void);
/* unix.c */
extern void e2fsck_clear_progbar(e2fsck_t ctx);
diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c
index eb9497c..a6d3297 100644
--- a/e2fsck/pass1.c
+++ b/e2fsck/pass1.c
@@ -589,6 +589,49 @@ static errcode_t recheck_bad_inode_checksum(ext2_filsys fs, ext2_ino_t ino,
return 0;
}
+/* Arguments for one pass 1 inode table readahead request. */
+struct pass1ra_ctx {
+ ext2_filsys fs;
+ dgrp_t group; /* start group handed to ext2fs_readahead() */
+ dgrp_t ngroups; /* group count handed to ext2fs_readahead() */
+};
+
+/* Cleanup callback for pass1_readahead(): releases the pass1ra_ctx. */
+static void pass1_readahead_cleanup(void *p)
+{
+	struct pass1ra_ctx *c = p;
+
+	ext2fs_free_mem(&c);
+}
+
+/*
+ * Readahead job for pass 1: ask libext2fs to read ahead the inode tables
+ * (EXT2_READA_ITABLE) for the groups described by the pass1ra_ctx in p.
+ * Always returns NULL; no status is propagated to the caller.
+ */
+static void *pass1_readahead(void *p)
+{
+	struct pass1ra_ctx *c = p;
+
+	ext2fs_readahead(c->fs, EXT2_READA_ITABLE, c->group, c->ngroups);
+	return NULL;
+}
+
+/*
+ * Queue an inode table readahead job for ngroups groups starting at group.
+ *
+ * A pass1ra_ctx is allocated and handed to e2fsck_run_thread() together with
+ * pass1_readahead_cleanup(), which frees it.  If the job cannot be started
+ * the context is freed here instead.
+ *
+ * Returns 0 on success or the error from allocation / e2fsck_run_thread().
+ */
+static errcode_t initiate_readahead(e2fsck_t ctx, dgrp_t group, dgrp_t ngroups)
+{
+ struct pass1ra_ctx *ractx;
+ errcode_t err;
+
+ err = ext2fs_get_mem(sizeof(*ractx), &ractx);
+ if (err)
+ return err;
+
+ ractx->fs = ctx->fs;
+ ractx->group = group;
+ ractx->ngroups = ngroups;
+
+ err = e2fsck_run_thread(&ctx->ra_thread, pass1_readahead,
+ pass1_readahead_cleanup, ractx);
+ if (err)
+ ext2fs_free_mem(&ractx);
+
+ return err;
+}
+
void e2fsck_pass1(e2fsck_t ctx)
{
int i;
@@ -611,10 +654,37 @@ void e2fsck_pass1(e2fsck_t ctx)
int busted_fs_time = 0;
int inode_size;
int failed_csum = 0;
+ dgrp_t grp;
+ ext2_ino_t ra_threshold = 0;
+ dgrp_t ra_groups = 0;
+ errcode_t err;
init_resource_track(&rtrack, ctx->fs->io);
clear_problem_context(&pctx);
+	/* If we can do readahead, figure out how many groups to pull in. */
+	if (!ext2fs_can_readahead(ctx->fs))
+		ctx->readahead_mem_kb = 0;
+	if (ctx->readahead_mem_kb) {
+		/* readahead budget divided by the KiB of inode table per group */
+		ra_groups = ctx->readahead_mem_kb /
+			    (fs->inode_blocks_per_group * fs->blocksize /
+			     1024);
+		/* fewer than 16 groups: do not use readahead in pass 1 */
+		if (ra_groups < 16)
+			ra_groups = 0;
+		else if (ra_groups > fs->group_desc_count)
+			ra_groups = fs->group_desc_count;
+		if (ra_groups) {
+			/*
+			 * The first batch starts at group 0.  grp has not
+			 * been assigned at this point, so it must not be
+			 * passed here.
+			 */
+			err = initiate_readahead(ctx, 0, ra_groups);
+			if (err) {
+				com_err(ctx->program_name, err, "%s",
+					_("while starting pass1 readahead"));
+				ra_groups = 0;
+			}
+			/* inode number past which the next batch is queued */
+			ra_threshold = ra_groups *
+				       fs->super->s_inodes_per_group;
+		}
+	}
+
if (!(ctx->options & E2F_OPT_PREEN))
fix_problem(ctx, PR_1_PASS_HEADER, &pctx);
@@ -778,6 +848,19 @@ void e2fsck_pass1(e2fsck_t ctx)
if (e2fsck_mmp_update(fs))
fatal_error(ctx, 0);
}
+ if (ra_groups && ino > ra_threshold) {
+ grp = (ino - 1) / fs->super->s_inodes_per_group;
+ ra_threshold = (grp + ra_groups) *
+ fs->super->s_inodes_per_group;
+ err = initiate_readahead(ctx, grp, ra_groups);
+ if (err == EAGAIN) {
+ printf("Disabling slow readahead.\n");
+ ra_groups = 0;
+ } else if (err) {
+ com_err(ctx->program_name, err, "%s",
+ _("while starting pass1 readahead"));
+ }
+ }
old_op = ehandler_operation(_("getting next inode from scan"));
pctx.errcode = ext2fs_get_next_inode_full(scan, &ino,
inode, inode_size);
diff --git a/e2fsck/pass2.c b/e2fsck/pass2.c
index 99b4042..292db82 100644
--- a/e2fsck/pass2.c
+++ b/e2fsck/pass2.c
@@ -61,6 +61,9 @@
* Keeps track of how many times an inode is referenced.
*/
static void deallocate_inode(e2fsck_t ctx, ext2_ino_t ino, char* block_buf);
+static int check_dir_block2(ext2_filsys fs,
+ struct ext2_db_entry2 *dir_blocks_info,
+ void *priv_data);
static int check_dir_block(ext2_filsys fs,
struct ext2_db_entry2 *dir_blocks_info,
void *priv_data);
@@ -77,8 +80,67 @@ struct check_dir_struct {
struct problem_context pctx;
int count, max;
e2fsck_t ctx;
+ int save_readahead;
+};
+
+struct pass2_readahead_data {
+ ext2_filsys fs;
+ ext2_dblist dblist;
};
+/*
+ * dblist iterator callback used on the private copy of the directory block
+ * list: sets every entry's blockcnt to 1 and keeps iterating (returns 0).
+ * fs and priv_data are unused.
+ *
+ * NOTE(review): presumably ext2fs_readahead_dblist() interprets blockcnt as
+ * the number of blocks to fetch for the entry -- confirm against libext2fs.
+ */
+static int readahead_dir_block(ext2_filsys fs, struct ext2_db_entry2 *db,
+ void *priv_data)
+{
+ db->blockcnt = 1;
+ return 0;
+}
+
+/*
+ * Cleanup callback for pass2_readahead(): frees the copied dblist and then
+ * the pass2_readahead_data itself.
+ */
+static void pass2_readahead_cleanup(void *p)
+{
+ struct pass2_readahead_data *pr = p;
+
+ ext2fs_free_dblist(pr->dblist);
+ ext2fs_free_mem(&pr);
+}
+
+/*
+ * Readahead job for pass 2: hands the private dblist copy in p to
+ * ext2fs_readahead_dblist().  Always returns NULL; no status is propagated.
+ */
+static void *pass2_readahead(void *p)
+{
+ struct pass2_readahead_data *pr = p;
+
+ ext2fs_readahead_dblist(pr->fs, 0, pr->dblist);
+ return NULL;
+}
+
+/*
+ * Start the pass 2 directory block readahead job.
+ *
+ * Copies ctx->fs->dblist, runs readahead_dir_block() over the copy, and
+ * passes the copy to e2fsck_run_thread() with pass2_readahead_cleanup() as
+ * the cleanup callback.  On success the copy is released by that callback;
+ * on any failure it is released here.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static errcode_t initiate_readahead(e2fsck_t ctx)
+{
+ struct pass2_readahead_data *pr;
+ errcode_t err;
+
+ err = ext2fs_get_mem(sizeof(*pr), &pr);
+ if (err)
+ return err;
+ pr->fs = ctx->fs;
+ err = ext2fs_copy_dblist(ctx->fs->dblist, &pr->dblist);
+ if (err)
+ goto out_pr;
+ err = ext2fs_dblist_iterate2(pr->dblist, readahead_dir_block,
+ NULL);
+ if (err)
+ goto out_dblist;
+ err = e2fsck_run_thread(&ctx->ra_thread, pass2_readahead,
+ pass2_readahead_cleanup, pr);
+ if (err)
+ goto out_dblist;
+
+ return 0;
+
+out_dblist:
+ ext2fs_free_dblist(pr->dblist);
+out_pr:
+ ext2fs_free_mem(&pr);
+ return err;
+}
+
void e2fsck_pass2(e2fsck_t ctx)
{
struct ext2_super_block *sb = ctx->fs->super;
@@ -96,6 +158,10 @@ void e2fsck_pass2(e2fsck_t ctx)
int i, depth;
problem_t code;
int bad_dir;
+ int (*check_dir_func)(ext2_filsys fs,
+ struct ext2_db_entry2 *dir_blocks_info,
+ void *priv_data);
+ errcode_t err;
init_resource_track(&rtrack, ctx->fs->io);
clear_problem_context(&cd.pctx);
@@ -139,6 +205,7 @@ void e2fsck_pass2(e2fsck_t ctx)
cd.ctx = ctx;
cd.count = 1;
cd.max = ext2fs_dblist_count2(fs->dblist);
+ cd.save_readahead = e2fsck_will_rehash_dirs(ctx);
if (ctx->progress)
(void) (ctx->progress)(ctx, 2, 0, cd.max);
@@ -146,7 +213,16 @@ void e2fsck_pass2(e2fsck_t ctx)
if (fs->super->s_feature_compat & EXT2_FEATURE_COMPAT_DIR_INDEX)
ext2fs_dblist_sort2(fs->dblist, special_dir_block_cmp);
- cd.pctx.errcode = ext2fs_dblist_iterate2(fs->dblist, check_dir_block,
+ if (ctx->readahead_mem_kb) {
+ check_dir_func = check_dir_block2;
+ err = initiate_readahead(ctx);
+ if (err)
+ com_err(ctx->program_name, err, "%s",
+ _("while starting pass2 readahead"));
+ } else
+ check_dir_func = check_dir_block;
+
+ cd.pctx.errcode = ext2fs_dblist_iterate2(fs->dblist, check_dir_func,
&cd);
if (ctx->flags & E2F_FLAG_SIGNAL_MASK || ctx->flags & E2F_FLAG_RESTART)
return;
@@ -655,6 +731,7 @@ clear_and_exit:
clear_htree(cd->ctx, cd->pctx.ino);
dx_dir->numblocks = 0;
e2fsck_rehash_dir_later(cd->ctx, cd->pctx.ino);
+ cd->save_readahead = 1;
}
#endif /* ENABLE_HTREE */
@@ -774,6 +851,19 @@ static errcode_t insert_dirent_tail(ext2_filsys fs, void *dirbuf)
return 0;
}
+/*
+ * Wrapper around check_dir_block() used when readahead is enabled.  After
+ * the block has been checked, and unless cd->save_readahead is set, it calls
+ * io_channel_cache_release() for that one block.  Returns whatever
+ * check_dir_block() returned.
+ */
+static int check_dir_block2(ext2_filsys fs,
+ struct ext2_db_entry2 *db,
+ void *priv_data)
+{
+ int err;
+ struct check_dir_struct *cd = priv_data;
+
+ err = check_dir_block(fs, db, priv_data);
+ /* save_readahead is set when a directory is queued for rehashing */
+ if (!cd->save_readahead)
+ io_channel_cache_release(fs->io, db->blk, 1);
+ return err;
+}
+
static int check_dir_block(ext2_filsys fs,
struct ext2_db_entry2 *db,
void *priv_data)
@@ -957,6 +1047,7 @@ out_htree:
&cd->pctx))
goto skip_checksum;
e2fsck_rehash_dir_later(ctx, ino);
+ cd->save_readahead = 1;
goto skip_checksum;
}
if (failed_csum) {
@@ -1249,6 +1340,7 @@ skip_checksum:
pctx.dirent = dirent;
fix_problem(ctx, PR_2_REPORT_DUP_DIRENT, &pctx);
e2fsck_rehash_dir_later(ctx, ino);
+ cd->save_readahead = 1;
dups_found++;
} else
dict_alloc_insert(&de_dict, dirent, dirent);
@@ -1316,6 +1408,8 @@ skip_checksum:
if (insert_dirent_tail(fs, buf) == 0)
goto write_and_fix;
e2fsck_rehash_dir_later(ctx, ino);
+ cd->save_readahead = 1;
+ }
write_and_fix:
if (e2fsck_dir_will_be_rehashed(ctx, ino))
diff --git a/e2fsck/pass4.c b/e2fsck/pass4.c
index 21d93f0..959dfc3 100644
--- a/e2fsck/pass4.c
+++ b/e2fsck/pass4.c
@@ -87,6 +87,21 @@ static int disconnect_inode(e2fsck_t ctx, ext2_ino_t i,
return 0;
}
+/*
+ * Since pass4 is mostly CPU bound, start readahead of bitmaps for pass 5.
+ *
+ * Readahead job: p is the ext2_filsys.  Requests the block and inode bitmaps
+ * (EXT2_READA_BBITMAP | EXT2_READA_IBITMAP) starting at group 0 for
+ * fs->group_desc_count groups.  Always returns NULL.
+ */
+static void *pass5_readahead(void *p)
+{
+ ext2_filsys fs = p;
+
+ ext2fs_readahead(fs, EXT2_READA_BBITMAP | EXT2_READA_IBITMAP, 0,
+ fs->group_desc_count);
+ return NULL;
+}
+
+/*
+ * Start the bitmap readahead job on ctx->ra_thread.  The job argument is
+ * ctx->fs itself, so no cleanup callback is registered.  Returns the result
+ * of e2fsck_run_thread().
+ */
+static errcode_t initiate_readahead(e2fsck_t ctx)
+{
+ return e2fsck_run_thread(&ctx->ra_thread, pass5_readahead, NULL,
+ ctx->fs);
+}
void e2fsck_pass4(e2fsck_t ctx)
{
@@ -100,12 +115,19 @@ void e2fsck_pass4(e2fsck_t ctx)
__u16 link_count, link_counted;
char *buf = 0;
dgrp_t group, maxgroup;
+ errcode_t err;
init_resource_track(&rtrack, ctx->fs->io);
#ifdef MTRACE
mtrace_print("Pass 4");
#endif
+ if (ctx->readahead_mem_kb) {
+ err = initiate_readahead(ctx);
+ if (err)
+ com_err(ctx->program_name, err, "%s",
+ _("while starting pass5 readahead"));
+ }
clear_problem_context(&pctx);
diff --git a/e2fsck/prof_err.et b/e2fsck/prof_err.et
index c9316c7..21fb524 100644
--- a/e2fsck/prof_err.et
+++ b/e2fsck/prof_err.et
@@ -62,5 +62,6 @@ error_code PROF_BAD_INTEGER, "Invalid integer value"
error_code PROF_MAGIC_FILE_DATA, "Bad magic value in profile_file_data_t"
+error_code E2FSCK_ET_MAGIC_RUN_THREAD, "Wrong magic number for e2fsck_thread structure"
end
diff --git a/e2fsck/rehash.c b/e2fsck/rehash.c
index 3b05715..89708c2 100644
--- a/e2fsck/rehash.c
+++ b/e2fsck/rehash.c
@@ -71,6 +71,16 @@ int e2fsck_dir_will_be_rehashed(e2fsck_t ctx, ext2_ino_t ino)
return ext2fs_u32_list_test(ctx->dirs_to_hash, ino);
}
+/*
+ * Ask if there will be a pass 3A.
+ *
+ * Returns 1 when E2F_OPT_COMPRESS_DIRS is set, 0 when no dirs_to_hash list
+ * exists, and otherwise non-zero iff that list has at least one entry.
+ */
+int e2fsck_will_rehash_dirs(e2fsck_t ctx)
+{
+ if (ctx->options & E2F_OPT_COMPRESS_DIRS)
+ return 1;
+ if (!ctx->dirs_to_hash)
+ return 0;
+ return ext2fs_u32_list_count(ctx->dirs_to_hash) > 0;
+}
+
struct fill_dir_struct {
char *buf;
struct ext2_inode *inode;
diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index 80ebdb1..d6ef8c5 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -74,7 +74,7 @@ static void usage(e2fsck_t ctx)
_("Usage: %s [-panyrcdfvtDFV] [-b superblock] [-B blocksize]\n"
"\t\t[-I inode_buffer_blocks] [-P process_inode_size]\n"
"\t\t[-l|-L bad_blocks_file] [-C fd] [-j external_journal]\n"
- "\t\t[-E extended-options] device\n"),
+ "\t\t[-E extended-options] [-R readahead_kb] device\n"),
ctx->program_name);
fprintf(stderr, "%s", _("\nEmergency help:\n"
@@ -90,6 +90,7 @@ static void usage(e2fsck_t ctx)
" -j external_journal Set location of the external journal\n"
" -l bad_blocks_file Add to badblocks list\n"
" -L bad_blocks_file Set badblocks list\n"
+ " -R readahead_kb Allow this much readahead.\n"
));
exit(FSCK_USAGE);
@@ -749,6 +750,7 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
#ifdef CONFIG_JBD_DEBUG
char *jbd_debug;
#endif
+ unsigned long long phys_mem_kb, reada_kb;
retval = e2fsck_allocate_context(&ctx);
if (retval)
@@ -776,8 +778,16 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
else
ctx->program_name = "e2fsck";
- while ((c = getopt (argc, argv, "panyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDk")) != EOF)
+ phys_mem_kb = get_memory_size() / 1024;
+ reada_kb = ~0ULL;
+ while ((c = getopt(argc, argv,
+ "panyrcC:B:dE:fvtFVM:b:I:j:P:l:L:N:SsDkR:")) != EOF)
switch (c) {
+ case 'R':
+ res = sscanf(optarg, "%llu", &reada_kb);
+ if (res != 1)
+ goto sscanf_err;
+ break;
case 'C':
ctx->progress = e2fsck_update_progress;
res = sscanf(optarg, "%d", &ctx->progress_fd);
@@ -965,6 +975,22 @@ static errcode_t PRS(int argc, char *argv[], e2fsck_t *ret_ctx)
if (c)
verbose = 1;
+ /* Figure out how much memory goes to readahead */
+ profile_get_integer(ctx->profile, "options", "readahead_mem_pct", 0,
+ 50, &c);
+ if (c >= 0 && c <= 100)
+ ctx->readahead_mem_kb = phys_mem_kb * c / 100;
+ else
+ ctx->readahead_mem_kb = phys_mem_kb / 2;
+ profile_get_integer(ctx->profile, "options", "readahead_mem_kb", 0,
+ -1, &c);
+ if (c >= 0)
+ ctx->readahead_mem_kb = c;
+ if (reada_kb != ~0ULL)
+ ctx->readahead_mem_kb = reada_kb;
+ if (ctx->readahead_mem_kb > phys_mem_kb)
+ ctx->readahead_mem_kb = phys_mem_kb;
+
/* Turn off discard in read-only mode */
if ((ctx->options & E2F_OPT_NO) &&
(ctx->options & E2F_OPT_DISCARD))
@@ -1782,6 +1808,11 @@ no_journal:
}
}
+ retval = e2fsck_stop_thread(&ctx->ra_thread, NULL);
+ if (retval)
+ com_err(ctx->program_name, retval, "%s",
+ _("while stopping readahead"));
+
e2fsck_write_bitmaps(ctx);
io_channel_flush(ctx->fs->io);
print_resource_track(ctx, NULL, &ctx->global_rtrack, ctx->fs->io);
diff --git a/e2fsck/util.c b/e2fsck/util.c
index fec6179..09b78c2 100644
--- a/e2fsck/util.c
+++ b/e2fsck/util.c
@@ -37,6 +37,10 @@
#include <errno.h>
#endif
+#ifdef HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>
+#endif
+
#include "e2fsck.h"
extern e2fsck_t e2fsck_global_ctx; /* Try your very best not to use this! */
@@ -845,3 +849,50 @@ errcode_t e2fsck_allocate_subcluster_bitmap(ext2_filsys fs, const char *descr,
fs->default_bitmap_type = save_type;
return retval;
}
+
+/*
+ * Return the physical memory size in bytes, or 0 if it cannot be determined
+ * on this platform (or the query fails).
+ */
+int64_t get_memory_size(void)
+{
+#if defined(_SC_PHYS_PAGES) && \
+	(defined(_SC_PAGESIZE) || defined(_SC_PAGE_SIZE))
+# if defined(_SC_PAGESIZE)
+	long page_size = sysconf(_SC_PAGESIZE);
+# else
+	long page_size = sysconf(_SC_PAGE_SIZE);
+# endif
+	long pages = sysconf(_SC_PHYS_PAGES);
+
+	/* sysconf() returns -1 on failure; never multiply error values */
+	if (pages < 0 || page_size < 0)
+		return 0;
+	return (int64_t)pages * (int64_t)page_size;
+#elif defined(_SC_AIX_REALMEM)
+	return (int64_t)sysconf(_SC_AIX_REALMEM) * (int64_t)1024L;
+#elif defined(CTL_HW)
+# if (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
+#  define CTL_HW_INT64
+# elif (defined(HW_PHYSMEM) || defined(HW_REALMEM))
+#  define CTL_HW_UINT
+# endif
+	int mib[2];
+	mib[0] = CTL_HW;
+# if defined(HW_MEMSIZE)
+	mib[1] = HW_MEMSIZE;
+# elif defined(HW_PHYSMEM64)
+	mib[1] = HW_PHYSMEM64;
+# elif defined(HW_REALMEM)
+	mib[1] = HW_REALMEM;
+# elif defined(HW_PHYSMEM)
+	mib[1] = HW_PHYSMEM;
+# endif
+# if defined(CTL_HW_INT64)
+	int64_t size = 0;
+# elif defined(CTL_HW_UINT)
+	unsigned int size = 0;
+# endif
+# if defined(CTL_HW_INT64) || defined(CTL_HW_UINT)
+	size_t len = sizeof(size);
+	if (sysctl(mib, 2, &size, &len, NULL, 0) == 0)
+		return (int64_t)size;
+# endif
+	return 0;
+#else
+# warning "Don't know how to detect memory on your platform?"
+	return 0;
+#endif
+}
diff --git a/lib/config.h.in b/lib/config.h.in
index e0384ee..836c2df 100644
--- a/lib/config.h.in
+++ b/lib/config.h.in
@@ -203,6 +203,9 @@
/* Define if your <locale.h> file defines LC_MESSAGES. */
#undef HAVE_LC_MESSAGES
+/* Define to 1 if you have the `pthread' library (-lpthread). */
+#undef HAVE_LIBPTHREAD
+
/* Define to 1 if you have the <limits.h> header file. */
#undef HAVE_LIMITS_H
@@ -314,6 +317,9 @@
/* Define to 1 if you have the `pread' function. */
#undef HAVE_PREAD
+/* Define to 1 if you have the <pthread.h> header file. */
+#undef HAVE_PTHREAD_H
+
/* Define to 1 if you have the `putenv' function. */
#undef HAVE_PUTENV
@@ -465,6 +471,9 @@
/* Define to 1 if you have the <sys/syscall.h> header file. */
#undef HAVE_SYS_SYSCALL_H
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+#undef HAVE_SYS_SYSCTL_H
+
/* Define to 1 if you have the <sys/sysmacros.h> header file. */
#undef HAVE_SYS_SYSMACROS_H
next prev parent reply other threads:[~2014-03-11 6:58 UTC|newest]
Thread overview: 88+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-03-11 6:53 [PATCH 00/49] e2fsprogs patchbomb 3/14 Darrick J. Wong
2014-03-11 6:54 ` [PATCH 01/49] create_inode: clean up return mess in do_write_internal Darrick J. Wong
2014-03-11 20:30 ` Andreas Dilger
2014-03-11 20:41 ` Darrick J. Wong
2014-03-11 21:08 ` Theodore Ts'o
2014-03-12 3:24 ` Theodore Ts'o
2014-03-11 6:54 ` [PATCH 02/49] create_inode: minor cleanups Darrick J. Wong
2014-03-11 20:31 ` Andreas Dilger
2014-03-12 3:25 ` Theodore Ts'o
2014-03-12 3:27 ` Theodore Ts'o
2014-03-11 6:54 ` [PATCH 03/49] create_inode: whitespace fixes Darrick J. Wong
2014-03-12 3:27 ` Theodore Ts'o
2014-03-11 6:54 ` [PATCH 04/49] create_inode: move debugfs internal state back to debugfs Darrick J. Wong
2014-03-12 3:31 ` Theodore Ts'o
2014-03-11 6:54 ` [PATCH 05/49] create_inode: handle hard link inum mappings per populate_fs invocation Darrick J. Wong
2014-03-12 3:46 ` Theodore Ts'o
2014-03-11 6:54 ` [PATCH 06/49] libext2fs: support modifying arbitrary extended attributes (v5) Darrick J. Wong
2014-03-12 3:51 ` Theodore Ts'o
2014-03-11 6:54 ` [PATCH 07/49] debugfs: create commands to edit extended attributes Darrick J. Wong
2014-03-12 3:51 ` Theodore Ts'o
2014-03-11 6:54 ` [PATCH 08/49] e2fsck: don't rehash inline directories Darrick J. Wong
2014-03-13 3:52 ` Theodore Ts'o
2014-03-13 5:38 ` Darrick J. Wong
2014-03-13 12:13 ` Theodore Ts'o
2014-03-11 6:54 ` [PATCH 09/49] libext2fs: don't fail when doing a strict rewrite of inline data Darrick J. Wong
2014-03-14 13:19 ` Theodore Ts'o
2014-03-11 6:55 ` [PATCH 10/49] libext2fs: fix iblocks correctly when expanding an inline_data file Darrick J. Wong
2014-03-12 16:38 ` Andreas Dilger
2014-03-12 17:01 ` Darrick J. Wong
2014-03-14 13:25 ` Theodore Ts'o
2014-03-11 6:55 ` [PATCH 11/49] e2fsck: zero errcode when checking inline data blocks Darrick J. Wong
2014-03-14 13:26 ` Theodore Ts'o
2014-03-11 6:55 ` [PATCH 12/49] libext2fs: during inlinedata expand, don't corrupt inode Darrick J. Wong
2014-03-14 13:29 ` Theodore Ts'o
2014-03-11 6:55 ` [PATCH 13/49] libext2fs: repair side effects when iterating dirents in inline dirs Darrick J. Wong
2014-03-14 13:30 ` Theodore Ts'o
2014-03-11 6:55 ` [PATCH 14/49] resize2fs: add inline dirs for remapping Darrick J. Wong
2014-03-14 13:31 ` Theodore Ts'o
2014-03-11 6:55 ` [PATCH 15/49] all: Introduce cppcheck static checking for make C=1 Darrick J. Wong
2014-03-14 13:33 ` Theodore Ts'o
2014-03-11 6:55 ` [PATCH 16/49] misc: cppcheck cleanups Darrick J. Wong
2014-03-14 13:34 ` Theodore Ts'o
2014-03-11 6:55 ` [PATCH 17/49] libext2fs: fix 64bit overflow in ext2fs_block_alloc_stats_range Darrick J. Wong
2014-03-14 13:35 ` Theodore Ts'o
2014-03-11 6:55 ` [PATCH 18/49] misc: fix header complaints and resource leaks in e2fsprogs Darrick J. Wong
2014-03-14 13:39 ` Theodore Ts'o
2014-03-14 13:53 ` Theodore Ts'o
2014-03-14 19:23 ` Darrick J. Wong
2014-03-11 6:55 ` [PATCH 19/49] libext2fs: fix memory leak when drastically shrinking extent tree depth Darrick J. Wong
2014-03-14 13:56 ` Theodore Ts'o
2014-03-11 6:56 ` [PATCH 20/49] libext2fs: fix parents when modifying extents Darrick J. Wong
2014-03-14 14:01 ` Theodore Ts'o
2014-03-14 20:13 ` Darrick J. Wong
2014-03-15 15:46 ` Theodore Ts'o
2014-03-17 16:59 ` Darrick J. Wong
2014-03-11 6:56 ` [PATCH 21/49] e2fsck: print runs of duplicate blocks instead of all of them Darrick J. Wong
2014-03-15 16:19 ` Theodore Ts'o
2014-03-11 6:56 ` [PATCH 22/49] e2fsck: verify checksums after checking everything else Darrick J. Wong
2014-03-11 6:56 ` [PATCH 23/49] e2fsck: fix the extended attribute checksum error message Darrick J. Wong
2014-03-11 6:56 ` [PATCH 24/49] e2fsck: insert a missing dirent tail for checksums if possible Darrick J. Wong
2014-03-11 6:56 ` [PATCH 25/49] e2fsck: write dir blocks after new inode when reconstructing root/lost+found Darrick J. Wong
2014-03-11 6:56 ` [PATCH 26/49] tests: add test for corrupted checksummed root directory block Darrick J. Wong
2014-03-11 6:56 ` [PATCH 27/49] dumpe2fs: add switch to disable checksum verification Darrick J. Wong
2014-03-11 6:56 ` [PATCH 28/49] mke2fs: set block_validity as a default mount option Darrick J. Wong
2014-03-11 6:57 ` [PATCH 29/49] libext2fs: support allocating uninit blocks in bmap2() Darrick J. Wong
2014-03-11 6:57 ` [PATCH 30/49] libext2fs: file IO routines should handle uninit blocks Darrick J. Wong
2014-03-11 6:57 ` [PATCH 31/49] resize2fs: convert fs to and from 64bit mode Darrick J. Wong
2014-03-11 6:57 ` [PATCH 32/49] resize2fs: when toggling 64bit, don't free in-use bg data clusters Darrick J. Wong
2014-03-11 6:57 ` [PATCH 33/49] resize2fs: adjust reserved_gdt_blocks when changing group descriptor size Darrick J. Wong
2014-03-11 6:57 ` [PATCH 34/49] libext2fs: have UNIX IO manager use pread/pwrite Darrick J. Wong
2014-03-11 6:57 ` [PATCH 35/49] ext2fs: add readahead method to improve scanning Darrick J. Wong
2014-03-17 22:07 ` Andreas Dilger
2014-03-11 6:57 ` [PATCH 36/49] libext2fs: allow clients to read-ahead metadata Darrick J. Wong
2014-03-17 23:11 ` Andreas Dilger
2014-03-11 6:57 ` Darrick J. Wong [this message]
2014-03-17 23:10 ` [PATCH 37/49] e2fsck: read-ahead metadata during passes 1, 2, and 4 Andreas Dilger
2014-03-18 4:42 ` Darrick J. Wong
2014-03-18 6:50 ` Darrick J. Wong
2014-03-11 6:58 ` [PATCH 38/49] libext2fs: when appending to a file, don't split an index block in equal halves Darrick J. Wong
2014-03-11 6:58 ` [PATCH 39/49] libext2fs: find inode goal when allocating blocks Darrick J. Wong
2014-03-11 6:58 ` [PATCH 40/49] libext2fs: find a range of empty blocks Darrick J. Wong
2014-03-11 6:58 ` [PATCH 41/49] libext2fs: provide a function to set inode size Darrick J. Wong
2014-03-11 6:58 ` [PATCH 42/49] libext2fs: implement fallocate Darrick J. Wong
2014-03-11 6:58 ` [PATCH 44/49] fuse2fs: translate ACL structures Darrick J. Wong
2014-03-11 6:58 ` [PATCH 45/49] fuse2fs: handle 64-bit dates correctly Darrick J. Wong
2014-03-11 6:58 ` [PATCH 46/49] fuse2fs: implement fallocate Darrick J. Wong
2014-03-11 6:59 ` [PATCH 48/49] tests: enable using fuse2fs with metadata checksum test Darrick J. Wong
2014-03-11 6:59 ` [PATCH 49/49] tests: test date handling Darrick J. Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20140311065757.30585.46729.stgit@birch.djwong.org \
--to=darrick.wong@oracle.com \
--cc=linux-ext4@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).