linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Darrick J. Wong" <darrick.wong@oracle.com>
To: tytso@mit.edu, darrick.wong@oracle.com
Cc: linux-ext4@vger.kernel.org
Subject: [PATCH 10/35] undo-io: add new calls to and speed up the undo io manager
Date: Wed, 01 Apr 2015 19:35:06 -0700	[thread overview]
Message-ID: <20150402023506.25243.44459.stgit@birch.djwong.org> (raw)
In-Reply-To: <20150402023359.25243.79782.stgit@birch.djwong.org>

Implement pass-through calls for discard, zero-out, and readahead in
the IO manager so that we can take advantage of any underlying
support.

Furthermore, improve tdb write-out speed by disabling locking and only
fsyncing at the end -- we don't care about locking because having
multiple writers to the undo file will produce an undo database full
of garbage blocks; and we only need to fsync at the end because if we
fail before the end, our undo file will lack the necessary superblock
data that e2undo requires to do replay safely.  Without this, we call
fsync four times per tdb update(!).  This change reduces the overhead
of using undo_io while converting a 2TB FS to metadata_csum from 3+
hours to 55 minutes.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 lib/ext2fs/tdb.c     |   10 ++++++
 lib/ext2fs/tdb.h     |    2 +
 lib/ext2fs/undo_io.c |   87 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 97 insertions(+), 2 deletions(-)


diff --git a/lib/ext2fs/tdb.c b/lib/ext2fs/tdb.c
index 1d97685..7317288 100644
--- a/lib/ext2fs/tdb.c
+++ b/lib/ext2fs/tdb.c
@@ -4142,3 +4142,13 @@ int tdb_reopen_all(int parent_longlived)
 
 	return 0;
 }
+
+/**
+ * Flush a database file from the page cache.
+ **/
+int tdb_flush(struct tdb_context *tdb)
+{
+	if (tdb->fd != -1)
+		return fsync(tdb->fd);
+	return 0;
+}
diff --git a/lib/ext2fs/tdb.h b/lib/ext2fs/tdb.h
index 732ef0e..6a4086c 100644
--- a/lib/ext2fs/tdb.h
+++ b/lib/ext2fs/tdb.h
@@ -129,6 +129,7 @@ typedef struct TDB_DATA {
 #define tdb_lockall_nonblock ext2fs_tdb_lockall_nonblock
 #define tdb_lockall_read_nonblock ext2fs_tdb_lockall_read_nonblock
 #define tdb_lockall_unmark ext2fs_tdb_lockall_unmark
+#define tdb_flush ext2fs_tdb_flush
 
 /* this is the context structure that is returned from a db open */
 typedef struct tdb_context TDB_CONTEXT;
@@ -191,6 +192,7 @@ size_t tdb_map_size(struct tdb_context *tdb);
 int tdb_get_flags(struct tdb_context *tdb);
 void tdb_enable_seqnum(struct tdb_context *tdb);
 void tdb_increment_seqnum_nonblock(struct tdb_context *tdb);
+int tdb_flush(struct tdb_context *tdb);
 
 /* Low level locking functions: use with care */
 int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key);
diff --git a/lib/ext2fs/undo_io.c b/lib/ext2fs/undo_io.c
index d6beb02..94317cb 100644
--- a/lib/ext2fs/undo_io.c
+++ b/lib/ext2fs/undo_io.c
@@ -37,6 +37,7 @@
 #if HAVE_SYS_RESOURCE_H
 #include <sys/resource.h>
 #endif
+#include <limits.h>
 
 #include "tdb.h"
 
@@ -354,8 +355,12 @@ static errcode_t undo_open(const char *name, int flags, io_channel *channel)
 		data->real = 0;
 	}
 
+	if (data->real)
+		io->flags = (io->flags & ~CHANNEL_FLAGS_DISCARD_ZEROES) |
+			    (data->real->flags & CHANNEL_FLAGS_DISCARD_ZEROES);
+
 	/* setup the tdb file */
-	data->tdb = tdb_open(tdb_file, 0, TDB_CLEAR_IF_FIRST,
+	data->tdb = tdb_open(tdb_file, 0, TDB_CLEAR_IF_FIRST | TDB_NOLOCK | TDB_NOSYNC,
 			     O_RDWR | O_CREAT | O_TRUNC | O_EXCL, 0600);
 	if (!data->tdb) {
 		retval = errno;
@@ -399,8 +404,10 @@ static errcode_t undo_close(io_channel channel)
 		return retval;
 	if (data->real)
 		retval = io_channel_close(data->real);
-	if (data->tdb)
+	if (data->tdb) {
+		tdb_flush(data->tdb);
 		tdb_close(data->tdb);
+	}
 	ext2fs_free_mem(&channel->private_data);
 	if (channel->name)
 		ext2fs_free_mem(&channel->name);
@@ -510,6 +517,77 @@ static errcode_t undo_write_byte(io_channel channel, unsigned long offset,
 	return retval;
 }
 
+static errcode_t undo_discard(io_channel channel, unsigned long long block,
+			      unsigned long long count)
+{
+	struct undo_private_data *data;
+	errcode_t	retval = 0;
+	int icount;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	data = (struct undo_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if (count > INT_MAX)
+		return EXT2_ET_UNIMPLEMENTED;
+	icount = count;
+
+	/*
+	 * First write the existing content into database
+	 */
+	retval = undo_write_tdb(channel, block, icount);
+	if (retval)
+		return retval;
+	if (data->real)
+		retval = io_channel_discard(data->real, block, count);
+
+	return retval;
+}
+
+static errcode_t undo_zeroout(io_channel channel, unsigned long long block,
+			      unsigned long long count)
+{
+	struct undo_private_data *data;
+	errcode_t	retval = 0;
+	int icount;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	data = (struct undo_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if (count > INT_MAX)
+		return EXT2_ET_UNIMPLEMENTED;
+	icount = count;
+
+	/*
+	 * First write the existing content into database
+	 */
+	retval = undo_write_tdb(channel, block, icount);
+	if (retval)
+		return retval;
+	if (data->real)
+		retval = io_channel_zeroout(data->real, block, count);
+
+	return retval;
+}
+
+static errcode_t undo_cache_readahead(io_channel channel,
+				      unsigned long long block,
+				      unsigned long long count)
+{
+	struct undo_private_data *data;
+	errcode_t	retval = 0;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	data = (struct undo_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if (data->real)
+		retval = io_channel_cache_readahead(data->real, block, count);
+
+	return retval;
+}
+
 /*
  * Flush data buffers to disk.
  */
@@ -522,6 +600,8 @@ static errcode_t undo_flush(io_channel channel)
 	data = (struct undo_private_data *) channel->private_data;
 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
 
+	if (data->tdb)
+		tdb_flush(data->tdb);
 	if (data->real)
 		retval = io_channel_flush(data->real);
 
@@ -601,6 +681,9 @@ static struct struct_io_manager struct_undo_manager = {
 	.get_stats	= undo_get_stats,
 	.read_blk64	= undo_read_blk64,
 	.write_blk64	= undo_write_blk64,
+	.discard	= undo_discard,
+	.zeroout	= undo_zeroout,
+	.cache_readahead	= undo_cache_readahead,
 };
 
 io_manager undo_io_manager = &struct_undo_manager;


  parent reply	other threads:[~2015-04-02  2:35 UTC|newest]

Thread overview: 70+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-04-02  2:34 [PATCH 00/35] e2fsprogs April 2015 patchbomb Darrick J. Wong
2015-04-02  2:34 ` [PATCH 01/35] e2fuzz: fuzz harder Darrick J. Wong
2015-04-21  1:47   ` Theodore Ts'o
2015-04-02  2:34 ` [PATCH 02/35] e2fsck: turn inline data symlink into a fast symlink when possible Darrick J. Wong
2015-04-21  1:47   ` Theodore Ts'o
2015-04-02  2:34 ` [PATCH 03/35] libext2fs/e2fsck: provide routines to read-ahead metadata Darrick J. Wong
2015-04-21  3:03   ` Theodore Ts'o
2015-04-02  2:34 ` [PATCH 04/35] e2fsck: read-ahead metadata during passes 1, 2, and 4 Darrick J. Wong
2015-04-21  3:03   ` Theodore Ts'o
2015-04-02  2:34 ` [PATCH 05/35] e2fsck: track directories to be rehashed with a bitmap Darrick J. Wong
2015-04-21  2:26   ` Theodore Ts'o
2015-04-21  4:43     ` Darrick J. Wong
2015-04-21 14:06       ` Theodore Ts'o
2015-04-02  2:34 ` [PATCH 06/35] e2fsck: rebuild sparse extent trees/convert non-extent ext3 files Darrick J. Wong
2015-04-21 16:33   ` Theodore Ts'o
2015-04-02  2:34 ` [PATCH 07/35] e2fsck: convert block-mapped files to extents on bigalloc fs Darrick J. Wong
2015-04-21 14:36   ` Theodore Ts'o
2015-05-05 22:45     ` Darrick J. Wong
2015-04-02  2:34 ` [PATCH 08/35] tests: verify proper rebuilding of sparse extent trees and block map file conversion Darrick J. Wong
2015-04-21 14:47   ` Theodore Ts'o
2015-04-02  2:35 ` [PATCH 09/35] e2fsck: abort on read error beyond end of FS Darrick J. Wong
2015-04-02  4:10   ` Andreas Dilger
     [not found]     ` <20150402060021.GP11031@birch.djwong.org>
     [not found]       ` <10D33B1F-52B7-4242-9A67-FB9E1CE75296@dilger.ca>
2015-04-06 18:57         ` Darrick J. Wong
2015-04-02  2:35 ` Darrick J. Wong [this message]
2015-04-02  4:06   ` [PATCH 10/35] undo-io: add new calls to and speed up the undo io manager Andreas Dilger
2015-04-21 15:00     ` Theodore Ts'o
2015-04-21 16:48       ` Theodore Ts'o
2015-04-22  2:47         ` Darrick J. Wong
2015-05-05 14:20   ` Theodore Ts'o
2015-04-02  2:35 ` [PATCH 11/35] undo-io: be more flexible about setting block size Darrick J. Wong
2015-05-05 14:21   ` Theodore Ts'o
2015-04-02  2:35 ` [PATCH 12/35] undo-io: use a bitmap to track what we've already written Darrick J. Wong
2015-05-05 14:21   ` Theodore Ts'o
2015-04-02  2:35 ` [PATCH 13/35] e2undo: fix memory leaks and tweak the error messages somewhat Darrick J. Wong
2015-05-05 14:22   ` Theodore Ts'o
2015-04-02  2:35 ` [PATCH 14/35] e2undo: ditch tdb file, write everything to a flat file Darrick J. Wong
2015-05-05 14:24   ` Theodore Ts'o
2015-04-02  2:35 ` [PATCH 15/35] libext2fs: support atexit cleanups Darrick J. Wong
2015-05-05 14:31   ` Theodore Ts'o
2015-04-02  2:35 ` [PATCH 16/35] e2fsck: optionally create an undo file Darrick J. Wong
2015-05-05 14:07   ` Theodore Ts'o
2015-04-02  2:35 ` [PATCH 17/35] resize2fs: optionally create " Darrick J. Wong
2015-05-05 14:36   ` Theodore Ts'o
2015-04-02  2:35 ` [PATCH 18/35] tune2fs: " Darrick J. Wong
2015-05-05 14:36   ` Theodore Ts'o
2015-04-02  2:36 ` [PATCH 19/35] mke2fs: " Darrick J. Wong
2015-05-05 14:37   ` Theodore Ts'o
2015-04-02  2:36 ` [PATCH 20/35] debugfs: " Darrick J. Wong
2015-05-05 14:43   ` Theodore Ts'o
2015-04-02  2:36 ` [PATCH 21/35] tests: test undo file creation in e2fsck/resize2fs/tune2fs/mke2fs Darrick J. Wong
2015-05-05 14:43   ` Theodore Ts'o
2015-04-02  2:36 ` [PATCH 22/35] tests: test various features of the new e2undo format Darrick J. Wong
2015-05-05 14:44   ` Theodore Ts'o
2015-04-02  2:36 ` [PATCH 23/35] copy-in: create hardlinks with the correct directory filetype Darrick J. Wong
2015-05-05 14:46   ` Theodore Ts'o
2015-04-02  2:36 ` [PATCH 24/35] copy-in: for files, only iterate file blocks that are mapped Darrick J. Wong
2015-05-05 14:49   ` Theodore Ts'o
2015-04-02  2:36 ` [PATCH 25/35] copyin: fix error handling Darrick J. Wong
2015-05-05 14:51   ` Theodore Ts'o
2015-04-02  2:36 ` [PATCH 26/35] mke2fs: add simple tests and re-alphabetize mke2fs manpage options Darrick J. Wong
2015-05-05 14:52   ` Theodore Ts'o
2015-04-02  2:37 ` [PATCH 27/35] contrib: script to create minified ext4 image from a directory Darrick J. Wong
2015-05-05 14:52   ` Theodore Ts'o
2015-04-02  2:37 ` [PATCH 28/35] libext2fs: support allocating uninit blocks in bmap2() Darrick J. Wong
2015-04-02  2:37 ` [PATCH 29/35] libext2fs: find/alloc a range of empty blocks Darrick J. Wong
2015-04-02  2:37 ` [PATCH 30/35] libext2fs: add new hooks to support large allocations Darrick J. Wong
2015-04-02  2:37 ` [PATCH 31/35] libext2fs: implement fallocate Darrick J. Wong
2015-04-02  2:37 ` [PATCH 32/35] libext2fs: use fallocate for creating journals and hugefiles Darrick J. Wong
2015-04-02  2:37 ` [PATCH 33/35] debugfs: implement fallocate Darrick J. Wong
2015-04-02  2:37 ` [PATCH 34/35] tests: test debugfs punch command Darrick J. Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150402023506.25243.44459.stgit@birch.djwong.org \
    --to=darrick.wong@oracle.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).