linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mke2fs: use lazy inode init on some discard-able devices
@ 2010-08-20 21:41 Eric Sandeen
  2010-08-23 10:49 ` Theodore Tso
  2010-09-20 13:23 ` Ted Ts'o
  0 siblings, 2 replies; 7+ messages in thread
From: Eric Sandeen @ 2010-08-20 21:41 UTC (permalink / raw)
  To: ext4 development

If a device supports discard -and- returns 0s for discarded blocks,
then we can skip the inode table initialization -and- the inode table
zeroing at mkfs time, and skip the lazy init as well since they are
already zeroed out.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---

diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index add7c0c..b7a9e12 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -88,7 +88,8 @@ int	force;
 int	noaction;
 int	journal_size;
 int	journal_flags;
-int	lazy_itable_init;
+int	lazy_itable_init;	/* use lazy inode table init */
+int	lazy_itable_zeroed;	/* inode table zeroed by discard */
 char	*bad_blocks_filename;
 __u32	fs_stride;
 
@@ -300,7 +301,7 @@ _("Warning: the backup superblock/group descriptors at block %u contain\n"
 	ext2fs_badblocks_list_iterate_end(bb_iter);
 }
 
-static void write_inode_tables(ext2_filsys fs, int lazy_flag)
+static void write_inode_tables(ext2_filsys fs, int lazy_flag, int lazy_zeroed)
 {
 	errcode_t	retval;
 	blk64_t		blk;
@@ -325,6 +326,9 @@ static void write_inode_tables(ext2_filsys fs, int lazy_flag)
 				 EXT2_INODE_SIZE(fs->super)) +
 				EXT2_BLOCK_SIZE(fs->super) - 1) /
 			       EXT2_BLOCK_SIZE(fs->super));
+			/* if pre-zeroed by discard, mark as such */
+			if (lazy_zeroed)
+				ext2fs_bg_flags_set(fs, i, EXT2_BG_INODE_ZEROED);
 		} else {
 			/* The kernel doesn't need to zero the itable blocks */
 			ext2fs_bg_flags_set(fs, i, EXT2_BG_INODE_ZEROED);
@@ -1901,7 +1905,11 @@ static int mke2fs_setup_tdb(const char *name, io_manager *io_ptr)
 #define BLKDISCARD	_IO(0x12,119)
 #endif
 
-static void mke2fs_discard_blocks(ext2_filsys fs)
+#ifndef BLKDISCARDZEROES
+#define BLKDISCARDZEROES _IO(0x12,124)
+#endif
+ 
+static int mke2fs_discard_blocks(ext2_filsys fs)
 {
 	int fd;
 	int ret;
@@ -1917,8 +1925,8 @@ static void mke2fs_discard_blocks(ext2_filsys fs)
 	fd = open64(fs->device_name, O_RDWR);
 
 	/*
-	 * We don't care about whether the ioctl succeeds; it's only an
-	 * optmization for SSDs or sparse storage.
+	 * We don't much care about whether the ioctl succeeds; it's only
+	 * an optimization for SSDs or thinly-provisioned storage.
 	 */
 	if (fd > 0) {
 		ret = ioctl(fd, BLKDISCARD, &range);
@@ -1933,9 +1941,26 @@ static void mke2fs_discard_blocks(ext2_filsys fs)
 		}
 		close(fd);
 	}
+	return ret;
+}
+
+static int mke2fs_discard_zeroes_data(ext2_filsys fs)
+{
+	int fd;
+	int ret;
+	int discard_zeroes_data = 0;
+
+	fd = open64(fs->device_name, O_RDWR);
+
+	if (fd > 0) {
+		ioctl(fd, BLKDISCARDZEROES, &discard_zeroes_data);
+		close(fd);
+	}
+	return discard_zeroes_data;
 }
 #else
-#define mke2fs_discard_blocks(fs)
+#define mke2fs_discard_blocks(fs)	1
+#define mke2fs_discard_zeroes_data(fs)	0
 #endif
 
 int main (int argc, char *argv[])
@@ -1996,8 +2021,17 @@ int main (int argc, char *argv[])
 	}
 
 	/* Can't undo discard ... */
-	if (discard && (io_ptr != undo_io_manager))
-		mke2fs_discard_blocks(fs);
+	if (discard && (io_ptr != undo_io_manager)) {
+		retval = mke2fs_discard_blocks(fs);
+
+		if (!retval && mke2fs_discard_zeroes_data(fs)) {
+			if (verbose)
+				printf(_("Discard succeeded and will return 0s "
+					 " - enabling lazy_itable_init\n"));
+			lazy_itable_init = 1;
+			lazy_itable_zeroed = 1;
+		}
+	}
 
 	sprintf(tdb_string, "tdb_data_size=%d", fs->blocksize <= 4096 ?
 		32768 : fs->blocksize * 8);
@@ -2147,7 +2181,7 @@ int main (int argc, char *argv[])
 				_("while zeroing block %llu at end of filesystem"),
 				ret_blk);
 		}
-		write_inode_tables(fs, lazy_itable_init);
+		write_inode_tables(fs, lazy_itable_init, lazy_itable_zeroed);
 		create_root_dir(fs);
 		create_lost_and_found(fs);
 		reserve_inodes(fs);


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] mke2fs: use lazy inode init on some discard-able devices
  2010-08-20 21:41 [PATCH] mke2fs: use lazy inode init on some discard-able devices Eric Sandeen
@ 2010-08-23 10:49 ` Theodore Tso
  2010-08-23 14:32   ` Eric Sandeen
  2010-09-20 13:23 ` Ted Ts'o
  1 sibling, 1 reply; 7+ messages in thread
From: Theodore Tso @ 2010-08-23 10:49 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: ext4 development


On Aug 20, 2010, at 5:41 PM, Eric Sandeen wrote:

> If a device supports discard -and- returns 0s for discarded blocks,
> then we can skip the inode table initialization -and- the inode table
> zeroing at mkfs time, and skip the lazy init as well since they are
> already zeroed out.
> 
> Signed-off-by: Eric Sandeen <sandeen@redhat.com>

This needs to be configurable in /etc/mke2fs.conf.  Without naming
the manufacturer, I'm aware of at least one device which claims 
that discard works, and will even return zeros --- but after a
power cycle, if the block has not been reallocated, will once again
return the old, pre-discard values that had been stored in that block.

In other words, the discard is not power-cycle persistent...

-- Ted



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mke2fs: use lazy inode init on some discard-able devices
  2010-08-23 10:49 ` Theodore Tso
@ 2010-08-23 14:32   ` Eric Sandeen
  2010-08-24  0:27     ` Andreas Dilger
  0 siblings, 1 reply; 7+ messages in thread
From: Eric Sandeen @ 2010-08-23 14:32 UTC (permalink / raw)
  To: Theodore Tso; +Cc: ext4 development

Theodore Tso wrote:
> On Aug 20, 2010, at 5:41 PM, Eric Sandeen wrote:
> 
>> If a device supports discard -and- returns 0s for discarded blocks,
>> then we can skip the inode table initialization -and- the inode table
>> zeroing at mkfs time, and skip the lazy init as well since they are
>> already zeroed out.
>> 
>> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
> 
> This needs to be configurable in /etc/mke2fs.conf.  Without naming 
> the manufacturer, I'm aware of at least one device which claims that
> discard works, and will even return zeros --- but after a power
> cycle, if the block has not been reallocated, will once again return
> the old, pre-discard values that had been stored in that block.
> 
> In other words, the discard is not power-cycle persistent...
> 
> -- Ted
> 
> 

yes, I've seen issues like that too.

TBH in that case I'd rather just drop the patch than make another
tunable for the user to figure out...

Making tunables for every permutation of broken hardware doesn't 
scale IMHO. Users will get it wrong as often as not, if they even know 
it's there (they'll find out it's there via some web forum or other, 
and it'll become a meme like "set this to go faster" rather than 
understanding all the implications.)

-Eric

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mke2fs: use lazy inode init on some discard-able devices
  2010-08-23 14:32   ` Eric Sandeen
@ 2010-08-24  0:27     ` Andreas Dilger
  0 siblings, 0 replies; 7+ messages in thread
From: Andreas Dilger @ 2010-08-24  0:27 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Theodore Tso, ext4 development

On 2010-08-23, at 08:32, Eric Sandeen wrote:
> Theodore Tso wrote:
>> On Aug 20, 2010, at 5:41 PM, Eric Sandeen wrote:
>> 
>>> If a device supports discard -and- returns 0s for discarded blocks,
>>> then we can skip the inode table initialization -and- the inode table
>>> zeroing at mkfs time, and skip the lazy init as well since they are
>>> already zeroed out.
>>> 
>>> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
>> 
>> This needs to be configurable in /etc/mke2fs.conf.  Without naming 
>> the manufacturer, I'm aware of at least one device which claims that
>> discard works, and will even return zeros --- but after a power
>> cycle, if the block has not been reallocated, will once again return
>> the old, pre-discard values that had been stored in that block.
>> 
>> In other words, the discard is not power-cycle persistent...
> 
> yes, I've seen issues like that too.
> 
> TBH in that case I'd rather just drop the patch than make another
> tunable for the user to figure out...

Something else we discussed was having mke2fs validate whether discard+zero actually works (i.e. write some small non-zero data, discard the whole device, then read back the previously written blocks and check that they are now zero).  Granted, it wouldn't handle this "it only fails after a power-cycle" problem, but it should detect gratuitously broken hardware.

That should be sufficiently safe until such time as inode checksums are available.  Note also that non-zero inode table blocks are only a major problem if some additional corruption causes the itable_uninit numbers to become invalid (e.g. a bad group checksum), in which case the old itable blocks will be used.

We also discussed doing this at least for sparse files, which makes a lot of sense for testing in any case, even if the default is not to do this for SSD devices until they smarten up.


Cheers, Andreas






^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mke2fs: use lazy inode init on some discard-able devices
  2010-08-20 21:41 [PATCH] mke2fs: use lazy inode init on some discard-able devices Eric Sandeen
  2010-08-23 10:49 ` Theodore Tso
@ 2010-09-20 13:23 ` Ted Ts'o
  2010-09-21  5:15   ` Andreas Dilger
  1 sibling, 1 reply; 7+ messages in thread
From: Ted Ts'o @ 2010-09-20 13:23 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: ext4 development

On Fri, Aug 20, 2010 at 04:41:14PM -0500, Eric Sandeen wrote:
> If a device supports discard -and- returns 0s for discarded blocks,
> then we can skip the inode table initialization -and- the inode table
> zeroing at mkfs time, and skip the lazy init as well since they are
> already zeroed out.
> 
> Signed-off-by: Eric Sandeen <sandeen@redhat.com>

Applied, with some minor changes so it can apply against the "maint"
branch, and to eliminate a global variable.

					- Ted

commit 6fcd6f84c235f4bf2bd9770f172837da9982eb6e
Author: Eric Sandeen <sandeen@redhat.com>
Date:   Fri Aug 20 16:41:14 2010 -0500

    mke2fs: use lazy inode init on some discard-able devices
    
    If a device supports discard -and- returns 0s for discarded blocks,
    then we can skip the inode table initialization -and- the inode table
    zeroing at mkfs time, and skip the lazy init as well since they are
    already zeroed out.
    
    Signed-off-by: Eric Sandeen <sandeen@redhat.com>
    Signed-off-by: Theodore Ts'o <tytso@mit.edu>

diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index e725cd1..7c337a0 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -85,7 +85,7 @@ int	force;
 int	noaction;
 int	journal_size;
 int	journal_flags;
-int	lazy_itable_init;
+int	lazy_itable_init;	/* use lazy inode table init */
 char	*bad_blocks_filename;
 __u32	fs_stride;
 
@@ -350,7 +350,7 @@ static void progress_close(struct progress_struct *progress)
 	fputs(_("done                            \n"), stdout);
 }
 
-static void write_inode_tables(ext2_filsys fs, int lazy_flag)
+static void write_inode_tables(ext2_filsys fs, int lazy_flag, int itable_zeroed)
 {
 	errcode_t	retval;
 	blk_t		blk;
@@ -377,7 +377,8 @@ static void write_inode_tables(ext2_filsys fs, int lazy_flag)
 				 EXT2_INODE_SIZE(fs->super)) +
 				EXT2_BLOCK_SIZE(fs->super) - 1) /
 			       EXT2_BLOCK_SIZE(fs->super));
-		} else {
+		}
+		if (!lazy_flag || itable_zeroed) {
 			/* The kernel doesn't need to zero the itable blocks */
 			fs->group_desc[i].bg_flags |= EXT2_BG_INODE_ZEROED;
 			ext2fs_group_desc_csum_set(fs, i);
@@ -1943,7 +1944,14 @@ static int mke2fs_setup_tdb(const char *name, io_manager *io_ptr)
 #define BLKDISCARD	_IO(0x12,119)
 #endif
 
-static void mke2fs_discard_blocks(ext2_filsys fs)
+#ifndef BLKDISCARDZEROES
+#define BLKDISCARDZEROES _IO(0x12,124)
+#endif
+
+/*
+ * Return zero if the discard succeeds, and -1 if the discard fails.
+ */
+static int mke2fs_discard_blocks(ext2_filsys fs)
 {
 	int fd;
 	int ret;
@@ -1958,10 +1966,6 @@ static void mke2fs_discard_blocks(ext2_filsys fs)
 
 	fd = open64(fs->device_name, O_RDWR);
 
-	/*
-	 * We don't care about whether the ioctl succeeds; it's only an
-	 * optmization for SSDs or sparse storage.
-	 */
 	if (fd > 0) {
 		ret = ioctl(fd, BLKDISCARD, &range);
 		if (verbose) {
@@ -1975,9 +1979,26 @@ static void mke2fs_discard_blocks(ext2_filsys fs)
 		}
 		close(fd);
 	}
+	return ret;
+}
+
+static int mke2fs_discard_zeroes_data(ext2_filsys fs)
+{
+	int fd;
+	int ret;
+	int discard_zeroes_data = 0;
+
+	fd = open64(fs->device_name, O_RDWR);
+
+	if (fd > 0) {
+		ioctl(fd, BLKDISCARDZEROES, &discard_zeroes_data);
+		close(fd);
+	}
+	return discard_zeroes_data;
 }
 #else
-#define mke2fs_discard_blocks(fs)
+#define mke2fs_discard_blocks(fs)	1
+#define mke2fs_discard_zeroes_data(fs)	0
 #endif
 
 int main (int argc, char *argv[])
@@ -1991,6 +2012,7 @@ int main (int argc, char *argv[])
 	io_manager	io_ptr;
 	char		tdb_string[40];
 	char		*hash_alg_str;
+	int		itable_zeroed = 0;
 
 #ifdef ENABLE_NLS
 	setlocale(LC_MESSAGES, "");
@@ -2025,8 +2047,17 @@ int main (int argc, char *argv[])
 	}
 
 	/* Can't undo discard ... */
-	if (discard && (io_ptr != undo_io_manager))
-		mke2fs_discard_blocks(fs);
+	if (discard && (io_ptr != undo_io_manager)) {
+		retval = mke2fs_discard_blocks(fs);
+
+		if (!retval && mke2fs_discard_zeroes_data(fs)) {
+			if (verbose)
+				printf(_("Discard succeeded and will return 0s "
+					 " - skipping inode table wipe\n"));
+			lazy_itable_init = 1;
+			itable_zeroed = 1;
+		}
+	}
 
 	sprintf(tdb_string, "tdb_data_size=%d", fs->blocksize <= 4096 ?
 		32768 : fs->blocksize * 8);
@@ -2172,7 +2203,7 @@ int main (int argc, char *argv[])
 				_("while zeroing block %u at end of filesystem"),
 				ret_blk);
 		}
-		write_inode_tables(fs, lazy_itable_init);
+		write_inode_tables(fs, lazy_itable_init, itable_zeroed);
 		create_root_dir(fs);
 		create_lost_and_found(fs);
 		reserve_inodes(fs);

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] mke2fs: use lazy inode init on some discard-able devices
  2010-09-20 13:23 ` Ted Ts'o
@ 2010-09-21  5:15   ` Andreas Dilger
  2010-09-21 16:01     ` Eric Sandeen
  0 siblings, 1 reply; 7+ messages in thread
From: Andreas Dilger @ 2010-09-21  5:15 UTC (permalink / raw)
  To: Ted Ts'o; +Cc: Eric Sandeen, ext4 development

On Fri, Aug 20, 2010 at 04:41:14PM -0500, Eric Sandeen wrote:
> If a device supports discard -and- returns 0s for discarded blocks,
> then we can skip the inode table initialization -and- the inode table
> zeroing at mkfs time, and skip the lazy init as well since they are
> already zeroed out.

Eric, do you have my patch for skipping the journal zeroing?  If you are already skipping the inode table zeroing (which is a much bigger deal) then you may as well skip zeroing the journal at the same time.

Cheers, Andreas






^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mke2fs: use lazy inode init on some discard-able devices
  2010-09-21  5:15   ` Andreas Dilger
@ 2010-09-21 16:01     ` Eric Sandeen
  0 siblings, 0 replies; 7+ messages in thread
From: Eric Sandeen @ 2010-09-21 16:01 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Ted Ts'o, ext4 development

On 09/21/2010 12:15 AM, Andreas Dilger wrote:
> On Fri, Aug 20, 2010 at 04:41:14PM -0500, Eric Sandeen wrote:
>> If a device supports discard -and- returns 0s for discarded blocks,
>> then we can skip the inode table initialization -and- the inode table
>> zeroing at mkfs time, and skip the lazy init as well since they are
>> already zeroed out.
> 
> Eric, do you have my patch for skipping the journal zeroing? If you
> are already skipping the inode table zeroing (which is a much bigger
> deal) then you may as well skip zeroing the journal at the same time.

Hm good point.  Might be in my ext4 folder somewhere ;)

-Eric

> Cheers, Andreas


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2010-09-21 16:01 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-20 21:41 [PATCH] mke2fs: use lazy inode init on some discard-able devices Eric Sandeen
2010-08-23 10:49 ` Theodore Tso
2010-08-23 14:32   ` Eric Sandeen
2010-08-24  0:27     ` Andreas Dilger
2010-09-20 13:23 ` Ted Ts'o
2010-09-21  5:15   ` Andreas Dilger
2010-09-21 16:01     ` Eric Sandeen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).