[PATCH 2/2] Add batched discard support for ext4.

linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 2/2] Add batched discard support for ext4.
  2010-04-19 10:55 ` [PATCH 1/2] Add ioctl FITRIM Lukas Czerner
@ 2010-04-19 10:55   ` Lukas Czerner
  2010-04-20 21:21     ` Greg Freemyer
  0 siblings, 1 reply; 53+ messages in thread
From: Lukas Czerner @ 2010-04-19 10:55 UTC (permalink / raw)
  To: linux-ext4
  Cc: Jeff Moyer, Edward Shishkin, Eric Sandeen, Ric Wheeler,
	Lukas Czerner

Create an ioctl which walks through all the free extents in each
allocating group and discard those extents. As an addition to improve
its performance one can specify minimum free extent length, so ioctl
will not bother with shorter extents.

This of course means, that with each invocation the ioctl must walk
through whole file system, checking and discarding free extents, which
is not very efficient. The best way to avoid this is to keep track of
deleted (freed) blocks. Then the ioctl have to trim just those free
extents which were recently freed.

In order to implement this I have added new bitmap into ext4_group_info
(bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
walk through bb_bitmap_deleted, compare deleted extents with free
extents trim them and then removes it from the bb_bitmap_deleted. 

But you may notice, that there is one problem. bb_bitmap_deleted does
not survive umount. To bypass the problem the first ioctl call have to
walk through whole file system trimming all free extents.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/ext4.h    |    4 +
 fs/ext4/mballoc.c |  207 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/ext4/super.c   |    1 +
 3 files changed, 202 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf..e25f672 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1437,6 +1437,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
+extern int ext4_trim_fs(unsigned int, struct super_block *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
@@ -1682,6 +1684,8 @@ struct ext4_group_info {
 #ifdef DOUBLE_CHECK
 	void            *bb_bitmap;
 #endif
+	void		*bb_bitmap_deleted;
+	ext4_grpblk_t	bb_deleted;
 	struct rw_semaphore alloc_sem;
 	ext4_grpblk_t	bb_counters[];	/* Nr of free power-of-two-block
 					 * regions, index is order.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bde9d0b..fbc83fe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2255,6 +2255,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_deleted = -1;
+
+
 
 #ifdef DOUBLE_CHECK
 	{
@@ -2469,6 +2472,7 @@ int ext4_mb_release(struct super_block *sb)
 #ifdef DOUBLE_CHECK
 			kfree(grinfo->bb_bitmap);
 #endif
+			kfree(grinfo->bb_bitmap_deleted);
 			ext4_lock_group(sb, i);
 			ext4_mb_cleanup_pa(grinfo);
 			ext4_unlock_group(sb, i);
@@ -2528,6 +2532,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	int err, count = 0, count2 = 0;
 	struct ext4_free_data *entry;
 	struct list_head *l, *ltmp;
+	void *bitmap_deleted = NULL;
 
 	list_for_each_safe(l, ltmp, &txn->t_private_list) {
 		entry = list_entry(l, struct ext4_free_data, list);
@@ -2543,6 +2548,14 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		/* there are blocks to put in buddy to make them really free */
 		count += entry->count;
 		count2++;
+
+		if (test_opt(sb, DISCARD) &&
+			(db->bb_bitmap_deleted == NULL) &&
+			(db->bb_deleted >= 0)) {
+			bitmap_deleted =
+				kmalloc(sb->s_blocksize, GFP_KERNEL);
+		}
+
 		ext4_lock_group(sb, entry->group);
 		/* Take it out of per group rb tree */
 		rb_erase(&entry->node, &(db->bb_free_root));
@@ -2555,17 +2568,24 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			page_cache_release(e4b.bd_buddy_page);
 			page_cache_release(e4b.bd_bitmap_page);
 		}
-		ext4_unlock_group(sb, entry->group);
-		if (test_opt(sb, DISCARD)) {
-			ext4_fsblk_t discard_block;
-
-			discard_block = entry->start_blk +
-				ext4_group_first_block_no(sb, entry->group);
-			trace_ext4_discard_blocks(sb,
-					(unsigned long long)discard_block,
-					entry->count);
-			sb_issue_discard(sb, discard_block, entry->count);
+		if (test_opt(sb, DISCARD) && (db->bb_deleted >= 0)) {
+			if (db->bb_bitmap_deleted == NULL) {
+				db->bb_bitmap_deleted = bitmap_deleted;
+				BUG_ON(db->bb_bitmap_deleted == NULL);
+
+				bitmap_deleted = NULL;
+				mb_clear_bits(db->bb_bitmap_deleted,
+					0, EXT4_BLOCKS_PER_GROUP(sb));
+			}
+
+			db->bb_deleted += entry->count;
+			mb_set_bits(db->bb_bitmap_deleted, entry->start_blk,
+				 entry->count);
 		}
+		ext4_unlock_group(sb, entry->group);
+
+		kfree(bitmap_deleted);
+
 		kmem_cache_free(ext4_free_ext_cachep, entry);
 		ext4_mb_release_desc(&e4b);
 	}
@@ -4639,3 +4659,170 @@ error_return:
 		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
+
+/* Trim "count" blocks starting at "start" in "group"
+ * This must be called under group lock
+ */
+void ext4_trim_extent(struct super_block *sb, int start, int count,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	ext4_fsblk_t discard_block;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_free_extent ex;
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+
+	discard_block = (ext4_fsblk_t)group *
+			EXT4_BLOCKS_PER_GROUP(sb)
+			+ start
+			+ le32_to_cpu(es->s_first_data_block);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long)discard_block,
+			count);
+	sb_issue_discard(sb, discard_block, count);
+
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+}
+
+/* Trim all free blocks, which have at least minlen length */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_grpblk_t start, next, count = 0;
+	ext4_group_t group;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	start = 0;
+	ext4_lock_group(sb, group);
+
+	while (start < max) {
+
+		start = mb_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minblocks) {
+
+			count += next - start;
+			ext4_trim_extent(sb, start,
+				next - start, group, e4b);
+
+		}
+		start = next + 1;
+	}
+
+	e4b->bd_info->bb_deleted = 0;
+	ext4_unlock_group(sb, group);
+
+	return count;
+}
+
+/* Trim only blocks which is free and in bb_bitmap_deleted */
+ext4_grpblk_t ext4_trim_deleted(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	struct ext4_group_info *grp;
+	ext4_group_t group;
+	ext4_grpblk_t max, next, count = 0, start = 0;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	grp = ext4_get_group_info(sb, group);
+
+	if (grp->bb_deleted < minblocks)
+		return 0;
+
+	ext4_lock_group(sb, group);
+
+	while (start < EXT4_BLOCKS_PER_GROUP(sb)) {
+		start = mb_find_next_bit(grp->bb_bitmap_deleted,
+			EXT4_BLOCKS_PER_GROUP(sb), start);
+		max = mb_find_next_zero_bit(grp->bb_bitmap_deleted,
+			EXT4_BLOCKS_PER_GROUP(sb), start);
+
+		while (start < max) {
+			start = mb_find_next_zero_bit(bitmap, max, start);
+			if (start >= max)
+				break;
+			next = mb_find_next_bit(bitmap, max, start);
+			if (next > max)
+				next = max;
+
+			if ((next - start) >= minblocks) {
+				count += next - start;
+
+				ext4_trim_extent(sb, start,
+					next - start, group, e4b);
+
+				mb_clear_bits(grp->bb_bitmap_deleted,
+					start, next - start);
+			}
+			start = next + 1;
+		}
+	}
+	grp->bb_deleted -= count;
+
+	ext4_unlock_group(sb, group);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	return count;
+}
+
+int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
+{
+	struct ext4_buddy e4b;
+	struct ext4_group_info *grp;
+	ext4_group_t group;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t minblocks;
+
+	if (!test_opt(sb, DISCARD))
+		return 0;
+
+	minblocks = DIV_ROUND_UP(minlen, sb->s_blocksize);
+	if (unlikely(minblocks > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	for (group = 0; group < ngroups; group++) {
+		int err;
+
+		err = ext4_mb_load_buddy(sb, group, &e4b);
+		if (err) {
+			ext4_error(sb, "Error in loading buddy "
+					"information for %u", group);
+			continue;
+		}
+		grp = ext4_get_group_info(sb, group);
+
+		if (grp->bb_deleted < 0) {
+			/* First run after mount */
+			ext4_trim_all_free(sb, &e4b, minblocks);
+		} else if (grp->bb_deleted >= minblocks) {
+			/* Trim only blocks deleted since first run */
+			ext4_trim_deleted(sb, &e4b, minblocks);
+		}
+
+		ext4_mb_release_desc(&e4b);
+	}
+
+	return 0;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c..253eb98 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1109,6 +1109,7 @@ static const struct super_operations ext4_sops = {
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
-- 
1.6.6.1


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-19 10:55   ` [PATCH 2/2] Add batched discard support for ext4 Lukas Czerner
@ 2010-04-20 21:21     ` Greg Freemyer
  2010-04-21  2:26       ` Mark Lord
  0 siblings, 1 reply; 53+ messages in thread
From: Greg Freemyer @ 2010-04-20 21:21 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: linux-ext4, Jeff Moyer, Edward Shishkin, Eric Sandeen,
	Ric Wheeler, Mark Lord

Mark,

This is the patch implementing the new discard logic.

You did the benchmarking last year, but I thought you found calling
trim one contiguous sector range at a time was too inefficient.

See my highlight below:

On Mon, Apr 19, 2010 at 6:55 AM, Lukas Czerner <lczerner@redhat.com> wrote:
> Create an ioctl which walks through all the free extents in each
> allocating group and discard those extents. As an addition to improve
> its performance one can specify minimum free extent length, so ioctl
> will not bother with shorter extents.
>
> This of course means, that with each invocation the ioctl must walk
> through whole file system, checking and discarding free extents, which
> is not very efficient. The best way to avoid this is to keep track of
> deleted (freed) blocks. Then the ioctl have to trim just those free
> extents which were recently freed.
>
> In order to implement this I have added new bitmap into ext4_group_info
> (bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
> walk through bb_bitmap_deleted, compare deleted extents with free
> extents trim them and then removes it from the bb_bitmap_deleted.
>
> But you may notice, that there is one problem. bb_bitmap_deleted does
> not survive umount. To bypass the problem the first ioctl call have to
> walk through whole file system trimming all free extents.
>
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> ---
>  fs/ext4/ext4.h    |    4 +
>  fs/ext4/mballoc.c |  207 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  fs/ext4/super.c   |    1 +
>  3 files changed, 202 insertions(+), 10 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index bf938cf..e25f672 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1437,6 +1437,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
>  extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
>  extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
>                                                ext4_group_t, int);
> +extern int ext4_trim_fs(unsigned int, struct super_block *);
> +
>  /* inode.c */
>  struct buffer_head *ext4_getblk(handle_t *, struct inode *,
>                                                ext4_lblk_t, int, int *);
> @@ -1682,6 +1684,8 @@ struct ext4_group_info {
>  #ifdef DOUBLE_CHECK
>        void            *bb_bitmap;
>  #endif
> +       void            *bb_bitmap_deleted;
> +       ext4_grpblk_t   bb_deleted;
>        struct rw_semaphore alloc_sem;
>        ext4_grpblk_t   bb_counters[];  /* Nr of free power-of-two-block
>                                         * regions, index is order.
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index bde9d0b..fbc83fe 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2255,6 +2255,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
>        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
>        init_rwsem(&meta_group_info[i]->alloc_sem);
>        meta_group_info[i]->bb_free_root = RB_ROOT;
> +       meta_group_info[i]->bb_deleted = -1;
> +
> +
>
>  #ifdef DOUBLE_CHECK
>        {
> @@ -2469,6 +2472,7 @@ int ext4_mb_release(struct super_block *sb)
>  #ifdef DOUBLE_CHECK
>                        kfree(grinfo->bb_bitmap);
>  #endif
> +                       kfree(grinfo->bb_bitmap_deleted);
>                        ext4_lock_group(sb, i);
>                        ext4_mb_cleanup_pa(grinfo);
>                        ext4_unlock_group(sb, i);
> @@ -2528,6 +2532,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
>        int err, count = 0, count2 = 0;
>        struct ext4_free_data *entry;
>        struct list_head *l, *ltmp;
> +       void *bitmap_deleted = NULL;
>
>        list_for_each_safe(l, ltmp, &txn->t_private_list) {
>                entry = list_entry(l, struct ext4_free_data, list);
> @@ -2543,6 +2548,14 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
>                /* there are blocks to put in buddy to make them really free */
>                count += entry->count;
>                count2++;
> +
> +               if (test_opt(sb, DISCARD) &&
> +                       (db->bb_bitmap_deleted == NULL) &&
> +                       (db->bb_deleted >= 0)) {
> +                       bitmap_deleted =
> +                               kmalloc(sb->s_blocksize, GFP_KERNEL);
> +               }
> +
>                ext4_lock_group(sb, entry->group);
>                /* Take it out of per group rb tree */
>                rb_erase(&entry->node, &(db->bb_free_root));
> @@ -2555,17 +2568,24 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
>                        page_cache_release(e4b.bd_buddy_page);
>                        page_cache_release(e4b.bd_bitmap_page);
>                }
> -               ext4_unlock_group(sb, entry->group);
> -               if (test_opt(sb, DISCARD)) {
> -                       ext4_fsblk_t discard_block;
> -
> -                       discard_block = entry->start_blk +
> -                               ext4_group_first_block_no(sb, entry->group);
> -                       trace_ext4_discard_blocks(sb,
> -                                       (unsigned long long)discard_block,
> -                                       entry->count);
> -                       sb_issue_discard(sb, discard_block, entry->count);
> +               if (test_opt(sb, DISCARD) && (db->bb_deleted >= 0)) {
> +                       if (db->bb_bitmap_deleted == NULL) {
> +                               db->bb_bitmap_deleted = bitmap_deleted;
> +                               BUG_ON(db->bb_bitmap_deleted == NULL);
> +
> +                               bitmap_deleted = NULL;
> +                               mb_clear_bits(db->bb_bitmap_deleted,
> +                                       0, EXT4_BLOCKS_PER_GROUP(sb));
> +                       }
> +
> +                       db->bb_deleted += entry->count;
> +                       mb_set_bits(db->bb_bitmap_deleted, entry->start_blk,
> +                                entry->count);
>                }
> +               ext4_unlock_group(sb, entry->group);
> +
> +               kfree(bitmap_deleted);
> +
>                kmem_cache_free(ext4_free_ext_cachep, entry);
>                ext4_mb_release_desc(&e4b);
>        }
> @@ -4639,3 +4659,170 @@ error_return:
>                kmem_cache_free(ext4_ac_cachep, ac);
>        return;
>  }
> +
> +/* Trim "count" blocks starting at "start" in "group"
> + * This must be called under group lock
> + */
> +void ext4_trim_extent(struct super_block *sb, int start, int count,
> +               ext4_group_t group, struct ext4_buddy *e4b)
> +{
> +       ext4_fsblk_t discard_block;
> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> +       struct ext4_free_extent ex;
> +
> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
> +
> +       ex.fe_start = start;
> +       ex.fe_group = group;
> +       ex.fe_len = count;
> +
> +       mb_mark_used(e4b, &ex);
> +       ext4_unlock_group(sb, group);
> +
> +       discard_block = (ext4_fsblk_t)group *
> +                       EXT4_BLOCKS_PER_GROUP(sb)
> +                       + start
> +                       + le32_to_cpu(es->s_first_data_block);
> +       trace_ext4_discard_blocks(sb,
> +                       (unsigned long long)discard_block,
> +                       count);
> +       sb_issue_discard(sb, discard_block, count);
> +
> +       ext4_lock_group(sb, group);
> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
> +}

Mark, unless I'm missing something, sb_issue_discard() above is going
to trigger a trim command for just the one range.  I thought the
benchmarks you did showed that a collection of ranges needed to be
built, then a single trim command invoked that trimmed that group of
ranges.

Greg

> +
> +/* Trim all free blocks, which have at least minlen length */
> +ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
> +               ext4_grpblk_t minblocks)
> +{
> +       void *bitmap;
> +       ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
> +       ext4_grpblk_t start, next, count = 0;
> +       ext4_group_t group;
> +
> +       BUG_ON(e4b == NULL);
> +
> +       bitmap = e4b->bd_bitmap;
> +       group = e4b->bd_group;
> +       start = 0;
> +       ext4_lock_group(sb, group);
> +
> +       while (start < max) {
> +
> +               start = mb_find_next_zero_bit(bitmap, max, start);
> +               if (start >= max)
> +                       break;
> +               next = mb_find_next_bit(bitmap, max, start);
> +
> +               if ((next - start) >= minblocks) {
> +
> +                       count += next - start;
> +                       ext4_trim_extent(sb, start,
> +                               next - start, group, e4b);
> +
> +               }
> +               start = next + 1;
> +       }
> +
> +       e4b->bd_info->bb_deleted = 0;
> +       ext4_unlock_group(sb, group);
> +
> +       return count;
> +}
> +
> +/* Trim only blocks which is free and in bb_bitmap_deleted */
> +ext4_grpblk_t ext4_trim_deleted(struct super_block *sb, struct ext4_buddy *e4b,
> +               ext4_grpblk_t minblocks)
> +{
> +       void *bitmap;
> +       struct ext4_group_info *grp;
> +       ext4_group_t group;
> +       ext4_grpblk_t max, next, count = 0, start = 0;
> +
> +       BUG_ON(e4b == NULL);
> +
> +       bitmap = e4b->bd_bitmap;
> +       group = e4b->bd_group;
> +       grp = ext4_get_group_info(sb, group);
> +
> +       if (grp->bb_deleted < minblocks)
> +               return 0;
> +
> +       ext4_lock_group(sb, group);
> +
> +       while (start < EXT4_BLOCKS_PER_GROUP(sb)) {
> +               start = mb_find_next_bit(grp->bb_bitmap_deleted,
> +                       EXT4_BLOCKS_PER_GROUP(sb), start);
> +               max = mb_find_next_zero_bit(grp->bb_bitmap_deleted,
> +                       EXT4_BLOCKS_PER_GROUP(sb), start);
> +
> +               while (start < max) {
> +                       start = mb_find_next_zero_bit(bitmap, max, start);
> +                       if (start >= max)
> +                               break;
> +                       next = mb_find_next_bit(bitmap, max, start);
> +                       if (next > max)
> +                               next = max;
> +
> +                       if ((next - start) >= minblocks) {
> +                               count += next - start;
> +
> +                               ext4_trim_extent(sb, start,
> +                                       next - start, group, e4b);
> +
> +                               mb_clear_bits(grp->bb_bitmap_deleted,
> +                                       start, next - start);
> +                       }
> +                       start = next + 1;
> +               }
> +       }
> +       grp->bb_deleted -= count;
> +
> +       ext4_unlock_group(sb, group);
> +
> +       ext4_debug("trimmed %d blocks in the group %d\n",
> +               count, group);
> +
> +       return count;
> +}
> +
> +int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
> +{
> +       struct ext4_buddy e4b;
> +       struct ext4_group_info *grp;
> +       ext4_group_t group;
> +       ext4_group_t ngroups = ext4_get_groups_count(sb);
> +       ext4_grpblk_t minblocks;
> +
> +       if (!test_opt(sb, DISCARD))
> +               return 0;
> +
> +       minblocks = DIV_ROUND_UP(minlen, sb->s_blocksize);
> +       if (unlikely(minblocks > EXT4_BLOCKS_PER_GROUP(sb)))
> +               return -EINVAL;
> +
> +       for (group = 0; group < ngroups; group++) {
> +               int err;
> +
> +               err = ext4_mb_load_buddy(sb, group, &e4b);
> +               if (err) {
> +                       ext4_error(sb, "Error in loading buddy "
> +                                       "information for %u", group);
> +                       continue;
> +               }
> +               grp = ext4_get_group_info(sb, group);
> +
> +               if (grp->bb_deleted < 0) {
> +                       /* First run after mount */
> +                       ext4_trim_all_free(sb, &e4b, minblocks);
> +               } else if (grp->bb_deleted >= minblocks) {
> +                       /* Trim only blocks deleted since first run */
> +                       ext4_trim_deleted(sb, &e4b, minblocks);
> +               }
> +
> +               ext4_mb_release_desc(&e4b);
> +       }
> +
> +       return 0;
> +}
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index e14d22c..253eb98 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1109,6 +1109,7 @@ static const struct super_operations ext4_sops = {
>        .quota_write    = ext4_quota_write,
>  #endif
>        .bdev_try_to_free_page = bdev_try_to_free_page,
> +       .trim_fs        = ext4_trim_fs
>  };
>
>  static const struct super_operations ext4_nojournal_sops = {
> --
> 1.6.6.1
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-20 21:21     ` Greg Freemyer
@ 2010-04-21  2:26       ` Mark Lord
  2010-04-21  2:45         ` Eric Sandeen
  0 siblings, 1 reply; 53+ messages in thread
From: Mark Lord @ 2010-04-21  2:26 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Lukas Czerner, linux-ext4, Jeff Moyer, Edward Shishkin,
	Eric Sandeen, Ric Wheeler

On 20/04/10 05:21 PM, Greg Freemyer wrote:
> Mark,
>
> This is the patch implementing the new discard logic.
..
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
..
>> +void ext4_trim_extent(struct super_block *sb, int start, int count,
>> +               ext4_group_t group, struct ext4_buddy *e4b)
>> +{
>> +       ext4_fsblk_t discard_block;
>> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>> +       struct ext4_free_extent ex;
>> +
>> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
>> +
>> +       ex.fe_start = start;
>> +       ex.fe_group = group;
>> +       ex.fe_len = count;
>> +
>> +       mb_mark_used(e4b,&ex);
>> +       ext4_unlock_group(sb, group);
>> +
>> +       discard_block = (ext4_fsblk_t)group *
>> +                       EXT4_BLOCKS_PER_GROUP(sb)
>> +                       + start
>> +                       + le32_to_cpu(es->s_first_data_block);
>> +       trace_ext4_discard_blocks(sb,
>> +                       (unsigned long long)discard_block,
>> +                       count);
>> +       sb_issue_discard(sb, discard_block, count);
>> +
>> +       ext4_lock_group(sb, group);
>> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
>> +}
>
> Mark, unless I'm missing something, sb_issue_discard() above is going
> to trigger a trim command for just the one range.  I thought the
> benchmarks you did showed that a collection of ranges needed to be
> built, then a single trim command invoked that trimmed that group of
> ranges.
..

Mmm.. If that's what it is doing, then this patch set would be a complete disaster.
It would take *hours* to do the initial TRIM.

Lukas ?

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21  2:26       ` Mark Lord
@ 2010-04-21  2:45         ` Eric Sandeen
  2010-04-21 18:59           ` Greg Freemyer
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Sandeen @ 2010-04-21  2:45 UTC (permalink / raw)
  To: Mark Lord
  Cc: Greg Freemyer, Lukas Czerner, linux-ext4, Jeff Moyer,
	Edward Shishkin, Eric Sandeen, Ric Wheeler

Mark Lord wrote:
> On 20/04/10 05:21 PM, Greg Freemyer wrote:
>> Mark,
>>
>> This is the patch implementing the new discard logic.
> ..
>> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> ..
>>> +void ext4_trim_extent(struct super_block *sb, int start, int count,
>>> +               ext4_group_t group, struct ext4_buddy *e4b)
>>> +{
>>> +       ext4_fsblk_t discard_block;
>>> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>>> +       struct ext4_free_extent ex;
>>> +
>>> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
>>> +
>>> +       ex.fe_start = start;
>>> +       ex.fe_group = group;
>>> +       ex.fe_len = count;
>>> +
>>> +       mb_mark_used(e4b,&ex);
>>> +       ext4_unlock_group(sb, group);
>>> +
>>> +       discard_block = (ext4_fsblk_t)group *
>>> +                       EXT4_BLOCKS_PER_GROUP(sb)
>>> +                       + start
>>> +                       + le32_to_cpu(es->s_first_data_block);
>>> +       trace_ext4_discard_blocks(sb,
>>> +                       (unsigned long long)discard_block,
>>> +                       count);
>>> +       sb_issue_discard(sb, discard_block, count);
>>> +
>>> +       ext4_lock_group(sb, group);
>>> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
>>> +}
>>
>> Mark, unless I'm missing something, sb_issue_discard() above is going
>> to trigger a trim command for just the one range.  I thought the
>> benchmarks you did showed that a collection of ranges needed to be
>> built, then a single trim command invoked that trimmed that group of
>> ranges.
> ..
> 
> Mmm.. If that's what it is doing, then this patch set would be a
> complete disaster.
> It would take *hours* to do the initial TRIM.
> 
> Lukas ?

I'm confused; do we have an interface to send a trim command for multiple ranges?

I didn't think so ...  Lukas' patch is finding free ranges (above a size threshold)
to discard; it's not doing it a block at a time, if that's the concern.

-Eric

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21  2:45         ` Eric Sandeen
@ 2010-04-21 18:59           ` Greg Freemyer
  2010-04-21 19:04             ` Ric Wheeler
  0 siblings, 1 reply; 53+ messages in thread
From: Greg Freemyer @ 2010-04-21 18:59 UTC (permalink / raw)
  To: Eric Sandeen
  Cc: Mark Lord, Lukas Czerner, linux-ext4, Jeff Moyer, Edward Shishkin,
	Eric Sandeen, Ric Wheeler

On Tue, Apr 20, 2010 at 10:45 PM, Eric Sandeen <sandeen@redhat.com> wrote:
> Mark Lord wrote:
>> On 20/04/10 05:21 PM, Greg Freemyer wrote:
>>> Mark,
>>>
>>> This is the patch implementing the new discard logic.
>> ..
>>> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
>> ..
>>>> +void ext4_trim_extent(struct super_block *sb, int start, int count,
>>>> +               ext4_group_t group, struct ext4_buddy *e4b)
>>>> +{
>>>> +       ext4_fsblk_t discard_block;
>>>> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>>>> +       struct ext4_free_extent ex;
>>>> +
>>>> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
>>>> +
>>>> +       ex.fe_start = start;
>>>> +       ex.fe_group = group;
>>>> +       ex.fe_len = count;
>>>> +
>>>> +       mb_mark_used(e4b,&ex);
>>>> +       ext4_unlock_group(sb, group);
>>>> +
>>>> +       discard_block = (ext4_fsblk_t)group *
>>>> +                       EXT4_BLOCKS_PER_GROUP(sb)
>>>> +                       + start
>>>> +                       + le32_to_cpu(es->s_first_data_block);
>>>> +       trace_ext4_discard_blocks(sb,
>>>> +                       (unsigned long long)discard_block,
>>>> +                       count);
>>>> +       sb_issue_discard(sb, discard_block, count);
>>>> +
>>>> +       ext4_lock_group(sb, group);
>>>> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
>>>> +}
>>>
>>> Mark, unless I'm missing something, sb_issue_discard() above is going
>>> to trigger a trim command for just the one range.  I thought the
>>> benchmarks you did showed that a collection of ranges needed to be
>>> built, then a single trim command invoked that trimmed that group of
>>> ranges.
>> ..
>>
>> Mmm.. If that's what it is doing, then this patch set would be a
>> complete disaster.
>> It would take *hours* to do the initial TRIM.
>>
>> Lukas ?
>
> I'm confused; do we have an interface to send a trim command for multiple ranges?
>
> I didn't think so ...  Lukas' patch is finding free ranges (above a size threshold)
> to discard; it's not doing it a block at a time, if that's the concern.
>
> -Eric

Eric,

I don't know what kernel APIs have been created to support discard,
but the ATA8 draft spec. allows for specifying multiple ranges in one
trim command.

See section 7.10.3.1 and .2 of the latest draft spec.

Both talk about multiple trim ranges per trim command (think thousands
of ranges per command).

Recent hdparm versions accept a trim command argument that causes
multiple ranges to be trimmed per command.

 --trim-sector-ranges        Tell SSD firmware to discard unneeded
data sectors: lba:count ..
 --trim-sector-ranges-stdin  Same as above, but reads lba:count pairs from stdin

As I understand it, this is critical from a performance perspective
for the SSDs Mark tested with.  ie. He found a single trim command
with 1000 ranges takes much less time than 1000 discrete trim
commands.

Per Mark's comment's in wiper.sh, a trim command can have a minimum of
128KB of associated range information, so it is thousands of ranges
that can be discarded in a single command

ie. hdparm can accept extremely large lists of ranges on stdin, but it
parses the list into discrete trim commands with thousands of ranges
per command.

A kernel implementation which is trying to implement after that fact
discards as this patch is doing, also needs to somehow craft trim
commands with a large payload of ranges if it is going to be
efficient.

If the block layer cannot do this yet, then in my opinion this type of
batched discarding needs to stay in user space as done with Mark's
wiper.sh script and enhanced hdparm until the block layer grows that
ability.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 18:59           ` Greg Freemyer
@ 2010-04-21 19:04             ` Ric Wheeler
  2010-04-21 19:22               ` Jeff Moyer
  0 siblings, 1 reply; 53+ messages in thread
From: Ric Wheeler @ 2010-04-21 19:04 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Eric Sandeen, Mark Lord, Lukas Czerner, linux-ext4, Jeff Moyer,
	Edward Shishkin, Eric Sandeen

On 04/21/2010 02:59 PM, Greg Freemyer wrote:
> On Tue, Apr 20, 2010 at 10:45 PM, Eric Sandeen<sandeen@redhat.com>  wrote:
>> Mark Lord wrote:
>>> On 20/04/10 05:21 PM, Greg Freemyer wrote:
>>>> Mark,
>>>>
>>>> This is the patch implementing the new discard logic.
>>> ..
>>>> Signed-off-by: Lukas Czerner<lczerner@redhat.com>
>>> ..
>>>>> +void ext4_trim_extent(struct super_block *sb, int start, int count,
>>>>> +               ext4_group_t group, struct ext4_buddy *e4b)
>>>>> +{
>>>>> +       ext4_fsblk_t discard_block;
>>>>> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>>>>> +       struct ext4_free_extent ex;
>>>>> +
>>>>> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
>>>>> +
>>>>> +       ex.fe_start = start;
>>>>> +       ex.fe_group = group;
>>>>> +       ex.fe_len = count;
>>>>> +
>>>>> +       mb_mark_used(e4b,&ex);
>>>>> +       ext4_unlock_group(sb, group);
>>>>> +
>>>>> +       discard_block = (ext4_fsblk_t)group *
>>>>> +                       EXT4_BLOCKS_PER_GROUP(sb)
>>>>> +                       + start
>>>>> +                       + le32_to_cpu(es->s_first_data_block);
>>>>> +       trace_ext4_discard_blocks(sb,
>>>>> +                       (unsigned long long)discard_block,
>>>>> +                       count);
>>>>> +       sb_issue_discard(sb, discard_block, count);
>>>>> +
>>>>> +       ext4_lock_group(sb, group);
>>>>> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
>>>>> +}
>>>>
>>>> Mark, unless I'm missing something, sb_issue_discard() above is going
>>>> to trigger a trim command for just the one range.  I thought the
>>>> benchmarks you did showed that a collection of ranges needed to be
>>>> built, then a single trim command invoked that trimmed that group of
>>>> ranges.
>>> ..
>>>
>>> Mmm.. If that's what it is doing, then this patch set would be a
>>> complete disaster.
>>> It would take *hours* to do the initial TRIM.
>>>
>>> Lukas ?
>>
>> I'm confused; do we have an interface to send a trim command for multiple ranges?
>>
>> I didn't think so ...  Lukas' patch is finding free ranges (above a size threshold)
>> to discard; it's not doing it a block at a time, if that's the concern.
>>
>> -Eric
>
> Eric,
>
> I don't know what kernel APIs have been created to support discard,
> but the ATA8 draft spec. allows for specifying multiple ranges in one
> trim command.
>

Greg,

We have full support for this in the "discard" support at the file system layer 
for several file systems.

The block layer effectively muxes the "discard" into the right target device 
command. TRIM for ATA, WRITE_SAME (with unmap) or UNMAP for SCSI...

If your favourite fs supports this, you can enable this feature with "-o 
discard" for fine grained discards,

ric

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 19:04             ` Ric Wheeler
@ 2010-04-21 19:22               ` Jeff Moyer
  2010-04-21 20:44                 ` Greg Freemyer
  2010-04-21 20:52                 ` Greg Freemyer
  0 siblings, 2 replies; 53+ messages in thread
From: Jeff Moyer @ 2010-04-21 19:22 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: Greg Freemyer, Eric Sandeen, Mark Lord, Lukas Czerner, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

Ric Wheeler <rwheeler@redhat.com> writes:

> On 04/21/2010 02:59 PM, Greg Freemyer wrote:
>> On Tue, Apr 20, 2010 at 10:45 PM, Eric Sandeen<sandeen@redhat.com>  wrote:
>>> Mark Lord wrote:
>>>> On 20/04/10 05:21 PM, Greg Freemyer wrote:
>>>>> Mark,
>>>>>
>>>>> This is the patch implementing the new discard logic.
>>>> ..
>>>>> Signed-off-by: Lukas Czerner<lczerner@redhat.com>
>>>> ..
>>>>>> +void ext4_trim_extent(struct super_block *sb, int start, int count,
>>>>>> +               ext4_group_t group, struct ext4_buddy *e4b)
>>>>>> +{
>>>>>> +       ext4_fsblk_t discard_block;
>>>>>> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>>>>>> +       struct ext4_free_extent ex;
>>>>>> +
>>>>>> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
>>>>>> +
>>>>>> +       ex.fe_start = start;
>>>>>> +       ex.fe_group = group;
>>>>>> +       ex.fe_len = count;
>>>>>> +
>>>>>> +       mb_mark_used(e4b,&ex);
>>>>>> +       ext4_unlock_group(sb, group);
>>>>>> +
>>>>>> +       discard_block = (ext4_fsblk_t)group *
>>>>>> +                       EXT4_BLOCKS_PER_GROUP(sb)
>>>>>> +                       + start
>>>>>> +                       + le32_to_cpu(es->s_first_data_block);
>>>>>> +       trace_ext4_discard_blocks(sb,
>>>>>> +                       (unsigned long long)discard_block,
>>>>>> +                       count);
>>>>>> +       sb_issue_discard(sb, discard_block, count);
>>>>>> +
>>>>>> +       ext4_lock_group(sb, group);
>>>>>> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
>>>>>> +}
>>>>>
>>>>> Mark, unless I'm missing something, sb_issue_discard() above is going
>>>>> to trigger a trim command for just the one range.  I thought the
>>>>> benchmarks you did showed that a collection of ranges needed to be
>>>>> built, then a single trim command invoked that trimmed that group of
>>>>> ranges.
>>>> ..
>>>>
>>>> Mmm.. If that's what it is doing, then this patch set would be a
>>>> complete disaster.
>>>> It would take *hours* to do the initial TRIM.

Except it doesn't.  Lukas did provide numbers in his original email.

>>>> Lukas ?
>>>
>>> I'm confused; do we have an interface to send a trim command for multiple ranges?
>>>
>>> I didn't think so ...  Lukas' patch is finding free ranges (above a size threshold)
>>> to discard; it's not doing it a block at a time, if that's the concern.
>>>
>>> -Eric
>>
>> Eric,
>>
>> I don't know what kernel APIs have been created to support discard,
>> but the ATA8 draft spec. allows for specifying multiple ranges in one
>> trim command.

Well, sb_issue_discard is what ext4 is using, and that takes a single
range.  I don't know if anyone has looked into adding a vectored API.

>
> Greg,
>
> We have full support for this in the "discard" support at the file
> system layer for several file systems.

Actually, we don't support what Greg is talking about, to my knowledge.

> The block layer effectively muxes the "discard" into the right target
> device command. TRIM for ATA, WRITE_SAME (with unmap) or UNMAP for
> SCSI...
>
> If your favourite fs supports this, you can enable this feature with
> "-o 
> discard" for fine grained discards,

Thanks, it's worth pointing out that TRIM is not the only backend to the
discard API.  However, even if we do implement a vectored API, we can
translate that to dumber commands if a given spec doesn't support it.

Getting back to the problem...

>From the file system, you want to discard discrete ranges of blocks.
The API to support this can either take care of the data integrity
guarantees by itself, or make the upper layer ensure that trim and write
do not pass each other.  The current implementation does the latter.  In
order to do the former, there is the potential for a lot of overhead to
be introduced into the block allocation layers for the file systems.

So, given the above, it is up to the file system to send down the
biggest discard requests it can in order to reduce the overhead of the
command.  If a vectored approach is made available, then that would be
even better.  Christoph, is this something that's on your radar?

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 19:22               ` Jeff Moyer
@ 2010-04-21 20:44                 ` Greg Freemyer
  2010-04-21 20:53                   ` Greg Freemyer
                                     ` (2 more replies)
  2010-04-21 20:52                 ` Greg Freemyer
  1 sibling, 3 replies; 53+ messages in thread
From: Greg Freemyer @ 2010-04-21 20:44 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: Ric Wheeler, Eric Sandeen, Mark Lord, Lukas Czerner, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

On Wed, Apr 21, 2010 at 3:22 PM, Jeff Moyer <jmoyer@redhat.com> wrote:
> Ric Wheeler <rwheeler@redhat.com> writes:
>
>> On 04/21/2010 02:59 PM, Greg Freemyer wrote:
>>> On Tue, Apr 20, 2010 at 10:45 PM, Eric Sandeen<sandeen@redhat.com>  wrote:
>>>> Mark Lord wrote:
>>>>> On 20/04/10 05:21 PM, Greg Freemyer wrote:
>>>>>> Mark,
>>>>>>
>>>>>> This is the patch implementing the new discard logic.
>>>>> ..
>>>>>> Signed-off-by: Lukas Czerner<lczerner@redhat.com>
>>>>> ..
>>>>>>> +void ext4_trim_extent(struct super_block *sb, int start, int count,
>>>>>>> +               ext4_group_t group, struct ext4_buddy *e4b)
>>>>>>> +{
>>>>>>> +       ext4_fsblk_t discard_block;
>>>>>>> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>>>>>>> +       struct ext4_free_extent ex;
>>>>>>> +
>>>>>>> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
>>>>>>> +
>>>>>>> +       ex.fe_start = start;
>>>>>>> +       ex.fe_group = group;
>>>>>>> +       ex.fe_len = count;
>>>>>>> +
>>>>>>> +       mb_mark_used(e4b,&ex);
>>>>>>> +       ext4_unlock_group(sb, group);
>>>>>>> +
>>>>>>> +       discard_block = (ext4_fsblk_t)group *
>>>>>>> +                       EXT4_BLOCKS_PER_GROUP(sb)
>>>>>>> +                       + start
>>>>>>> +                       + le32_to_cpu(es->s_first_data_block);
>>>>>>> +       trace_ext4_discard_blocks(sb,
>>>>>>> +                       (unsigned long long)discard_block,
>>>>>>> +                       count);
>>>>>>> +       sb_issue_discard(sb, discard_block, count);
>>>>>>> +
>>>>>>> +       ext4_lock_group(sb, group);
>>>>>>> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
>>>>>>> +}
>>>>>>
>>>>>> Mark, unless I'm missing something, sb_issue_discard() above is going
>>>>>> to trigger a trim command for just the one range.  I thought the
>>>>>> benchmarks you did showed that a collection of ranges needed to be
>>>>>> built, then a single trim command invoked that trimmed that group of
>>>>>> ranges.
>>>>> ..
>>>>>
>>>>> Mmm.. If that's what it is doing, then this patch set would be a
>>>>> complete disaster.
>>>>> It would take *hours* to do the initial TRIM.
>
> Except it doesn't.  Lukas did provide numbers in his original email.
>

Looking at the benchmarks (for the first time) at
http://people.redhat.com/jmoyer/discard/ext4_batched_discard/

I don't see anything that says how long the proposed trim ioctl takes
to complete on the full filesystem.

What they do show is that with the 3 test SSDs used for this
benchmark, the current released discard implementation is a net loss.
ie. You are better off running without the discards for all 3 vendors.
 (at least under the conditions tested.)

After the patch is applied and optimizing the discards to large free
extents only, it works out to same performance with or without the
discards.  ie. no net gain or loss.

That is extremely cool because one assumes that the non-discard case
would degrade over time, but that the discard case will not.

So that argues for the current proposed patch going in.

But quoting from the first email:

==
The basic idea behind my discard support is to create an ioctl which
walks through all the free extents in each allocating group and discard
those extents. As an addition to improve its performance one can specify
minimum free extent length, so ioctl will not bother with shorter extents.

This of course means, that with each invocation the ioctl must walk
through whole file system, checking and discarding free extents, which
is not very efficient. The best way to avoid this is to keep track of
deleted (freed) blocks. Then the ioctl have to trim just those free
extents which were recently freed.

In order to implement this I have added new bitmap into ext4_group_info
(bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
walk through bb_bitmap_deleted, compare deleted extents with free
extents trim them and then removes it from the bb_bitmap_deleted.

But you may notice, that there is one problem. bb_bitmap_deleted does
not survive umount. To bypass the problem the first ioctl call have to
walk through whole file system trimming all free extents. But there is a
better solution to this problem. The bb_bitmap_deleted can be stored on
disk an can be restored in mount time along with other bitmaps, but I
think it is a quite big change and should be discussed further.
==

The above seems to argue against the patch going in until the
mount/umount issues are addressed.

So in addition to this patch, Lukas is proposing a on disk change to
address the fact that calling trim upteen times at mount time is too
slow.

Per Mark's testing of last summer, an alternative solution is to use a
vectored trim approach that is far more efficient.

Mark's benchmarks showed this as doable in seconds which seems like a
reasonable amount of time for a mount time operation.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 19:22               ` Jeff Moyer
  2010-04-21 20:44                 ` Greg Freemyer
@ 2010-04-21 20:52                 ` Greg Freemyer
  1 sibling, 0 replies; 53+ messages in thread
From: Greg Freemyer @ 2010-04-21 20:52 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: Ric Wheeler, Eric Sandeen, Mark Lord, Lukas Czerner, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

correcting Christoph's email address - no other edits/comments

On Wed, Apr 21, 2010 at 3:22 PM, Jeff Moyer <jmoyer@redhat.com> wrote:
> Ric Wheeler <rwheeler@redhat.com> writes:
>
>> On 04/21/2010 02:59 PM, Greg Freemyer wrote:
>>> On Tue, Apr 20, 2010 at 10:45 PM, Eric Sandeen<sandeen@redhat.com>  wrote:
>>>> Mark Lord wrote:
>>>>> On 20/04/10 05:21 PM, Greg Freemyer wrote:
>>>>>> Mark,
>>>>>>
>>>>>> This is the patch implementing the new discard logic.
>>>>> ..
>>>>>> Signed-off-by: Lukas Czerner<lczerner@redhat.com>
>>>>> ..
>>>>>>> +void ext4_trim_extent(struct super_block *sb, int start, int count,
>>>>>>> +               ext4_group_t group, struct ext4_buddy *e4b)
>>>>>>> +{
>>>>>>> +       ext4_fsblk_t discard_block;
>>>>>>> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>>>>>>> +       struct ext4_free_extent ex;
>>>>>>> +
>>>>>>> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
>>>>>>> +
>>>>>>> +       ex.fe_start = start;
>>>>>>> +       ex.fe_group = group;
>>>>>>> +       ex.fe_len = count;
>>>>>>> +
>>>>>>> +       mb_mark_used(e4b,&ex);
>>>>>>> +       ext4_unlock_group(sb, group);
>>>>>>> +
>>>>>>> +       discard_block = (ext4_fsblk_t)group *
>>>>>>> +                       EXT4_BLOCKS_PER_GROUP(sb)
>>>>>>> +                       + start
>>>>>>> +                       + le32_to_cpu(es->s_first_data_block);
>>>>>>> +       trace_ext4_discard_blocks(sb,
>>>>>>> +                       (unsigned long long)discard_block,
>>>>>>> +                       count);
>>>>>>> +       sb_issue_discard(sb, discard_block, count);
>>>>>>> +
>>>>>>> +       ext4_lock_group(sb, group);
>>>>>>> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
>>>>>>> +}
>>>>>>
>>>>>> Mark, unless I'm missing something, sb_issue_discard() above is going
>>>>>> to trigger a trim command for just the one range.  I thought the
>>>>>> benchmarks you did showed that a collection of ranges needed to be
>>>>>> built, then a single trim command invoked that trimmed that group of
>>>>>> ranges.
>>>>> ..
>>>>>
>>>>> Mmm.. If that's what it is doing, then this patch set would be a
>>>>> complete disaster.
>>>>> It would take *hours* to do the initial TRIM.
>
> Except it doesn't.  Lukas did provide numbers in his original email.
>
>>>>> Lukas ?
>>>>
>>>> I'm confused; do we have an interface to send a trim command for multiple ranges?
>>>>
>>>> I didn't think so ...  Lukas' patch is finding free ranges (above a size threshold)
>>>> to discard; it's not doing it a block at a time, if that's the concern.
>>>>
>>>> -Eric
>>>
>>> Eric,
>>>
>>> I don't know what kernel APIs have been created to support discard,
>>> but the ATA8 draft spec. allows for specifying multiple ranges in one
>>> trim command.
>
> Well, sb_issue_discard is what ext4 is using, and that takes a single
> range.  I don't know if anyone has looked into adding a vectored API.
>
>>
>> Greg,
>>
>> We have full support for this in the "discard" support at the file
>> system layer for several file systems.
>
> Actually, we don't support what Greg is talking about, to my knowledge.
>
>> The block layer effectively muxes the "discard" into the right target
>> device command. TRIM for ATA, WRITE_SAME (with unmap) or UNMAP for
>> SCSI...
>>
>> If your favourite fs supports this, you can enable this feature with
>> "-o
>> discard" for fine grained discards,
>
> Thanks, it's worth pointing out that TRIM is not the only backend to the
> discard API.  However, even if we do implement a vectored API, we can
> translate that to dumber commands if a given spec doesn't support it.
>
> Getting back to the problem...
>
> From the file system, you want to discard discrete ranges of blocks.
> The API to support this can either take care of the data integrity
> guarantees by itself, or make the upper layer ensure that trim and write
> do not pass each other.  The current implementation does the latter.  In
> order to do the former, there is the potential for a lot of overhead to
> be introduced into the block allocation layers for the file systems.
>
> So, given the above, it is up to the file system to send down the
> biggest discard requests it can in order to reduce the overhead of the
> command.  If a vectored approach is made available, then that would be
> even better.  Christoph, is this something that's on your radar?
>
> Cheers,
> Jeff
>



-- 
Greg Freemyer
Head of EDD Tape Extraction and Processing team
Litigation Triage Solutions Specialist
http://www.linkedin.com/in/gregfreemyer
CNN/TruTV Aired Forensic Imaging Demo -
   http://insession.blogs.cnn.com/2010/03/23/how-computer-evidence-gets-retrieved/

The Norcross Group
The Intersection of Evidence & Technology
http://www.norcrossgroup.com
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 20:44                 ` Greg Freemyer
@ 2010-04-21 20:53                   ` Greg Freemyer
  2010-04-21 21:01                   ` Eric Sandeen
  2010-04-23  8:23                   ` Lukas Czerner
  2 siblings, 0 replies; 53+ messages in thread
From: Greg Freemyer @ 2010-04-21 20:53 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: Ric Wheeler, Eric Sandeen, Mark Lord, Lukas Czerner, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

correcting Christoph's email address - no other edits/comments

On Wed, Apr 21, 2010 at 4:44 PM, Greg Freemyer <greg.freemyer@gmail.com> wrote:
> On Wed, Apr 21, 2010 at 3:22 PM, Jeff Moyer <jmoyer@redhat.com> wrote:
>> Ric Wheeler <rwheeler@redhat.com> writes:
>>
>>> On 04/21/2010 02:59 PM, Greg Freemyer wrote:
>>>> On Tue, Apr 20, 2010 at 10:45 PM, Eric Sandeen<sandeen@redhat.com>  wrote:
>>>>> Mark Lord wrote:
>>>>>> On 20/04/10 05:21 PM, Greg Freemyer wrote:
>>>>>>> Mark,
>>>>>>>
>>>>>>> This is the patch implementing the new discard logic.
>>>>>> ..
>>>>>>> Signed-off-by: Lukas Czerner<lczerner@redhat.com>
>>>>>> ..
>>>>>>>> +void ext4_trim_extent(struct super_block *sb, int start, int count,
>>>>>>>> +               ext4_group_t group, struct ext4_buddy *e4b)
>>>>>>>> +{
>>>>>>>> +       ext4_fsblk_t discard_block;
>>>>>>>> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
>>>>>>>> +       struct ext4_free_extent ex;
>>>>>>>> +
>>>>>>>> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
>>>>>>>> +
>>>>>>>> +       ex.fe_start = start;
>>>>>>>> +       ex.fe_group = group;
>>>>>>>> +       ex.fe_len = count;
>>>>>>>> +
>>>>>>>> +       mb_mark_used(e4b,&ex);
>>>>>>>> +       ext4_unlock_group(sb, group);
>>>>>>>> +
>>>>>>>> +       discard_block = (ext4_fsblk_t)group *
>>>>>>>> +                       EXT4_BLOCKS_PER_GROUP(sb)
>>>>>>>> +                       + start
>>>>>>>> +                       + le32_to_cpu(es->s_first_data_block);
>>>>>>>> +       trace_ext4_discard_blocks(sb,
>>>>>>>> +                       (unsigned long long)discard_block,
>>>>>>>> +                       count);
>>>>>>>> +       sb_issue_discard(sb, discard_block, count);
>>>>>>>> +
>>>>>>>> +       ext4_lock_group(sb, group);
>>>>>>>> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
>>>>>>>> +}
>>>>>>>
>>>>>>> Mark, unless I'm missing something, sb_issue_discard() above is going
>>>>>>> to trigger a trim command for just the one range.  I thought the
>>>>>>> benchmarks you did showed that a collection of ranges needed to be
>>>>>>> built, then a single trim command invoked that trimmed that group of
>>>>>>> ranges.
>>>>>> ..
>>>>>>
>>>>>> Mmm.. If that's what it is doing, then this patch set would be a
>>>>>> complete disaster.
>>>>>> It would take *hours* to do the initial TRIM.
>>
>> Except it doesn't.  Lukas did provide numbers in his original email.
>>
>
> Looking at the benchmarks (for the first time) at
> http://people.redhat.com/jmoyer/discard/ext4_batched_discard/
>
> I don't see anything that says how long the proposed trim ioctl takes
> to complete on the full filesystem.
>
> What they do show is that with the 3 test SSDs used for this
> benchmark, the current released discard implementation is a net loss.
> ie. You are better off running without the discards for all 3 vendors.
>  (at least under the conditions tested.)
>
> After the patch is applied and optimizing the discards to large free
> extents only, it works out to same performance with or without the
> discards.  ie. no net gain or loss.
>
> That is extremely cool because one assumes that the non-discard case
> would degrade over time, but that the discard case will not.
>
> So that argues for the current proposed patch going in.
>
> But quoting from the first email:
>
> ==
> The basic idea behind my discard support is to create an ioctl which
> walks through all the free extents in each allocating group and discard
> those extents. As an addition to improve its performance one can specify
> minimum free extent length, so ioctl will not bother with shorter extents.
>
> This of course means, that with each invocation the ioctl must walk
> through whole file system, checking and discarding free extents, which
> is not very efficient. The best way to avoid this is to keep track of
> deleted (freed) blocks. Then the ioctl have to trim just those free
> extents which were recently freed.
>
> In order to implement this I have added new bitmap into ext4_group_info
> (bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
> walk through bb_bitmap_deleted, compare deleted extents with free
> extents trim them and then removes it from the bb_bitmap_deleted.
>
> But you may notice, that there is one problem. bb_bitmap_deleted does
> not survive umount. To bypass the problem the first ioctl call have to
> walk through whole file system trimming all free extents. But there is a
> better solution to this problem. The bb_bitmap_deleted can be stored on
> disk an can be restored in mount time along with other bitmaps, but I
> think it is a quite big change and should be discussed further.
> ==
>
> The above seems to argue against the patch going in until the
> mount/umount issues are addressed.
>
> So in addition to this patch, Lukas is proposing a on disk change to
> address the fact that calling trim upteen times at mount time is too
> slow.
>
> Per Mark's testing of last summer, an alternative solution is to use a
> vectored trim approach that is far more efficient.
>
> Mark's benchmarks showed this as doable in seconds which seems like a
> reasonable amount of time for a mount time operation.
>
> Greg
>



-- 
Greg Freemyer
Head of EDD Tape Extraction and Processing team
Litigation Triage Solutions Specialist
http://www.linkedin.com/in/gregfreemyer
CNN/TruTV Aired Forensic Imaging Demo -
   http://insession.blogs.cnn.com/2010/03/23/how-computer-evidence-gets-retrieved/

The Norcross Group
The Intersection of Evidence & Technology
http://www.norcrossgroup.com
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 20:44                 ` Greg Freemyer
  2010-04-21 20:53                   ` Greg Freemyer
@ 2010-04-21 21:01                   ` Eric Sandeen
  2010-04-21 21:03                     ` Ric Wheeler
  2010-04-23  8:23                   ` Lukas Czerner
  2 siblings, 1 reply; 53+ messages in thread
From: Eric Sandeen @ 2010-04-21 21:01 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Jeff Moyer, Ric Wheeler, Eric Sandeen, Mark Lord, Lukas Czerner,
	linux-ext4, Edward Shishkin, Christoph Hellwig

On 04/21/2010 03:44 PM, Greg Freemyer wrote:

> Mark's benchmarks showed this as doable in seconds which seems like a
> reasonable amount of time for a mount time operation.

All the other things aside, mount-time is interesting, but it's an
infrequent operation, at least in my world.  I think we need something
that can be done runtime.

For anything with uptime, I don't think it's acceptable to wait until
the next mount to trim unused blocks.

But as long as the mechanism can be called either at mount time and/or
kicked off runtime somehow, I'm happy.

-Eric

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 21:01                   ` Eric Sandeen
@ 2010-04-21 21:03                     ` Ric Wheeler
  2010-04-21 21:47                       ` Greg Freemyer
  0 siblings, 1 reply; 53+ messages in thread
From: Ric Wheeler @ 2010-04-21 21:03 UTC (permalink / raw)
  To: sandeen
  Cc: Eric Sandeen, Greg Freemyer, Jeff Moyer, Mark Lord, Lukas Czerner,
	linux-ext4, Edward Shishkin, Christoph Hellwig

On 04/21/2010 05:01 PM, Eric Sandeen wrote:
> On 04/21/2010 03:44 PM, Greg Freemyer wrote:
>
>    
>> Mark's benchmarks showed this as doable in seconds which seems like a
>> reasonable amount of time for a mount time operation.
>>      
> All the other things aside, mount-time is interesting, but it's an
> infrequent operation, at least in my world.  I think we need something
> that can be done runtime.
>
> For anything with uptime, I don't think it's acceptable to wait until
> the next mount to trim unused blocks.
>
> But as long as the mechanism can be called either at mount time and/or
> kicked off runtime somehow, I'm happy.
>
> -Eric
>    

That makes sense to me.  Most enterprise servers will go without 
remounting a file system for (hopefully!) a very long time.

It is really important to keep in mind that this is not just a laptop 
feature for laptop SSD's, this is also used by high end arrays and 
*could* be useful for virt IO, etc as well :-)

ric


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 21:03                     ` Ric Wheeler
@ 2010-04-21 21:47                       ` Greg Freemyer
  2010-04-21 21:56                         ` James Bottomley
  2010-04-21 21:59                         ` Mark Lord
  0 siblings, 2 replies; 53+ messages in thread
From: Greg Freemyer @ 2010-04-21 21:47 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: sandeen, Eric Sandeen, Jeff Moyer, Mark Lord, Lukas Czerner,
	linux-ext4, Edward Shishkin, Christoph Hellwig, James Bottomley

Adding James Bottomley because high-end scsi is entering the
discussion.  James, I have a couple scsi questions for you at the end.

On Wed, Apr 21, 2010 at 5:03 PM, Ric Wheeler <rwheeler@redhat.com> wrote:
> On 04/21/2010 05:01 PM, Eric Sandeen wrote:
>>
>> On 04/21/2010 03:44 PM, Greg Freemyer wrote:
>>
>>
>>>
>>> Mark's benchmarks showed this as doable in seconds which seems like a
>>> reasonable amount of time for a mount time operation.
>>>
>>
>> All the other things aside, mount-time is interesting, but it's an
>> infrequent operation, at least in my world.  I think we need something
>> that can be done runtime.
>>
>> For anything with uptime, I don't think it's acceptable to wait until
>> the next mount to trim unused blocks.
>>
>> But as long as the mechanism can be called either at mount time and/or
>> kicked off runtime somehow, I'm happy.
>>
>> -Eric
>>
>
> That makes sense to me.  Most enterprise servers will go without remounting
> a file system for (hopefully!) a very long time.
>
> It is really important to keep in mind that this is not just a laptop
> feature for laptop SSD's, this is also used by high end arrays and *could*
> be useful for virt IO, etc as well :-)
>
> ric

I'm not arguing that a runtime solution is not needed.

I'm arguing that at least for SSD backed filesystems Mark's userspace
implementation shows how the mount time initialization of the runtime
bitmap can be accomplished in a few seconds by leveraging the hardware
and using vector'ed trims as opposed to having to build an additional
on-disk structure.

At least for SSDs, the primary purpose of the proposed on-disk
structure seems to be to overcome the current lack of a vector'ed
discard implementation.

If it is too difficult to implement a fully functional vector'ed
discard in the block layer due to locking issues, possibly a special
purpose version could be written that is only used at mount time when
one can be assured no other i/o is occurring to the filesystem.

James,

The ATA-8 spec. supports vectored trims and requires a minimum of 255
sectors worth of range payload be supported.  That equates to a single
trim being able to trim thousands of ranges in one command.

Mark Lord has benchmarked in found a vectored trim to be drastically
faster than calling trim individually for each of those ranges.

Does scsi support vector'ed discard? (ie. write-same commands)

Or are high-end scsi arrays so fast they can process tens of thousands
of discard commands in a reasonable amount of time, unlike the SSDs
have so far proven to do.

It would be interesting to find out that a SSD can discard thousands
of ranges drastically faster than a high-end scsi device can.  But if
true, that might argue for the on-disk bitmap to track previously
discarded blocks/extents.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 21:47                       ` Greg Freemyer
@ 2010-04-21 21:56                         ` James Bottomley
  2010-04-21 21:59                         ` Mark Lord
  1 sibling, 0 replies; 53+ messages in thread
From: James Bottomley @ 2010-04-21 21:56 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Ric Wheeler, sandeen, Eric Sandeen, Jeff Moyer, Mark Lord,
	Lukas Czerner, linux-ext4, Edward Shishkin, Christoph Hellwig

On Wed, 2010-04-21 at 17:47 -0400, Greg Freemyer wrote:
> Adding James Bottomley because high-end scsi is entering the
> discussion.  James, I have a couple scsi questions for you at the end.
> 
> On Wed, Apr 21, 2010 at 5:03 PM, Ric Wheeler <rwheeler@redhat.com> wrote:
> > On 04/21/2010 05:01 PM, Eric Sandeen wrote:
> >>
> >> On 04/21/2010 03:44 PM, Greg Freemyer wrote:
> >>
> >>
> >>>
> >>> Mark's benchmarks showed this as doable in seconds which seems like a
> >>> reasonable amount of time for a mount time operation.
> >>>
> >>
> >> All the other things aside, mount-time is interesting, but it's an
> >> infrequent operation, at least in my world.  I think we need something
> >> that can be done runtime.
> >>
> >> For anything with uptime, I don't think it's acceptable to wait until
> >> the next mount to trim unused blocks.

So what's wrong with using wiper.sh?  It can do online discard of
filesystems that support delayed allocation (ext4, xfs etc.)?

> >> But as long as the mechanism can be called either at mount time and/or
> >> kicked off runtime somehow, I'm happy.
> >>
> >> -Eric
> >>
> >
> > That makes sense to me.  Most enterprise servers will go without remounting
> > a file system for (hopefully!) a very long time.
> >
> > It is really important to keep in mind that this is not just a laptop
> > feature for laptop SSD's, this is also used by high end arrays and *could*
> > be useful for virt IO, etc as well :-)
> >
> > ric
> 
> I'm not arguing that a runtime solution is not needed.
> 
> I'm arguing that at least for SSD backed filesystems Mark's userspace
> implementation shows how the mount time initialization of the runtime
> bitmap can be accomplished in a few seconds by leveraging the hardware
> and using vector'ed trims as opposed to having to build an additional
> on-disk structure.
> 
> At least for SSDs, the primary purpose of the proposed on-disk
> structure seems to be to overcome the current lack of a vector'ed
> discard implementation.
> 
> If it is too difficult to implement a fully functional vector'ed
> discard in the block layer due to locking issues, possibly a special
> purpose version could be written that is only used at mount time when
> one can be assured no other i/o is occurring to the filesystem.
> 
> James,
> 
> The ATA-8 spec. supports vectored trims and requires a minimum of 255
> sectors worth of range payload be supported.  That equates to a single
> trim being able to trim thousands of ranges in one command.
> 
> Mark Lord has benchmarked in found a vectored trim to be drastically
> faster than calling trim individually for each of those ranges.
> 
> Does scsi support vector'ed discard? (ie. write-same commands)

only with UNMAP.  WRITE SAME is effectively single range.

> Or are high-end scsi arrays so fast they can process tens of thousands
> of discard commands in a reasonable amount of time, unlike the SSDs
> have so far proven to do.

No ... they actually have two problems: firstly they can only use
discard ranges which align with their internal block size (usually
something huge like 3/4MB) and then a trim operation tends to be O(1)
and slow, so they'd actually like discard accumulation.

> It would be interesting to find out that a SSD can discard thousands
> of ranges drastically faster than a high-end scsi device can.  But if
> true, that might argue for the on-disk bitmap to track previously
> discarded blocks/extents.

I think SSDs and Arrays both have discard problems, arrays more to do
with the time and expense of the operation, SSDs because the TRIM
command isn't queued.

James



^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 21:47                       ` Greg Freemyer
  2010-04-21 21:56                         ` James Bottomley
@ 2010-04-21 21:59                         ` Mark Lord
  1 sibling, 0 replies; 53+ messages in thread
From: Mark Lord @ 2010-04-21 21:59 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Ric Wheeler, sandeen, Eric Sandeen, Jeff Moyer, Lukas Czerner,
	linux-ext4, Edward Shishkin, Christoph Hellwig, James Bottomley

On 21/04/10 05:47 PM, Greg Freemyer wrote:
>
> The ATA-8 spec. supports vectored trims and requires a minimum of 255
> sectors worth of range payload be supported.  That equates to a single
> trim being able to trim thousands of ranges in one command.
>
> Mark Lord has benchmarked in found a vectored trim to be drastically
> faster than calling trim individually for each of those ranges.
..

Does anyone have an Intel-based SSD they could spare?  :)

Rumour has it they they do not comply with the ATA-8 spec for TRIM,
in that they don't support more than one "sector" of range data.
The SSDs I have here all support much more than that.

Still, even if we just do a single 512-byte payload of TRIM ranges,
it would be a huge win.

Cheers

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-21 20:44                 ` Greg Freemyer
  2010-04-21 20:53                   ` Greg Freemyer
  2010-04-21 21:01                   ` Eric Sandeen
@ 2010-04-23  8:23                   ` Lukas Czerner
  2010-04-24 13:24                     ` Greg Freemyer
  2010-04-26 16:55                     ` Jan Kara
  2 siblings, 2 replies; 53+ messages in thread
From: Lukas Czerner @ 2010-04-23  8:23 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Jeff Moyer, Ric Wheeler, Eric Sandeen, Mark Lord, Lukas Czerner,
	linux-ext4, Edward Shishkin, Eric Sandeen, Christoph Hellwig

[-- Attachment #1: Type: TEXT/PLAIN, Size: 5024 bytes --]

On Wed, 21 Apr 2010, Greg Freemyer wrote:

> >>>>> Mmm.. If that's what it is doing, then this patch set would be a
> >>>>> complete disaster.
> >>>>> It would take *hours* to do the initial TRIM.
> >
> > Except it doesn't.  Lukas did provide numbers in his original email.
> >
> 
> Looking at the benchmarks (for the first time) at
> http://people.redhat.com/jmoyer/discard/ext4_batched_discard/
> 
> I don't see anything that says how long the proposed trim ioctl takes
> to complete on the full filesystem.

Well, it strongly depends on how is the file system fragmented. On the
fresh file system (147G) the initial ioctl takes 2 seconds to finish (it 
may be worth to mention that on another SSD (111G) it takes 56s). I will
try to get some numbers for the "usual" file system (not full, not
fresh).

> 
> What they do show is that with the 3 test SSDs used for this
> benchmark, the current released discard implementation is a net loss.
> ie. You are better off running without the discards for all 3 vendors.
>  (at least under the conditions tested.)
> 
> After the patch is applied and optimizing the discards to large free
> extents only, it works out to same performance with or without the
> discards.  ie. no net gain or loss.
> 
> That is extremely cool because one assumes that the non-discard case
> would degrade over time, but that the discard case will not.
> 
> So that argues for the current proposed patch going in.
> 
> But quoting from the first email:
> 
> ==
> The basic idea behind my discard support is to create an ioctl which
> walks through all the free extents in each allocating group and discard
> those extents. As an addition to improve its performance one can specify
> minimum free extent length, so ioctl will not bother with shorter extents.
> 
> This of course means, that with each invocation the ioctl must walk
> through whole file system, checking and discarding free extents, which
> is not very efficient. The best way to avoid this is to keep track of
> deleted (freed) blocks. Then the ioctl have to trim just those free
> extents which were recently freed.
> 
> In order to implement this I have added new bitmap into ext4_group_info
> (bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
> walk through bb_bitmap_deleted, compare deleted extents with free
> extents trim them and then removes it from the bb_bitmap_deleted.
> 
> But you may notice, that there is one problem. bb_bitmap_deleted does
> not survive umount. To bypass the problem the first ioctl call have to
> walk through whole file system trimming all free extents. But there is a
> better solution to this problem. The bb_bitmap_deleted can be stored on
> disk an can be restored in mount time along with other bitmaps, but I
> think it is a quite big change and should be discussed further.
> ==
> 
> The above seems to argue against the patch going in until the
> mount/umount issues are addressed.

I do not know much about how production system is being used, but I
doubt this is that big issue. Sure the initial ioctl takes long to
finish and there is a place for improvement, there was a proposal to
do the initial trim at mount time. I do not think that it is wise,
why to block mount, when the trim can be run at background when the fs
is mounted ? Of course there will be some performance loss, while ioctl
will be in progress, but it will not block.

There are also another way to overcome this problem. We can assure that
the file system is left trimmed after umount. To do this, we can simply
trim the fs at umount time. I think this won't be any problem and we even
do not prolong the umount time too much, because we will not trim whole
fs, but just recently freed blocks.

This of course bring another problem, when the system is not properly
terminated and the umount is not properly finished (or done at all). But
this can be solved in fsck at boot time I think. This will entirely
eliminate the need to trim the whole fs (except the fsck obviously),
since it is done when fs is created.

> 
> So in addition to this patch, Lukas is proposing a on disk change to
> address the fact that calling trim upteen times at mount time is too
> slow.
> 
> Per Mark's testing of last summer, an alternative solution is to use a
> vectored trim approach that is far more efficient.

Vectored trim will be great, I did not tested anything like that but
obviously it will strongly reduce time needed to trim the fs. But we do
not have this support just yet.

> 
> Mark's benchmarks showed this as doable in seconds which seems like a
> reasonable amount of time for a mount time operation.
> 
> Greg
> 

And also, currently I am rewriting the patch do use rbtree instead of the
bitmap, because there were some concerns of memory consumption. It is a
question whether or not the rbtree will be more memory friendly.
Generally I think that in most "normal" cases it will, but there are some
extreme scenarios, where the rbtree will be much worse. Any comment on
this ?

Thanks.
-Lukas

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-23  8:23                   ` Lukas Czerner
@ 2010-04-24 13:24                     ` Greg Freemyer
  2010-04-24 13:48                       ` Ric Wheeler
  2010-04-24 18:39                       ` Martin K. Petersen
  2010-04-26 16:55                     ` Jan Kara
  1 sibling, 2 replies; 53+ messages in thread
From: Greg Freemyer @ 2010-04-24 13:24 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Jeff Moyer, Ric Wheeler, Eric Sandeen, Mark Lord, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

On Fri, Apr 23, 2010 at 4:23 AM, Lukas Czerner <lczerner@redhat.com> wrote:
> On Wed, 21 Apr 2010, Greg Freemyer wrote:
>
>> >>>>> Mmm.. If that's what it is doing, then this patch set would be a
>> >>>>> complete disaster.
>> >>>>> It would take *hours* to do the initial TRIM.
>> >
>> > Except it doesn't.  Lukas did provide numbers in his original email.
>> >
>>
>> Looking at the benchmarks (for the first time) at
>> http://people.redhat.com/jmoyer/discard/ext4_batched_discard/
>>
>> I don't see anything that says how long the proposed trim ioctl takes
>> to complete on the full filesystem.
>
> Well, it strongly depends on how is the file system fragmented. On the
> fresh file system (147G) the initial ioctl takes 2 seconds to finish (it
> may be worth to mention that on another SSD (111G) it takes 56s). I will
> try to get some numbers for the "usual" file system (not full, not
> fresh).
>
>>
>> What they do show is that with the 3 test SSDs used for this
>> benchmark, the current released discard implementation is a net loss.
>> ie. You are better off running without the discards for all 3 vendors.
>>  (at least under the conditions tested.)
>>
>> After the patch is applied and optimizing the discards to large free
>> extents only, it works out to same performance with or without the
>> discards.  ie. no net gain or loss.
>>
>> That is extremely cool because one assumes that the non-discard case
>> would degrade over time, but that the discard case will not.
>>
>> So that argues for the current proposed patch going in.
>>
>> But quoting from the first email:
>>
>> ==
>> The basic idea behind my discard support is to create an ioctl which
>> walks through all the free extents in each allocating group and discard
>> those extents. As an addition to improve its performance one can specify
>> minimum free extent length, so ioctl will not bother with shorter extents.
>>
>> This of course means, that with each invocation the ioctl must walk
>> through whole file system, checking and discarding free extents, which
>> is not very efficient. The best way to avoid this is to keep track of
>> deleted (freed) blocks. Then the ioctl have to trim just those free
>> extents which were recently freed.
>>
>> In order to implement this I have added new bitmap into ext4_group_info
>> (bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
>> walk through bb_bitmap_deleted, compare deleted extents with free
>> extents trim them and then removes it from the bb_bitmap_deleted.
>>
>> But you may notice, that there is one problem. bb_bitmap_deleted does
>> not survive umount. To bypass the problem the first ioctl call have to
>> walk through whole file system trimming all free extents. But there is a
>> better solution to this problem. The bb_bitmap_deleted can be stored on
>> disk an can be restored in mount time along with other bitmaps, but I
>> think it is a quite big change and should be discussed further.
>> ==
>>
>> The above seems to argue against the patch going in until the
>> mount/umount issues are addressed.
>
> I do not know much about how production system is being used, but I
> doubt this is that big issue. Sure the initial ioctl takes long to
> finish and there is a place for improvement, there was a proposal to
> do the initial trim at mount time. I do not think that it is wise,
> why to block mount, when the trim can be run at background when the fs
> is mounted ? Of course there will be some performance loss, while ioctl
> will be in progress, but it will not block.
>
> There are also another way to overcome this problem. We can assure that
> the file system is left trimmed after umount. To do this, we can simply
> trim the fs at umount time. I think this won't be any problem and we even
> do not prolong the umount time too much, because we will not trim whole
> fs, but just recently freed blocks.
>
> This of course bring another problem, when the system is not properly
> terminated and the umount is not properly finished (or done at all). But
> this can be solved in fsck at boot time I think. This will entirely
> eliminate the need to trim the whole fs (except the fsck obviously),
> since it is done when fs is created.
>
>>
>> So in addition to this patch, Lukas is proposing a on disk change to
>> address the fact that calling trim upteen times at mount time is too
>> slow.
>>
>> Per Mark's testing of last summer, an alternative solution is to use a
>> vectored trim approach that is far more efficient.
>
> Vectored trim will be great, I did not tested anything like that but
> obviously it will strongly reduce time needed to trim the fs. But we do
> not have this support just yet.
>
>>
>> Mark's benchmarks showed this as doable in seconds which seems like a
>> reasonable amount of time for a mount time operation.
>>
>> Greg
>>
>
>
> And also, currently I am rewriting the patch do use rbtree instead of the
> bitmap, because there were some concerns of memory consumption. It is a
> question whether or not the rbtree will be more memory friendly.
> Generally I think that in most "normal" cases it will, but there are some
> extreme scenarios, where the rbtree will be much worse. Any comment on
> this ?

I know I've been arguing against this patch for the single SSD case
and I still think that use case should be handled by userspace as
hdparm/wiper.sh currently does.  In particular for those extreme
scenarios with JBOD SSDs, the user space solution wins because it
knows how to optimize the trim calls via vectorized ranges in the
payload.

Thus I think the community and distro's should be testing that pair
and pushing it out in the distro's for typical laptop use.

But, that still leaves high-end external raid arrays, mdraid, and lvm
unaddressed.

Those use cases will likely benefit from the approach this patch takes
the most.  In particular, mdraid with raid 5/6  requires an approach
like this patch provides, or it has to create its own in kernel
discard aggregator which seems like a waste of time.

In general, those use cases have large minimum discard units.  Thus I
think this patch should be tuned to work with large discard units and
ignore small ones.  That means it needs to get the underlying block
layer topology and ignore unused space smaller than underlying layers
minimum discard unit.  That should allow a rb tree to be used and
eliminate the extreme scenarios.  (ie. I assume your extreme scenarios
involve large numbers of very small unused ranges.)

That may mean the topology information needs to grow some discard
info.  Does anyone know if that info is easily derived from the
currently existing topo info?

Greg






-- 
Greg Freemyer
Head of EDD Tape Extraction and Processing team
Litigation Triage Solutions Specialist
http://www.linkedin.com/in/gregfreemyer
CNN/TruTV Aired Forensic Imaging Demo -
   http://insession.blogs.cnn.com/2010/03/23/how-computer-evidence-gets-retrieved/

The Norcross Group
The Intersection of Evidence & Technology
http://www.norcrossgroup.com
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 13:24                     ` Greg Freemyer
@ 2010-04-24 13:48                       ` Ric Wheeler
  2010-04-24 14:30                         ` Greg Freemyer
  2010-04-24 18:39                       ` Martin K. Petersen
  1 sibling, 1 reply; 53+ messages in thread
From: Ric Wheeler @ 2010-04-24 13:48 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Lukas Czerner, Jeff Moyer, Eric Sandeen, Mark Lord, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

On 04/24/2010 09:24 AM, Greg Freemyer wrote:
> On Fri, Apr 23, 2010 at 4:23 AM, Lukas Czerner<lczerner@redhat.com>  wrote:
>    
>> On Wed, 21 Apr 2010, Greg Freemyer wrote:
>>
>>      
>>>>>>>> Mmm.. If that's what it is doing, then this patch set would be a
>>>>>>>> complete disaster.
>>>>>>>> It would take *hours* to do the initial TRIM.
>>>>>>>>                  
>>>> Except it doesn't.  Lukas did provide numbers in his original email.
>>>>
>>>>          
>>> Looking at the benchmarks (for the first time) at
>>> http://people.redhat.com/jmoyer/discard/ext4_batched_discard/
>>>
>>> I don't see anything that says how long the proposed trim ioctl takes
>>> to complete on the full filesystem.
>>>        
>> Well, it strongly depends on how is the file system fragmented. On the
>> fresh file system (147G) the initial ioctl takes 2 seconds to finish (it
>> may be worth to mention that on another SSD (111G) it takes 56s). I will
>> try to get some numbers for the "usual" file system (not full, not
>> fresh).
>>
>>      
>>> What they do show is that with the 3 test SSDs used for this
>>> benchmark, the current released discard implementation is a net loss.
>>> ie. You are better off running without the discards for all 3 vendors.
>>>   (at least under the conditions tested.)
>>>
>>> After the patch is applied and optimizing the discards to large free
>>> extents only, it works out to same performance with or without the
>>> discards.  ie. no net gain or loss.
>>>
>>> That is extremely cool because one assumes that the non-discard case
>>> would degrade over time, but that the discard case will not.
>>>
>>> So that argues for the current proposed patch going in.
>>>
>>> But quoting from the first email:
>>>
>>> ==
>>> The basic idea behind my discard support is to create an ioctl which
>>> walks through all the free extents in each allocating group and discard
>>> those extents. As an addition to improve its performance one can specify
>>> minimum free extent length, so ioctl will not bother with shorter extents.
>>>
>>> This of course means, that with each invocation the ioctl must walk
>>> through whole file system, checking and discarding free extents, which
>>> is not very efficient. The best way to avoid this is to keep track of
>>> deleted (freed) blocks. Then the ioctl have to trim just those free
>>> extents which were recently freed.
>>>
>>> In order to implement this I have added new bitmap into ext4_group_info
>>> (bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
>>> walk through bb_bitmap_deleted, compare deleted extents with free
>>> extents trim them and then removes it from the bb_bitmap_deleted.
>>>
>>> But you may notice, that there is one problem. bb_bitmap_deleted does
>>> not survive umount. To bypass the problem the first ioctl call have to
>>> walk through whole file system trimming all free extents. But there is a
>>> better solution to this problem. The bb_bitmap_deleted can be stored on
>>> disk an can be restored in mount time along with other bitmaps, but I
>>> think it is a quite big change and should be discussed further.
>>> ==
>>>
>>> The above seems to argue against the patch going in until the
>>> mount/umount issues are addressed.
>>>        
>> I do not know much about how production system is being used, but I
>> doubt this is that big issue. Sure the initial ioctl takes long to
>> finish and there is a place for improvement, there was a proposal to
>> do the initial trim at mount time. I do not think that it is wise,
>> why to block mount, when the trim can be run at background when the fs
>> is mounted ? Of course there will be some performance loss, while ioctl
>> will be in progress, but it will not block.
>>
>> There are also another way to overcome this problem. We can assure that
>> the file system is left trimmed after umount. To do this, we can simply
>> trim the fs at umount time. I think this won't be any problem and we even
>> do not prolong the umount time too much, because we will not trim whole
>> fs, but just recently freed blocks.
>>
>> This of course bring another problem, when the system is not properly
>> terminated and the umount is not properly finished (or done at all). But
>> this can be solved in fsck at boot time I think. This will entirely
>> eliminate the need to trim the whole fs (except the fsck obviously),
>> since it is done when fs is created.
>>
>>      
>>> So in addition to this patch, Lukas is proposing a on disk change to
>>> address the fact that calling trim upteen times at mount time is too
>>> slow.
>>>
>>> Per Mark's testing of last summer, an alternative solution is to use a
>>> vectored trim approach that is far more efficient.
>>>        
>> Vectored trim will be great, I did not tested anything like that but
>> obviously it will strongly reduce time needed to trim the fs. But we do
>> not have this support just yet.
>>
>>      
>>> Mark's benchmarks showed this as doable in seconds which seems like a
>>> reasonable amount of time for a mount time operation.
>>>
>>> Greg
>>>
>>>        
>>
>> And also, currently I am rewriting the patch do use rbtree instead of the
>> bitmap, because there were some concerns of memory consumption. It is a
>> question whether or not the rbtree will be more memory friendly.
>> Generally I think that in most "normal" cases it will, but there are some
>> extreme scenarios, where the rbtree will be much worse. Any comment on
>> this ?
>>      
> I know I've been arguing against this patch for the single SSD case
> and I still think that use case should be handled by userspace as
> hdparm/wiper.sh currently does.  In particular for those extreme
> scenarios with JBOD SSDs, the user space solution wins because it
> knows how to optimize the trim calls via vectorized ranges in the
> payload.
>    

I think that you have missed the broader point. This is not on by 
default, so you can mount without discard and use whatever user space 
utility you like at your discretion.

ric

> Thus I think the community and distro's should be testing that pair
> and pushing it out in the distro's for typical laptop use.
>
> But, that still leaves high-end external raid arrays, mdraid, and lvm
> unaddressed.
>
> Those use cases will likely benefit from the approach this patch takes
> the most.  In particular, mdraid with raid 5/6  requires an approach
> like this patch provides, or it has to create its own in kernel
> discard aggregator which seems like a waste of time.
>
> In general, those use cases have large minimum discard units.  Thus I
> think this patch should be tuned to work with large discard units and
> ignore small ones.  That means it needs to get the underlying block
> layer topology and ignore unused space smaller than underlying layers
> minimum discard unit.  That should allow a rb tree to be used and
> eliminate the extreme scenarios.  (ie. I assume your extreme scenarios
> involve large numbers of very small unused ranges.)
>
> That may mean the topology information needs to grow some discard
> info.  Does anyone know if that info is easily derived from the
> currently existing topo info?
>
> Greg
>
>
>
>
>
>
>    


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 13:48                       ` Ric Wheeler
@ 2010-04-24 14:30                         ` Greg Freemyer
  2010-04-24 14:43                           ` Eric Sandeen
  0 siblings, 1 reply; 53+ messages in thread
From: Greg Freemyer @ 2010-04-24 14:30 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: Lukas Czerner, Jeff Moyer, Eric Sandeen, Mark Lord, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

On Sat, Apr 24, 2010 at 9:48 AM, Ric Wheeler <rwheeler@redhat.com> wrote:
> On 04/24/2010 09:24 AM, Greg Freemyer wrote:
>>
>> On Fri, Apr 23, 2010 at 4:23 AM, Lukas Czerner<lczerner@redhat.com>
>>  wrote:
>>
>>>
>>> On Wed, 21 Apr 2010, Greg Freemyer wrote:
>>>
>>>
>>>>>>>>>
>>>>>>>>> Mmm.. If that's what it is doing, then this patch set would be a
>>>>>>>>> complete disaster.
>>>>>>>>> It would take *hours* to do the initial TRIM.
>>>>>>>>>
>>>>>
>>>>> Except it doesn't.  Lukas did provide numbers in his original email.
>>>>>
>>>>>
>>>>
>>>> Looking at the benchmarks (for the first time) at
>>>> http://people.redhat.com/jmoyer/discard/ext4_batched_discard/
>>>>
>>>> I don't see anything that says how long the proposed trim ioctl takes
>>>> to complete on the full filesystem.
>>>>
>>>
>>> Well, it strongly depends on how is the file system fragmented. On the
>>> fresh file system (147G) the initial ioctl takes 2 seconds to finish (it
>>> may be worth to mention that on another SSD (111G) it takes 56s). I will
>>> try to get some numbers for the "usual" file system (not full, not
>>> fresh).
>>>
>>>
>>>>
>>>> What they do show is that with the 3 test SSDs used for this
>>>> benchmark, the current released discard implementation is a net loss.
>>>> ie. You are better off running without the discards for all 3 vendors.
>>>>  (at least under the conditions tested.)
>>>>
>>>> After the patch is applied and optimizing the discards to large free
>>>> extents only, it works out to same performance with or without the
>>>> discards.  ie. no net gain or loss.
>>>>
>>>> That is extremely cool because one assumes that the non-discard case
>>>> would degrade over time, but that the discard case will not.
>>>>
>>>> So that argues for the current proposed patch going in.
>>>>
>>>> But quoting from the first email:
>>>>
>>>> ==
>>>> The basic idea behind my discard support is to create an ioctl which
>>>> walks through all the free extents in each allocating group and discard
>>>> those extents. As an addition to improve its performance one can specify
>>>> minimum free extent length, so ioctl will not bother with shorter
>>>> extents.
>>>>
>>>> This of course means, that with each invocation the ioctl must walk
>>>> through whole file system, checking and discarding free extents, which
>>>> is not very efficient. The best way to avoid this is to keep track of
>>>> deleted (freed) blocks. Then the ioctl have to trim just those free
>>>> extents which were recently freed.
>>>>
>>>> In order to implement this I have added new bitmap into ext4_group_info
>>>> (bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
>>>> walk through bb_bitmap_deleted, compare deleted extents with free
>>>> extents trim them and then removes it from the bb_bitmap_deleted.
>>>>
>>>> But you may notice, that there is one problem. bb_bitmap_deleted does
>>>> not survive umount. To bypass the problem the first ioctl call have to
>>>> walk through whole file system trimming all free extents. But there is a
>>>> better solution to this problem. The bb_bitmap_deleted can be stored on
>>>> disk an can be restored in mount time along with other bitmaps, but I
>>>> think it is a quite big change and should be discussed further.
>>>> ==
>>>>
>>>> The above seems to argue against the patch going in until the
>>>> mount/umount issues are addressed.
>>>>
>>>
>>> I do not know much about how production system is being used, but I
>>> doubt this is that big issue. Sure the initial ioctl takes long to
>>> finish and there is a place for improvement, there was a proposal to
>>> do the initial trim at mount time. I do not think that it is wise,
>>> why to block mount, when the trim can be run at background when the fs
>>> is mounted ? Of course there will be some performance loss, while ioctl
>>> will be in progress, but it will not block.
>>>
>>> There are also another way to overcome this problem. We can assure that
>>> the file system is left trimmed after umount. To do this, we can simply
>>> trim the fs at umount time. I think this won't be any problem and we even
>>> do not prolong the umount time too much, because we will not trim whole
>>> fs, but just recently freed blocks.
>>>
>>> This of course bring another problem, when the system is not properly
>>> terminated and the umount is not properly finished (or done at all). But
>>> this can be solved in fsck at boot time I think. This will entirely
>>> eliminate the need to trim the whole fs (except the fsck obviously),
>>> since it is done when fs is created.
>>>
>>>
>>>>
>>>> So in addition to this patch, Lukas is proposing a on disk change to
>>>> address the fact that calling trim upteen times at mount time is too
>>>> slow.
>>>>
>>>> Per Mark's testing of last summer, an alternative solution is to use a
>>>> vectored trim approach that is far more efficient.
>>>>
>>>
>>> Vectored trim will be great, I did not tested anything like that but
>>> obviously it will strongly reduce time needed to trim the fs. But we do
>>> not have this support just yet.
>>>
>>>
>>>>
>>>> Mark's benchmarks showed this as doable in seconds which seems like a
>>>> reasonable amount of time for a mount time operation.
>>>>
>>>> Greg
>>>>
>>>>
>>>
>>> And also, currently I am rewriting the patch do use rbtree instead of the
>>> bitmap, because there were some concerns of memory consumption. It is a
>>> question whether or not the rbtree will be more memory friendly.
>>> Generally I think that in most "normal" cases it will, but there are some
>>> extreme scenarios, where the rbtree will be much worse. Any comment on
>>> this ?
>>>
>>
>> I know I've been arguing against this patch for the single SSD case
>> and I still think that use case should be handled by userspace as
>> hdparm/wiper.sh currently does.  In particular for those extreme
>> scenarios with JBOD SSDs, the user space solution wins because it
>> knows how to optimize the trim calls via vectorized ranges in the
>> payload.
>>
>
> I think that you have missed the broader point. This is not on by default,
> so you can mount without discard and use whatever user space utility you
> like at your discretion.
>
> ric

Ric,

I was trying to say the design should be driven by the large discard
range use case, not the potentially pathological small discard range
use case that would only benefit SSDs.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 14:30                         ` Greg Freemyer
@ 2010-04-24 14:43                           ` Eric Sandeen
  2010-04-24 15:03                             ` Greg Freemyer
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Sandeen @ 2010-04-24 14:43 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Ric Wheeler, Lukas Czerner, Jeff Moyer, Mark Lord, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

Greg Freemyer wrote:
> On Sat, Apr 24, 2010 at 9:48 AM, Ric Wheeler <rwheeler@redhat.com> wrote:
>> On 04/24/2010 09:24 AM, Greg Freemyer wrote:

...

>>> I know I've been arguing against this patch for the single SSD case
>>> and I still think that use case should be handled by userspace as
>>> hdparm/wiper.sh currently does.  In particular for those extreme
>>> scenarios with JBOD SSDs, the user space solution wins because it
>>> knows how to optimize the trim calls via vectorized ranges in the
>>> payload.
>>>
>> I think that you have missed the broader point. This is not on by default,
>> so you can mount without discard and use whatever user space utility you
>> like at your discretion.
>>
>> ric
> 
> Ric,
> 
> I was trying to say the design should be driven by the large discard
> range use case, not the potentially pathological small discard range
> use case that would only benefit SSDs.
> 
> Greg

Bear in mind that this patch makes the discard range requests substantially
-larger- than what mount -o discard does on ext4 today, in fact that was
a main goal.

If the kernel could assemble vectors of ranges and pass them down, I think it
could be extended to use that as well.

-Eric

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 14:43                           ` Eric Sandeen
@ 2010-04-24 15:03                             ` Greg Freemyer
  2010-04-24 17:04                               ` Ric Wheeler
  0 siblings, 1 reply; 53+ messages in thread
From: Greg Freemyer @ 2010-04-24 15:03 UTC (permalink / raw)
  To: Eric Sandeen
  Cc: Ric Wheeler, Lukas Czerner, Jeff Moyer, Mark Lord, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

On Sat, Apr 24, 2010 at 10:43 AM, Eric Sandeen <sandeen@redhat.com> wrote:
> Greg Freemyer wrote:
>> On Sat, Apr 24, 2010 at 9:48 AM, Ric Wheeler <rwheeler@redhat.com> wrote:
>>> On 04/24/2010 09:24 AM, Greg Freemyer wrote:
>
> ...
>
>>>> I know I've been arguing against this patch for the single SSD case
>>>> and I still think that use case should be handled by userspace as
>>>> hdparm/wiper.sh currently does.  In particular for those extreme
>>>> scenarios with JBOD SSDs, the user space solution wins because it
>>>> knows how to optimize the trim calls via vectorized ranges in the
>>>> payload.
>>>>
>>> I think that you have missed the broader point. This is not on by default,
>>> so you can mount without discard and use whatever user space utility you
>>> like at your discretion.
>>>
>>> ric
>>
>> Ric,
>>
>> I was trying to say the design should be driven by the large discard
>> range use case, not the potentially pathological small discard range
>> use case that would only benefit SSDs.
>>
>> Greg
>
> Bear in mind that this patch makes the discard range requests substantially
> -larger- than what mount -o discard does on ext4 today, in fact that was
> a main goal.
>
> If the kernel could assemble vectors of ranges and pass them down, I think it
> could be extended to use that as well.
>
> -Eric
>
Eric/Ric,

I was responding to the Lukas latest post which stated:

==
And also, currently I am rewriting the patch do use rbtree instead of the
bitmap, because there were some concerns of memory consumption. It is a
question whether or not the rbtree will be more memory friendly.
Generally I think that in most "normal" cases it will, but there are some
extreme scenarios, where the rbtree will be much worse. Any comment on
this ?
==

If one optimizes for large discard ranges and ignores the pathological
cases only beneficial to SSDs, then a rbtree wins.

Correct?

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 15:03                             ` Greg Freemyer
@ 2010-04-24 17:04                               ` Ric Wheeler
  2010-04-24 18:30                                 ` Greg Freemyer
  0 siblings, 1 reply; 53+ messages in thread
From: Ric Wheeler @ 2010-04-24 17:04 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Eric Sandeen, Lukas Czerner, Jeff Moyer, Mark Lord, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

On 04/24/2010 11:03 AM, Greg Freemyer wrote:
> On Sat, Apr 24, 2010 at 10:43 AM, Eric Sandeen<sandeen@redhat.com>  wrote:
>    
>> Greg Freemyer wrote:
>>      
>>> On Sat, Apr 24, 2010 at 9:48 AM, Ric Wheeler<rwheeler@redhat.com>  wrote:
>>>        
>>>> On 04/24/2010 09:24 AM, Greg Freemyer wrote:
>>>>          
>> ...
>>
>>      
>>>>> I know I've been arguing against this patch for the single SSD case
>>>>> and I still think that use case should be handled by userspace as
>>>>> hdparm/wiper.sh currently does.  In particular for those extreme
>>>>> scenarios with JBOD SSDs, the user space solution wins because it
>>>>> knows how to optimize the trim calls via vectorized ranges in the
>>>>> payload.
>>>>>
>>>>>            
>>>> I think that you have missed the broader point. This is not on by default,
>>>> so you can mount without discard and use whatever user space utility you
>>>> like at your discretion.
>>>>
>>>> ric
>>>>          
>>> Ric,
>>>
>>> I was trying to say the design should be driven by the large discard
>>> range use case, not the potentially pathological small discard range
>>> use case that would only benefit SSDs.
>>>
>>> Greg
>>>        
>> Bear in mind that this patch makes the discard range requests substantially
>> -larger- than what mount -o discard does on ext4 today, in fact that was
>> a main goal.
>>
>> If the kernel could assemble vectors of ranges and pass them down, I think it
>> could be extended to use that as well.
>>
>> -Eric
>>
>>      
> Eric/Ric,
>
> I was responding to the Lukas latest post which stated:
>
> ==
> And also, currently I am rewriting the patch do use rbtree instead of the
> bitmap, because there were some concerns of memory consumption. It is a
> question whether or not the rbtree will be more memory friendly.
> Generally I think that in most "normal" cases it will, but there are some
> extreme scenarios, where the rbtree will be much worse. Any comment on
> this ?
> ==
>
> If one optimizes for large discard ranges and ignores the pathological
> cases only beneficial to SSDs, then a rbtree wins.
>
> Correct?
>
> Greg
>    

Let's summarize.

1. Everyone agrees that doing larger discard instead of little discards 
is a good thing.

2. Some devices care about this more than others (various types of SSD's 
and arrays have different designs and performance with discards). Some 
devices do small discards well, others don't.

3. How you get to those bigger discards in our implementation - using a 
series of single range requests, using vectored requests, tracking 
extents that can be combined in an rbtree or not - is something that we 
are working on. Using rbtrees versus a bitmap efficiency is about DRAM 
consumption, not performance of the resulting discard on the target.

4. Devices (some devices) can export their preferences in a standard way 
(look in /sys/block/....).

If you want to influence the code, please do try the various options on 
devices you have at hand and report results.  That is what we are doing 
(we includes Lukas, Eric, Jeff and others on this thread) will real 
devices from vendors that have given us access. We are talking to them 
directly and trying out different work loads but certainly welcome real 
world results and suggestions.

Thanks!

Ric



^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 17:04                               ` Ric Wheeler
@ 2010-04-24 18:30                                 ` Greg Freemyer
  2010-04-24 18:41                                   ` Ric Wheeler
  2010-04-24 19:06                                   ` Martin K. Petersen
  0 siblings, 2 replies; 53+ messages in thread
From: Greg Freemyer @ 2010-04-24 18:30 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: Eric Sandeen, Lukas Czerner, Jeff Moyer, Mark Lord, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

On Sat, Apr 24, 2010 at 1:04 PM, Ric Wheeler <rwheeler@redhat.com> wrote:
<snip>
>
> Let's summarize.
>
> 1. Everyone agrees that doing larger discard instead of little discards is a
> good thing.
>
> 2. Some devices care about this more than others (various types of SSD's and
> arrays have different designs and performance with discards). Some devices
> do small discards well, others don't.
>
> 3. How you get to those bigger discards in our implementation - using a
> series of single range requests, using vectored requests, tracking extents
> that can be combined in an rbtree or not - is something that we are working
> on. Using rbtrees versus a bitmap efficiency is about DRAM consumption, not
> performance of the resulting discard on the target.
>
> 4. Devices (some devices) can export their preferences in a standard way
> (look in /sys/block/....).
>
> If you want to influence the code, please do try the various options on
> devices you have at hand and report results.  That is what we are doing (we
> includes Lukas, Eric, Jeff and others on this thread) will real devices from
> vendors that have given us access. We are talking to them directly and
> trying out different work loads but certainly welcome real world results and
> suggestions.
>
> Thanks!
>
> Ric

Ric,

Is it also agreed by all that the current ext4 kernel implementation
of calling discard is a poor solution for most hardware / block layers
stacks / workloads and therefore is not worth retaining nor performing
further benchmarks?

I've not seen anyone arguing to keep the current kernel implementation
and I for one accept the previously posted benchmarks that show it is
not adding any value relative to the traditional non-discard case.

Therefore benchmarks between the current hdparm/wiper.sh userspace
implementation and a proposed new kernel implementation would be the
most beneficial?

I have yet to see any of those benchmarks posted.

Greg



-- 
Greg Freemyer
Head of EDD Tape Extraction and Processing team
Litigation Triage Solutions Specialist
http://www.linkedin.com/in/gregfreemyer
CNN/TruTV Aired Forensic Imaging Demo -
   http://insession.blogs.cnn.com/2010/03/23/how-computer-evidence-gets-retrieved/

The Norcross Group
The Intersection of Evidence & Technology
http://www.norcrossgroup.com
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 13:24                     ` Greg Freemyer
  2010-04-24 13:48                       ` Ric Wheeler
@ 2010-04-24 18:39                       ` Martin K. Petersen
  1 sibling, 0 replies; 53+ messages in thread
From: Martin K. Petersen @ 2010-04-24 18:39 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Lukas Czerner, Jeff Moyer, Ric Wheeler, Eric Sandeen, Mark Lord,
	linux-ext4, Edward Shishkin, Eric Sandeen, Christoph Hellwig

>>>>> "Greg" == Greg Freemyer <greg.freemyer@gmail.com> writes:

Greg> That may mean the topology information needs to grow some discard
Greg> info.  Does anyone know if that info is easily derived from the
Greg> currently existing topo info?

The values are already part of the topology information (assuming the
device reports them).  Went into 2.6.33.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 18:30                                 ` Greg Freemyer
@ 2010-04-24 18:41                                   ` Ric Wheeler
  2010-04-26 14:00                                     ` Mark Lord
  2010-04-24 19:06                                   ` Martin K. Petersen
  1 sibling, 1 reply; 53+ messages in thread
From: Ric Wheeler @ 2010-04-24 18:41 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Eric Sandeen, Lukas Czerner, Jeff Moyer, Mark Lord, linux-ext4,
	Edward Shishkin, Eric Sandeen, Christoph Hellwig

On 04/24/2010 02:30 PM, Greg Freemyer wrote:
> On Sat, Apr 24, 2010 at 1:04 PM, Ric Wheeler<rwheeler@redhat.com>  wrote:
> <snip>
>    
>> Let's summarize.
>>
>> 1. Everyone agrees that doing larger discard instead of little discards is a
>> good thing.
>>
>> 2. Some devices care about this more than others (various types of SSD's and
>> arrays have different designs and performance with discards). Some devices
>> do small discards well, others don't.
>>
>> 3. How you get to those bigger discards in our implementation - using a
>> series of single range requests, using vectored requests, tracking extents
>> that can be combined in an rbtree or not - is something that we are working
>> on. Using rbtrees versus a bitmap efficiency is about DRAM consumption, not
>> performance of the resulting discard on the target.
>>
>> 4. Devices (some devices) can export their preferences in a standard way
>> (look in /sys/block/....).
>>
>> If you want to influence the code, please do try the various options on
>> devices you have at hand and report results.  That is what we are doing (we
>> includes Lukas, Eric, Jeff and others on this thread) will real devices from
>> vendors that have given us access. We are talking to them directly and
>> trying out different work loads but certainly welcome real world results and
>> suggestions.
>>
>> Thanks!
>>
>> Ric
>>      
> Ric,
>
> Is it also agreed by all that the current ext4 kernel implementation
> of calling discard is a poor solution for most hardware / block layers
> stacks / workloads and therefore is not worth retaining nor performing
> further benchmarks?
>
> I've not seen anyone arguing to keep the current kernel implementation
> and I for one accept the previously posted benchmarks that show it is
> not adding any value relative to the traditional non-discard case.
>
> Therefore benchmarks between the current hdparm/wiper.sh userspace
> implementation and a proposed new kernel implementation would be the
> most beneficial?
>
> I have yet to see any of those benchmarks posted.
>
> Greg
>    

Greg,

I don't like the user space wiper.sh approach in general, but running 
wiper.sh is entirely your choice.

Most users prefer having the file system and the IO stack take care of 
this for them, but again, entirely configurable.

The benchmarks we have done are often done on hardware that is under NDA 
so we cannot publish results across the board.

Feel free to run on hardware that you buy and share the results.

Thanks!

Ric


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 18:30                                 ` Greg Freemyer
  2010-04-24 18:41                                   ` Ric Wheeler
@ 2010-04-24 19:06                                   ` Martin K. Petersen
  2010-04-26 14:03                                     ` Mark Lord
  1 sibling, 1 reply; 53+ messages in thread
From: Martin K. Petersen @ 2010-04-24 19:06 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Ric Wheeler, Eric Sandeen, Lukas Czerner, Jeff Moyer, Mark Lord,
	linux-ext4, Edward Shishkin, Eric Sandeen, Christoph Hellwig

>>>>> "Greg" == Greg Freemyer <greg.freemyer@gmail.com> writes:

Greg> Is it also agreed by all that the current ext4 kernel
Greg> implementation of calling discard is a poor solution for most
Greg> hardware / block layers stacks / workloads and therefore is not
Greg> worth retaining nor performing further benchmarks?

Our discard implementation is meant to accommodate a wide range of
devices.  Just because some of the currently shipping low-end consumer
SSDs implement TRIM poorly does not mean we're going to scrap what we
have.

We are not in the business of designing for the past.  Especially not
when the past can be handled by a shell script.

For future devices TRIM/UNMAP is going to be an inherent part of the
command set.  And that's what our kernel support is aimed at.  There are
some deficiencies right now because the block layer was not built to
handle what is essentially a hybrid between a filesystem and a blk_pc
type request.  I'm working on fixing that.

Greg> I've not seen anyone arguing to keep the current kernel
Greg> implementation and I for one accept the previously posted
Greg> benchmarks that show it is not adding any value relative to the
Greg> traditional non-discard case.

For enterprise storage the cost of sending discards is limited to the
overhead of sending the command.  I.e. negligible.

Eventually that's going to be the case for ATA devices as well.  And let
me just reiterate that Windows 7 does issue TRIM like we do (at
runtime).  And consequently Windows 7 will help weed out the crap SSD
implementations from the market.  That's already happening.

There is also work underway to make TRIM a queueable command which would
further alleviate the situation.

Greg> Therefore benchmarks between the current hdparm/wiper.sh userspace
Greg> implementation and a proposed new kernel implementation would be
Greg> the most beneficial?

I don't know what you mean by "new" kernel implementation.  We're
working on tweaking what we have so that we can split, merge, and
coalesce requests.

I'm also not sure why you're so hung up on benchmarks.  The purpose of
TRIM is to increase longevity of a flash-based device.  For dumb devices
there's currently a tradeoff - do you want speed or endurance?  Once
devices get smarter that tradeoff will disappear.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 18:41                                   ` Ric Wheeler
@ 2010-04-26 14:00                                     ` Mark Lord
  2010-04-26 14:42                                       ` Martin K. Petersen
  0 siblings, 1 reply; 53+ messages in thread
From: Mark Lord @ 2010-04-26 14:00 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: Greg Freemyer, Eric Sandeen, Lukas Czerner, Jeff Moyer,
	linux-ext4, Edward Shishkin, Eric Sandeen, Christoph Hellwig

On 24/04/10 02:41 PM, Ric Wheeler wrote:
..
> I don't like the user space wiper.sh approach in general, but running
> wiper.sh is entirely your choice.
>
> Most users prefer having the file system and the IO stack take care of
> this for them, but again, entirely configurable.
..

I wrote wiper.sh, and I would prefer more of an in-kernel solution,
simply because it could potentially have the smarts to deal with RAID,
LVM, and btrfs's own duplicate implementation of RAID/LVM.

Those cannot be done from userspace on a mounted filesystem.

So.. again.. in an ideal kernel, I'd like to see use of larger ranges
(and _multiple_ ranges) per TRIM command.  And options for the kernel
to do it automatically (-o discard), as well as an ioctl() interface
for userspace to "scrub" (or "wipe") all free ranges in a gradual fashion.

Cheers

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-24 19:06                                   ` Martin K. Petersen
@ 2010-04-26 14:03                                     ` Mark Lord
  0 siblings, 0 replies; 53+ messages in thread
From: Mark Lord @ 2010-04-26 14:03 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Greg Freemyer, Ric Wheeler, Eric Sandeen, Lukas Czerner,
	Jeff Moyer, linux-ext4, Edward Shishkin, Eric Sandeen,
	Christoph Hellwig

On 24/04/10 03:06 PM, Martin K. Petersen wrote:
..
> Our discard implementation is meant to accommodate a wide range of
> devices.  Just because some of the currently shipping low-end consumer
> SSDs implement TRIM poorly does not mean we're going to scrap what we
> have.
..

The current implementation works extremely poorly for the singlemost
common style of hardware that's out there.  Millions and millions
of SATA SSDs, and they're becoming more and more ubiquitous.

A "solution" that ignores reality isn't very helpful.
We can do better than that, a lot better.

> We are not in the business of designing for the past.  Especially not
> when the past can be handled by a shell script.
..

We are in the business of supporting the hardware that people run Linux on.
Today, and tomorrow, and the next few years, that means SATA SSDs by the gazillions,
as well as a relatively smaller number of enterprise behemoths.

A shell script cannot currently deal with LVM, RAID, or btrfs filesystems.

Cheers

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-26 14:00                                     ` Mark Lord
@ 2010-04-26 14:42                                       ` Martin K. Petersen
  2010-04-26 15:27                                         ` Greg Freemyer
  2010-04-26 15:48                                         ` Ric Wheeler
  0 siblings, 2 replies; 53+ messages in thread
From: Martin K. Petersen @ 2010-04-26 14:42 UTC (permalink / raw)
  To: Mark Lord
  Cc: Ric Wheeler, Greg Freemyer, Eric Sandeen, Lukas Czerner,
	Jeff Moyer, linux-ext4, Edward Shishkin, Eric Sandeen,
	Christoph Hellwig

>>>>> "Mark" == Mark Lord <kernel@teksavvy.com> writes:

Mark> So.. again.. in an ideal kernel, I'd like to see use of larger
Mark> ranges (and _multiple_ ranges) per TRIM command.  And options for
Mark> the kernel to do it automatically (-o discard), as well as an
Mark> ioctl() interface for userspace to "scrub" (or "wipe") all free
Mark> ranges in a gradual fashion.

Again: Discard splitting, merging, and coalescing is coming.  I'm
working on it.  It's not exactly trivial.

My objection was purely wrt. nuking realtime discard in favor of
scrubbing.  We absolutely need both approaches.

But I'm not going to let crappy SSD firmware implementations set the
direction for what I'm working on.  It is much more interesting to
ensure that we work well for the devices that'll be shipping 6-12 months
from now (at the time where this stuff will realistically end up in
distro kernels).

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-26 14:42                                       ` Martin K. Petersen
@ 2010-04-26 15:27                                         ` Greg Freemyer
  2010-04-26 15:51                                           ` Lukas Czerner
  2010-04-28  1:25                                           ` Mark Lord
  2010-04-26 15:48                                         ` Ric Wheeler
  1 sibling, 2 replies; 53+ messages in thread
From: Greg Freemyer @ 2010-04-26 15:27 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Mark Lord, Ric Wheeler, Eric Sandeen, Lukas Czerner, Jeff Moyer,
	linux-ext4, Edward Shishkin, Eric Sandeen, Christoph Hellwig

On Mon, Apr 26, 2010 at 10:42 AM, Martin K. Petersen
<martin.petersen@oracle.com> wrote:
>>>>>> "Mark" == Mark Lord <kernel@teksavvy.com> writes:
>
> Mark> So.. again.. in an ideal kernel, I'd like to see use of larger
> Mark> ranges (and _multiple_ ranges) per TRIM command.  And options for
> Mark> the kernel to do it automatically (-o discard), as well as an
> Mark> ioctl() interface for userspace to "scrub" (or "wipe") all free
> Mark> ranges in a gradual fashion.
>
> Again: Discard splitting, merging, and coalescing is coming.  I'm
> working on it.  It's not exactly trivial.
>
> My objection was purely wrt. nuking realtime discard in favor of
> scrubbing.  We absolutely need both approaches.
>
> But I'm not going to let crappy SSD firmware implementations set the
> direction for what I'm working on.  It is much more interesting to
> ensure that we work well for the devices that'll be shipping 6-12 months
> from now (at the time where this stuff will realistically end up in
> distro kernels).
>
> --
> Martin K. Petersen      Oracle Linux Engineering
>

Is this an agreed summary as relates to ext4:

1) Current "-o discard" causes real-time calls to discard.  Although
not optimized for current generation hardware, both block-layer
optimizations and new hardware are coming, so it needs to be kept.

2) Kernel based discard scrubbing - Doesn't currently exist in 2.6.34,
all agree that for the LVM, mdraid, btrfs cases it is needed and there
is no linux tool (kernel or userspace) at present.  The Lukas proposed
patch is userspace invoked so a mount option is not needed.

3) Userspace discard is available today and works well with ext4 on a
single JBOD SSD which will be the typical laptop use as one example.

Mark, or anyone, do you think it would be a bad idea for me to push
for Opensuse 11.3 (2.6.34 based) to use wiper.sh as a standard ext4
discard tool?  hdparm and wiper.sh are already in the distro, it just
needs a cron entry and maybe some basic management infrastructure.
They're at feature freeze for 11.3, so I don't know if I can get it in
or not.

If and when the suse team want to move to the kernel based scrubber,
they can.  ie. a year from now when the next release after 11.3 comes
out.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-26 14:42                                       ` Martin K. Petersen
  2010-04-26 15:27                                         ` Greg Freemyer
@ 2010-04-26 15:48                                         ` Ric Wheeler
  1 sibling, 0 replies; 53+ messages in thread
From: Ric Wheeler @ 2010-04-26 15:48 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Mark Lord, Greg Freemyer, Eric Sandeen, Lukas Czerner, Jeff Moyer,
	linux-ext4, Edward Shishkin, Eric Sandeen, Christoph Hellwig

On 04/26/2010 10:42 AM, Martin K. Petersen wrote:
>>>>>> "Mark" == Mark Lord<kernel@teksavvy.com>  writes:
>
> Mark>  So.. again.. in an ideal kernel, I'd like to see use of larger
> Mark>  ranges (and _multiple_ ranges) per TRIM command.  And options for
> Mark>  the kernel to do it automatically (-o discard), as well as an
> Mark>  ioctl() interface for userspace to "scrub" (or "wipe") all free
> Mark>  ranges in a gradual fashion.
>
> Again: Discard splitting, merging, and coalescing is coming.  I'm
> working on it.  It's not exactly trivial.
>
> My objection was purely wrt. nuking realtime discard in favor of
> scrubbing.  We absolutely need both approaches.
>
> But I'm not going to let crappy SSD firmware implementations set the
> direction for what I'm working on.  It is much more interesting to
> ensure that we work well for the devices that'll be shipping 6-12 months
> from now (at the time where this stuff will realistically end up in
> distro kernels).
>

And one thing to keep in mind is that we are likely to need both run time 
support for discard as well as occasional resync in bulk since the storage can 
choose to ignore the command and still be in spec.  Kind of like the unfortunate 
"defrag" thing that windows people do,

ric

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-26 15:27                                         ` Greg Freemyer
@ 2010-04-26 15:51                                           ` Lukas Czerner
  2010-04-28  1:25                                           ` Mark Lord
  1 sibling, 0 replies; 53+ messages in thread
From: Lukas Czerner @ 2010-04-26 15:51 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Martin K. Petersen, Mark Lord, Ric Wheeler, Eric Sandeen,
	Lukas Czerner, Jeff Moyer, linux-ext4, Edward Shishkin,
	Eric Sandeen, Christoph Hellwig

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2663 bytes --]

On Mon, 26 Apr 2010, Greg Freemyer wrote:

> On Mon, Apr 26, 2010 at 10:42 AM, Martin K. Petersen
> <martin.petersen@oracle.com> wrote:
> >>>>>> "Mark" == Mark Lord <kernel@teksavvy.com> writes:
> >
> > Mark> So.. again.. in an ideal kernel, I'd like to see use of larger
> > Mark> ranges (and _multiple_ ranges) per TRIM command.  And options for
> > Mark> the kernel to do it automatically (-o discard), as well as an
> > Mark> ioctl() interface for userspace to "scrub" (or "wipe") all free
> > Mark> ranges in a gradual fashion.
> >
> > Again: Discard splitting, merging, and coalescing is coming.  I'm
> > working on it.  It's not exactly trivial.
> >
> > My objection was purely wrt. nuking realtime discard in favor of
> > scrubbing.  We absolutely need both approaches.
> >
> > But I'm not going to let crappy SSD firmware implementations set the
> > direction for what I'm working on.  It is much more interesting to
> > ensure that we work well for the devices that'll be shipping 6-12 months
> > from now (at the time where this stuff will realistically end up in
> > distro kernels).
> >
> > --
> > Martin K. Petersen      Oracle Linux Engineering
> >
> 
> Is this an agreed summary as relates to ext4:
> 
> 1) Current "-o discard" causes real-time calls to discard.  Although
> not optimized for current generation hardware, both block-layer
> optimizations and new hardware are coming, so it needs to be kept.
> 
> 2) Kernel based discard scrubbing - Doesn't currently exist in 2.6.34,
> all agree that for the LVM, mdraid, btrfs cases it is needed and there
> is no linux tool (kernel or userspace) at present.  The Lukas proposed
> patch is userspace invoked so a mount option is not needed.

It is userspace invoked indeed, but once you set the nodiscard mount
option it will not do anything. It seems only logical to me to NOT
discard anything, when "nodiscard" option is set.

> 
> 3) Userspace discard is available today and works well with ext4 on a
> single JBOD SSD which will be the typical laptop use as one example.

It (wiper) do not work on devices which does not support vectored trim
ranges.

> 
> Mark, or anyone, do you think it would be a bad idea for me to push
> for Opensuse 11.3 (2.6.34 based) to use wiper.sh as a standard ext4
> discard tool?  hdparm and wiper.sh are already in the distro, it just
> needs a cron entry and maybe some basic management infrastructure.
> They're at feature freeze for 11.3, so I don't know if I can get it in
> or not.
> 
> If and when the suse team want to move to the kernel based scrubber,
> they can.  ie. a year from now when the next release after 11.3 comes
> out.
> 
> Greg
> 

-Lukas

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-23  8:23                   ` Lukas Czerner
  2010-04-24 13:24                     ` Greg Freemyer
@ 2010-04-26 16:55                     ` Jan Kara
  2010-04-26 17:46                       ` Lukas Czerner
  1 sibling, 1 reply; 53+ messages in thread
From: Jan Kara @ 2010-04-26 16:55 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Greg Freemyer, Jeff Moyer, Ric Wheeler, Eric Sandeen, Mark Lord,
	linux-ext4, Edward Shishkin, Eric Sandeen, Christoph Hellwig

> On Wed, 21 Apr 2010, Greg Freemyer wrote:
> And also, currently I am rewriting the patch do use rbtree instead of the
> bitmap, because there were some concerns of memory consumption. It is a
> question whether or not the rbtree will be more memory friendly.
> Generally I think that in most "normal" cases it will, but there are some
> extreme scenarios, where the rbtree will be much worse. Any comment on
> this ?
  I see two possible improvements here:
a) At a cost of some code complexity, you can bound the worst case by combining
RB-trees with bitmaps. The basic idea is that when space to TRIM gets too
fragmented (memory to keep to-TRIM blocks in RB-tree for a given group exceeds
the memory needed to keep it in a bitmap), you convert RB-tree for a
problematic group to a bitmap and attach it to an appropriate RB-node. If you
track with a bitmap also a number of to-TRIM extents in the bitmap, you can
also decide whether it's benefitial to switch back to an RB-tree.

b) Another idea might be: When to-TRIM space is fragmented (again, let's say
in some block group), there's not much point in sending tiny trim commands
anyway (at least that's what I've understood from this discussion). So you
might as well stop maintaining information which blocks we need to trim
for that group. When the situation gets better, you can always walk block
bitmap and issue trim commands. You might even trigger this rescan from
kernel - if you'd maintain number of free block extents for each block group
(which is rather easy), you could trigger the bitmap rescan and trim as soon
as ratio number of free blocks / number of extents gets above a reasonable
threshold.

								Honza
-- 
Jan Kara <jack@suse.cz>
SuSE CR Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-26 16:55                     ` Jan Kara
@ 2010-04-26 17:46                       ` Lukas Czerner
  2010-04-26 17:52                         ` Ric Wheeler
  0 siblings, 1 reply; 53+ messages in thread
From: Lukas Czerner @ 2010-04-26 17:46 UTC (permalink / raw)
  To: Jan Kara
  Cc: Lukas Czerner, Greg Freemyer, Jeff Moyer, Ric Wheeler,
	Eric Sandeen, Mark Lord, linux-ext4, Edward Shishkin,
	Eric Sandeen, Christoph Hellwig

On Mon, 26 Apr 2010, Jan Kara wrote:

> > On Wed, 21 Apr 2010, Greg Freemyer wrote:
> > And also, currently I am rewriting the patch do use rbtree instead of the
> > bitmap, because there were some concerns of memory consumption. It is a
> > question whether or not the rbtree will be more memory friendly.
> > Generally I think that in most "normal" cases it will, but there are some
> > extreme scenarios, where the rbtree will be much worse. Any comment on
> > this ?
>   I see two possible improvements here:
> a) At a cost of some code complexity, you can bound the worst case by combining
> RB-trees with bitmaps. The basic idea is that when space to TRIM gets too
> fragmented (memory to keep to-TRIM blocks in RB-tree for a given group exceeds
> the memory needed to keep it in a bitmap), you convert RB-tree for a
> problematic group to a bitmap and attach it to an appropriate RB-node. If you
> track with a bitmap also a number of to-TRIM extents in the bitmap, you can
> also decide whether it's benefitial to switch back to an RB-tree.

This sounds like a good idea, but I wonder if it is worth it :
 1. The tree will have very short life, because with next ioctl all
 stored deleted extents will be trimmed and removed from the tree.
 2. Also note, that the longer it lives the less fragmented it possibly
 became.
 3. I do not expect, that deleted ranges can be too fragmented, and
 even if it is, it will be probably merged into one big extent very
 soon.

> 
> b) Another idea might be: When to-TRIM space is fragmented (again, let's say
> in some block group), there's not much point in sending tiny trim commands
> anyway (at least that's what I've understood from this discussion). So you
> might as well stop maintaining information which blocks we need to trim
> for that group. When the situation gets better, you can always walk block
> bitmap and issue trim commands. You might even trigger this rescan from
> kernel - if you'd maintain number of free block extents for each block group
> (which is rather easy), you could trigger the bitmap rescan and trim as soon
> as ratio number of free blocks / number of extents gets above a reasonable
> threshold.
> 
> 								Honza
> 

In what I am preparing now, I simple ignore small extents, which would
be created by splitting the deleted extent into smaller pieces by chunks
of used blocks. This, in my opinion, will prevent the fragmentation,
which otherwise may occur in the longer term (between ioctl calls).

Thanks for suggestions.
-Lukas

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-26 17:46                       ` Lukas Czerner
@ 2010-04-26 17:52                         ` Ric Wheeler
  2010-04-26 18:14                           ` Lukas Czerner
  0 siblings, 1 reply; 53+ messages in thread
From: Ric Wheeler @ 2010-04-26 17:52 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Jan Kara, Greg Freemyer, Jeff Moyer, Eric Sandeen, Mark Lord,
	linux-ext4, Edward Shishkin, Eric Sandeen, Christoph Hellwig

On 04/26/2010 01:46 PM, Lukas Czerner wrote:
> On Mon, 26 Apr 2010, Jan Kara wrote:
>
>>> On Wed, 21 Apr 2010, Greg Freemyer wrote:
>>> And also, currently I am rewriting the patch do use rbtree instead of the
>>> bitmap, because there were some concerns of memory consumption. It is a
>>> question whether or not the rbtree will be more memory friendly.
>>> Generally I think that in most "normal" cases it will, but there are some
>>> extreme scenarios, where the rbtree will be much worse. Any comment on
>>> this ?
>>    I see two possible improvements here:
>> a) At a cost of some code complexity, you can bound the worst case by combining
>> RB-trees with bitmaps. The basic idea is that when space to TRIM gets too
>> fragmented (memory to keep to-TRIM blocks in RB-tree for a given group exceeds
>> the memory needed to keep it in a bitmap), you convert RB-tree for a
>> problematic group to a bitmap and attach it to an appropriate RB-node. If you
>> track with a bitmap also a number of to-TRIM extents in the bitmap, you can
>> also decide whether it's benefitial to switch back to an RB-tree.
>
> This sounds like a good idea, but I wonder if it is worth it :
>   1. The tree will have very short life, because with next ioctl all
>   stored deleted extents will be trimmed and removed from the tree.
>   2. Also note, that the longer it lives the less fragmented it possibly
>   became.
>   3. I do not expect, that deleted ranges can be too fragmented, and
>   even if it is, it will be probably merged into one big extent very
>   soon.
>
>>
>> b) Another idea might be: When to-TRIM space is fragmented (again, let's say
>> in some block group), there's not much point in sending tiny trim commands
>> anyway (at least that's what I've understood from this discussion). So you
>> might as well stop maintaining information which blocks we need to trim
>> for that group. When the situation gets better, you can always walk block
>> bitmap and issue trim commands. You might even trigger this rescan from
>> kernel - if you'd maintain number of free block extents for each block group
>> (which is rather easy), you could trigger the bitmap rescan and trim as soon
>> as ratio number of free blocks / number of extents gets above a reasonable
>> threshold.
>>
>> 								Honza
>>
>
> In what I am preparing now, I simple ignore small extents, which would
> be created by splitting the deleted extent into smaller pieces by chunks
> of used blocks. This, in my opinion, will prevent the fragmentation,
> which otherwise may occur in the longer term (between ioctl calls).
>
> Thanks for suggestions.
> -Lukas

I am not convinced that ignoring small extents is a good idea. Remember that for 
SSD's specifically, they remap *everything* internally so our "fragmentation" 
set of small spaces could be useful for them.

That does not mean that we should not try to send larger requests down to the 
target device which is always a good idea I think :-)

ric


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-26 17:52                         ` Ric Wheeler
@ 2010-04-26 18:14                           ` Lukas Czerner
  2010-04-26 18:28                             ` Jeff Moyer
  0 siblings, 1 reply; 53+ messages in thread
From: Lukas Czerner @ 2010-04-26 18:14 UTC (permalink / raw)
  To: Ric Wheeler
  Cc: Lukas Czerner, Jan Kara, Greg Freemyer, Jeff Moyer, Eric Sandeen,
	Mark Lord, linux-ext4, Edward Shishkin, Eric Sandeen,
	Christoph Hellwig

On Mon, 26 Apr 2010, Ric Wheeler wrote:

> On 04/26/2010 01:46 PM, Lukas Czerner wrote:
> > On Mon, 26 Apr 2010, Jan Kara wrote:
> > 
> > > > On Wed, 21 Apr 2010, Greg Freemyer wrote:
> > > > And also, currently I am rewriting the patch do use rbtree instead of
> > > > the
> > > > bitmap, because there were some concerns of memory consumption. It is a
> > > > question whether or not the rbtree will be more memory friendly.
> > > > Generally I think that in most "normal" cases it will, but there are
> > > > some
> > > > extreme scenarios, where the rbtree will be much worse. Any comment on
> > > > this ?
> > >    I see two possible improvements here:
> > > a) At a cost of some code complexity, you can bound the worst case by
> > > combining
> > > RB-trees with bitmaps. The basic idea is that when space to TRIM gets too
> > > fragmented (memory to keep to-TRIM blocks in RB-tree for a given group
> > > exceeds
> > > the memory needed to keep it in a bitmap), you convert RB-tree for a
> > > problematic group to a bitmap and attach it to an appropriate RB-node. If
> > > you
> > > track with a bitmap also a number of to-TRIM extents in the bitmap, you
> > > can
> > > also decide whether it's benefitial to switch back to an RB-tree.
> > 
> > This sounds like a good idea, but I wonder if it is worth it :
> >   1. The tree will have very short life, because with next ioctl all
> >   stored deleted extents will be trimmed and removed from the tree.
> >   2. Also note, that the longer it lives the less fragmented it possibly
> >   became.
> >   3. I do not expect, that deleted ranges can be too fragmented, and
> >   even if it is, it will be probably merged into one big extent very
> >   soon.
> > 
> > > 
> > > b) Another idea might be: When to-TRIM space is fragmented (again, let's
> > > say
> > > in some block group), there's not much point in sending tiny trim commands
> > > anyway (at least that's what I've understood from this discussion). So you
> > > might as well stop maintaining information which blocks we need to trim
> > > for that group. When the situation gets better, you can always walk block
> > > bitmap and issue trim commands. You might even trigger this rescan from
> > > kernel - if you'd maintain number of free block extents for each block
> > > group
> > > (which is rather easy), you could trigger the bitmap rescan and trim as
> > > soon
> > > as ratio number of free blocks / number of extents gets above a reasonable
> > > threshold.
> > > 
> > > 								Honza
> > > 
> > 
> > In what I am preparing now, I simple ignore small extents, which would
> > be created by splitting the deleted extent into smaller pieces by chunks
> > of used blocks. This, in my opinion, will prevent the fragmentation,
> > which otherwise may occur in the longer term (between ioctl calls).
> > 
> > Thanks for suggestions.
> > -Lukas
> 
> I am not convinced that ignoring small extents is a good idea. Remember that
> for SSD's specifically, they remap *everything* internally so our
> "fragmentation" set of small spaces could be useful for them.
> 
> That does not mean that we should not try to send larger requests down to the
> target device which is always a good idea I think :-)
> 
> ric
> 

That's right, so the other approach would be probably better. Merge
small extents together into one, but there must be some limit, because I
do not want two little extents at the beginning and the end of the group
to force trimming whole group. The whole rbtree thing gets a little
complicated :)

-Lukas

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-26 18:14                           ` Lukas Czerner
@ 2010-04-26 18:28                             ` Jeff Moyer
  0 siblings, 0 replies; 53+ messages in thread
From: Jeff Moyer @ 2010-04-26 18:28 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Ric Wheeler, Jan Kara, Greg Freemyer, Eric Sandeen, Mark Lord,
	linux-ext4, Edward Shishkin, Eric Sandeen, Christoph Hellwig

Lukas Czerner <lczerner@redhat.com> writes:

> On Mon, 26 Apr 2010, Ric Wheeler wrote:
>
>> On 04/26/2010 01:46 PM, Lukas Czerner wrote:
>> > On Mon, 26 Apr 2010, Jan Kara wrote:
>> > 
>> > > > On Wed, 21 Apr 2010, Greg Freemyer wrote:
>> > > > And also, currently I am rewriting the patch do use rbtree instead of
>> > > > the
>> > > > bitmap, because there were some concerns of memory consumption. It is a
>> > > > question whether or not the rbtree will be more memory friendly.
>> > > > Generally I think that in most "normal" cases it will, but there are
>> > > > some
>> > > > extreme scenarios, where the rbtree will be much worse. Any comment on
>> > > > this ?
>> > >    I see two possible improvements here:
>> > > a) At a cost of some code complexity, you can bound the worst case by
>> > > combining
>> > > RB-trees with bitmaps. The basic idea is that when space to TRIM gets too
>> > > fragmented (memory to keep to-TRIM blocks in RB-tree for a given group
>> > > exceeds
>> > > the memory needed to keep it in a bitmap), you convert RB-tree for a
>> > > problematic group to a bitmap and attach it to an appropriate RB-node. If
>> > > you
>> > > track with a bitmap also a number of to-TRIM extents in the bitmap, you
>> > > can
>> > > also decide whether it's benefitial to switch back to an RB-tree.
>> > 
>> > This sounds like a good idea, but I wonder if it is worth it :
>> >   1. The tree will have very short life, because with next ioctl all
>> >   stored deleted extents will be trimmed and removed from the tree.
>> >   2. Also note, that the longer it lives the less fragmented it possibly
>> >   became.
>> >   3. I do not expect, that deleted ranges can be too fragmented, and
>> >   even if it is, it will be probably merged into one big extent very
>> >   soon.
>> > 
>> > > 
>> > > b) Another idea might be: When to-TRIM space is fragmented (again, let's
>> > > say
>> > > in some block group), there's not much point in sending tiny trim commands
>> > > anyway (at least that's what I've understood from this discussion). So you
>> > > might as well stop maintaining information which blocks we need to trim
>> > > for that group. When the situation gets better, you can always walk block
>> > > bitmap and issue trim commands. You might even trigger this rescan from
>> > > kernel - if you'd maintain number of free block extents for each block
>> > > group
>> > > (which is rather easy), you could trigger the bitmap rescan and trim as
>> > > soon
>> > > as ratio number of free blocks / number of extents gets above a reasonable
>> > > threshold.
>> > > 
>> > > 								Honza
>> > > 
>> > 
>> > In what I am preparing now, I simple ignore small extents, which would
>> > be created by splitting the deleted extent into smaller pieces by chunks
>> > of used blocks. This, in my opinion, will prevent the fragmentation,
>> > which otherwise may occur in the longer term (between ioctl calls).
>> > 
>> > Thanks for suggestions.
>> > -Lukas
>> 
>> I am not convinced that ignoring small extents is a good idea. Remember that
>> for SSD's specifically, they remap *everything* internally so our
>> "fragmentation" set of small spaces could be useful for them.
>> 
>> That does not mean that we should not try to send larger requests down to the
>> target device which is always a good idea I think :-)
>> 
>> ric
>> 
>
> That's right, so the other approach would be probably better. Merge
> small extents together into one, but there must be some limit, because I
> do not want two little extents at the beginning and the end of the group
> to force trimming whole group. The whole rbtree thing gets a little
> complicated :)

This discussion is getting a bit too abstract for me.  Show us the code
and we can make some progress.  =)

On the topic of discarding small blocks, I agree with Ric, it should be
done.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4.
  2010-04-26 15:27                                         ` Greg Freemyer
  2010-04-26 15:51                                           ` Lukas Czerner
@ 2010-04-28  1:25                                           ` Mark Lord
  1 sibling, 0 replies; 53+ messages in thread
From: Mark Lord @ 2010-04-28  1:25 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Martin K. Petersen, Ric Wheeler, Eric Sandeen, Lukas Czerner,
	Jeff Moyer, linux-ext4, Edward Shishkin, Eric Sandeen,
	Christoph Hellwig

On 26/04/10 11:27 AM, Greg Freemyer wrote:
..
> Mark, or anyone, do you think it would be a bad idea for me to push
> for Opensuse 11.3 (2.6.34 based) to use wiper.sh as a standard ext4
> discard tool?  hdparm and wiper.sh are already in the distro, it just
> needs a cron entry and maybe some basic management infrastructure.
> They're at feature freeze for 11.3, so I don't know if I can get it in
> or not.
..

I'd hold off for now.  Rumour has it that Intel SSDs are not 100% compliant
with TRIM -- they fail with large amounts of range data.

I don't have an Intel SSD to verify that or fix it with.

Cheers

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Ext4: batched discard support - simplified version
@ 2010-07-07  7:53 Lukas Czerner
  2010-07-07  7:53 ` [PATCH 1/2] Add ioctl FITRIM Lukas Czerner
                   ` (2 more replies)
  0 siblings, 3 replies; 53+ messages in thread
From: Lukas Czerner @ 2010-07-07  7:53 UTC (permalink / raw)
  To: eshishki; +Cc: lczerner, jmoyer, rwheeler, linux-ext4, sandeen

Hi all,

since my last post I have done some more testing with various SSD's and the
trend is clear. Trim performance is getting better and the performance loss
without trim is getting lower. So I have decided to abandon the initial idea
to track free blocks within some internal data structure - it takes time and
memory.

Today there are some SSD's which performance does not seems to degrade over
the time (writes). I have filled those devices up to 200% and still did not
seen any performance loss. On the other hand, there are still some devices
which shows about 300% performance degradation, so I suppose TRIM will be
still needed for some time.

You can try it out with the simple program attached below. Just create ext4
fs, mount it with -o discard and invoke attached program on ext4 mount point.

>From my experience the time needed to trim whole file system is strongly device
dependent. It may take few seconds on one device up to one minute on another,
under the heavy io load the time to trim whole fs gets longer.

There are two pathes:

[PATCH 1/2] Add ioctl FITRIM.
 fs/ioctl.c         |   31 +++++++++++++++++++++++++++++++
 include/linux/fs.h |    2 ++
 2 files changed, 33 insertions(+), 0 deletions(-)

[PATCH 2/2] Add batched discard support for ext4
 fs/ext4/ext4.h    |    2 +
 fs/ext4/mballoc.c |  126 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/super.c   |    1 +
 3 files changed, 118 insertions(+), 11 deletions(-)

Signed-off-by: "Lukas Czerner" <lczerner@redhat.com>

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>

#define FITRIM		_IOWR('X', 121, int)

int main(int argc, char **argv)
{
	int minsize = 4096;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s mountpoint\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, FITRIM, &minsize)) {
		if (errno == EOPNOTSUPP)
			fprintf(stderr, "TRIM not supported\n");
		else
			perror("EXT4_IOC_TRIM");
		return 1;
	}

	return 0;
}

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH 1/2] Add ioctl FITRIM.
  2010-07-07  7:53 Ext4: batched discard support - simplified version Lukas Czerner
@ 2010-07-07  7:53 ` Lukas Czerner
  2010-07-07  7:53 ` [PATCH 2/2] Add batched discard support for ext4 Lukas Czerner
  2010-07-23 14:36 ` Ext4: batched discard support - simplified version Ted Ts'o
  2 siblings, 0 replies; 53+ messages in thread
From: Lukas Czerner @ 2010-07-07  7:53 UTC (permalink / raw)
  To: eshishki; +Cc: lczerner, jmoyer, rwheeler, linux-ext4, sandeen

Adds an filesystem independent ioctl to allow implementation of file
system batched discard support.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ioctl.c         |   31 +++++++++++++++++++++++++++++++
 include/linux/fs.h |    2 ++
 2 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7faefb4..09b33ae 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -551,6 +551,33 @@ static int ioctl_fsthaw(struct file *filp)
 	return thaw_bdev(sb->s_bdev, sb);
 }
 
+static int ioctl_fstrim(struct file *filp, unsigned long arg)
+{
+	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+	unsigned int minlen;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* If filesystem doesn't support trim feature, return. */
+	if (sb->s_op->trim_fs == NULL)
+		return -EOPNOTSUPP;
+
+	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
+	if (sb->s_bdev == NULL)
+		return -EINVAL;
+
+	err = get_user(minlen, (unsigned int __user *) arg);
+	if (err)
+		return err;
+
+	err = sb->s_op->trim_fs(minlen, sb);
+	if (err)
+		return err;
+	return 0;
+}
+
 /*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
@@ -601,6 +628,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		error = ioctl_fsthaw(filp);
 		break;
 
+	case FITRIM:
+		error = ioctl_fstrim(filp, arg);
+		break;
+
 	case FS_IOC_FIEMAP:
 		return ioctl_fiemap(filp, arg);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 44f35ae..7a27fa4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@ struct inodes_stat_t {
 #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
 #define FIFREEZE	_IOWR('X', 119, int)	/* Freeze */
 #define FITHAW		_IOWR('X', 120, int)	/* Thaw */
+#define FITRIM		_IOWR('X', 121, int)	/* Trim */
 
 #define	FS_IOC_GETFLAGS			_IOR('f', 1, long)
 #define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
@@ -1580,6 +1581,7 @@ struct super_operations {
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+	int (*trim_fs) (unsigned int, struct super_block *);
 };
 
 /*
-- 
1.6.6.1


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 2/2] Add batched discard support for ext4
  2010-07-07  7:53 Ext4: batched discard support - simplified version Lukas Czerner
  2010-07-07  7:53 ` [PATCH 1/2] Add ioctl FITRIM Lukas Czerner
@ 2010-07-07  7:53 ` Lukas Czerner
  2010-07-14  8:33   ` Dmitry Monakhov
  2010-07-23 14:36 ` Ext4: batched discard support - simplified version Ted Ts'o
  2 siblings, 1 reply; 53+ messages in thread
From: Lukas Czerner @ 2010-07-07  7:53 UTC (permalink / raw)
  To: eshishki; +Cc: lczerner, jmoyer, rwheeler, linux-ext4, sandeen

Walk through each allocation group and trim all free extents. It can be
invoked through TRIM ioctl on the file system. The main idea is to
provide a way to trim the whole file system if needed, since some SSD's
may suffer from performance loss after the whole device was filled (it
does not mean that fs is full!).

It search fro free extents in each allocation group. When the free
extent is found, blocks are marked as used and then trimmed. Afterwards
these blocks are marked as free in per-group bitmap.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/ext4.h    |    2 +
 fs/ext4/mballoc.c |  126 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/super.c   |    1 +
 3 files changed, 118 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf..ba0fff0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1437,6 +1437,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
+extern int ext4_trim_fs(unsigned int, struct super_block *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a36..c7b541c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2535,17 +2535,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
 			 entry->count, entry->group, entry);
 
-		if (test_opt(sb, DISCARD)) {
-			ext4_fsblk_t discard_block;
-
-			discard_block = entry->start_blk +
-				ext4_group_first_block_no(sb, entry->group);
-			trace_ext4_discard_blocks(sb,
-					(unsigned long long)discard_block,
-					entry->count);
-			sb_issue_discard(sb, discard_block, entry->count);
-		}
-
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
 		/* we expect to find existing buddy because it's pinned */
 		BUG_ON(err != 0);
@@ -4640,3 +4629,118 @@ error_return:
 		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
+
+/**
+ * Trim "count" blocks starting at "start" in "group"
+ * This must be called under group lock
+ */
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	ext4_fsblk_t discard_block;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_free_extent ex;
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	/**
+	 * Mark blocks used, so no one can reuse them while
+	 * being trimmed.
+	 */
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+
+	discard_block = (ext4_fsblk_t)group *
+			EXT4_BLOCKS_PER_GROUP(sb)
+			+ start
+			+ le32_to_cpu(es->s_first_data_block);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long)discard_block,
+			count);
+	sb_issue_discard(sb, discard_block, count);
+
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+}
+
+/**
+ * Trim all free extents in group at least minblocks long
+ */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_grpblk_t start, next, count = 0;
+	ext4_group_t group;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	start = e4b->bd_info->bb_first_free;
+	ext4_lock_group(sb, group);
+
+	while (start < max) {
+
+		start = mb_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minblocks) {
+			count += next - start;
+			ext4_trim_extent(sb, start,
+				next - start, group, e4b);
+		}
+		start = next + 1;
+
+		if ((e4b->bd_info->bb_free - count) < minblocks)
+			break;
+	}
+
+	ext4_unlock_group(sb, group);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	return count;
+}
+
+int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
+{
+	struct ext4_buddy e4b;
+	ext4_group_t group;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t minblocks;
+
+	if (!test_opt(sb, DISCARD))
+		return 0;
+
+	minblocks = DIV_ROUND_UP(minlen, sb->s_blocksize);
+	if (unlikely(minblocks > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	for (group = 0; group < ngroups; group++) {
+		int err;
+
+		err = ext4_mb_load_buddy(sb, group, &e4b);
+		if (err) {
+			ext4_error(sb, "Error in loading buddy "
+					"information for %u", group);
+			continue;
+		}
+
+		if (e4b.bd_info->bb_free >= minblocks) {
+			ext4_trim_all_free(sb, &e4b, minblocks);
+		}
+
+		ext4_mb_release_desc(&e4b);
+	}
+
+	return 0;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c..253eb98 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1109,6 +1109,7 @@ static const struct super_operations ext4_sops = {
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
-- 
1.6.6.1


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4
  2010-07-07  7:53 ` [PATCH 2/2] Add batched discard support for ext4 Lukas Czerner
@ 2010-07-14  8:33   ` Dmitry Monakhov
  2010-07-14  9:40     ` Lukas Czerner
  0 siblings, 1 reply; 53+ messages in thread
From: Dmitry Monakhov @ 2010-07-14  8:33 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: eshishki, jmoyer, rwheeler, linux-ext4, sandeen

[-- Attachment #1: Type: text/plain, Size: 1086 bytes --]

Lukas Czerner <lczerner@redhat.com> writes:

> Walk through each allocation group and trim all free extents. It can be
> invoked through TRIM ioctl on the file system. The main idea is to
> provide a way to trim the whole file system if needed, since some SSD's
> may suffer from performance loss after the whole device was filled (it
> does not mean that fs is full!).
>
> It search fro free extents in each allocation group. When the free
> extent is found, blocks are marked as used and then trimmed. Afterwards
> these blocks are marked as free in per-group bitmap.
Looks ok, except two small notes:
1) trim_fs is a time consuming operation and we have to add
   condresced, and signal_pending checks to allow user to interrupt
   cmd if necessery. See patch attached.
2) IMHO runtime trim support is useful sometimes, for example when
   user really care about data security i.e. unlinked file should be
   trimmed ASAP. I think we have to provide 'secdel' mount option
   similar to secdeletion flag for inode, but this is another story
   not directly connected with the patch.


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-ext4-Add-interrupt-points-to-batched-discard.patch --]
[-- Type: text/x-diff, Size: 2230 bytes --]

>From 54ca2add544f88ac9ca8c647ae7b093be9ac2872 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 14 Jul 2010 12:16:50 +0400
Subject: [PATCH] ext4: Add interrupt points to batched discard

Since fstrim is a long operation it will be good to have an
ability to interrupt it by a signal.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/mballoc.c |   27 +++++++++++++++++----------
 1 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 48abd3d..0948408 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3959,7 +3959,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
 			(unsigned long long)discard_block,
 			count);
 	sb_issue_discard(sb, discard_block, count);
-
+	cond_resched();
 	ext4_lock_group(sb, group);
 	mb_free_blocks(NULL, e4b, start, ex.fe_len);
 }
@@ -3995,7 +3995,10 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
 				next - start, group, e4b);
 		}
 		start = next + 1;
-
+		if (signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
 		if ((e4b->bd_info->bb_free - count) < minblocks)
 			break;
 	}
@@ -4013,7 +4016,8 @@ int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
 	struct ext4_buddy e4b;
 	ext4_group_t group;
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	ext4_grpblk_t minblocks;
+	ext4_grpblk_t minblocks, cnt;
+	int ret = 0;
 
 	if (!test_opt(sb, DISCARD))
 		return 0;
@@ -4023,22 +4027,25 @@ int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
 		return -EINVAL;
 
 	for (group = 0; group < ngroups; group++) {
-		int err;
-
-		err = ext4_mb_load_buddy(sb, group, &e4b);
-		if (err) {
+		ret = ext4_mb_load_buddy(sb, group, &e4b);
+		if (ret) {
 			ext4_error(sb, "Error in loading buddy "
 					"information for %u", group);
-			continue;
+			break;
 		}
 
 		if (e4b.bd_info->bb_free >= minblocks) {
-			ext4_trim_all_free(sb, &e4b, minblocks);
+			cnt = ext4_trim_all_free(sb, &e4b, minblocks);
+			if (cnt < 0) {
+				ret = cnt;
+				ext4_mb_unload_buddy(&e4b);
+				break;
+			}
 		}
 		ext4_mb_unload_buddy(&e4b);
 	}
 
-	return 0;
+	return ret;
 }
 
 /*
-- 
1.6.6.1


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4
  2010-07-14  8:33   ` Dmitry Monakhov
@ 2010-07-14  9:40     ` Lukas Czerner
  2010-07-14 10:03       ` Dmitry Monakhov
  0 siblings, 1 reply; 53+ messages in thread
From: Lukas Czerner @ 2010-07-14  9:40 UTC (permalink / raw)
  To: Dmitry Monakhov
  Cc: Lukas Czerner, eshishki, jmoyer, rwheeler, linux-ext4, sandeen

On Wed, 14 Jul 2010, Dmitry Monakhov wrote:

> Lukas Czerner <lczerner@redhat.com> writes:
> 
> > Walk through each allocation group and trim all free extents. It can be
> > invoked through TRIM ioctl on the file system. The main idea is to
> > provide a way to trim the whole file system if needed, since some SSD's
> > may suffer from performance loss after the whole device was filled (it
> > does not mean that fs is full!).
> >
> > It search fro free extents in each allocation group. When the free
> > extent is found, blocks are marked as used and then trimmed. Afterwards
> > these blocks are marked as free in per-group bitmap.
> Looks ok, except two small notes:
> 1) trim_fs is a time consuming operation and we have to add
>    condresced, and signal_pending checks to allow user to interrupt
>    cmd if necessery. See patch attached.

Hi, Dimitry

thanks for your patch! Although I have one question:


 	for (group = 0; group < ngroups; group++) {
-		int err;
-
-		err = ext4_mb_load_buddy(sb, group, &e4b);
-		if (err) {
+		ret = ext4_mb_load_buddy(sb, group, &e4b);
+		if (ret) {
 			ext4_error(sb, "Error in loading buddy "
 					"information for %u", group);
-			continue;
+			break;
 		}

Is there really need to jump out from the loop and exit in the case of
load_buddy failure ? Next group may very well succeed in loading buddy,
or am I missing something ?

> 2) IMHO runtime trim support is useful sometimes, for example when
>    user really care about data security i.e. unlinked file should be
>    trimmed ASAP. I think we have to provide 'secdel' mount option
>    similar to secdeletion flag for inode, but this is another story
>    not directly connected with the patch.

I like the idea, but IMO this should work for any underlying storage,
not just for SSDs.

Thanks!
-Lukas

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4
  2010-07-14  9:40     ` Lukas Czerner
@ 2010-07-14 10:03       ` Dmitry Monakhov
  2010-07-14 11:43         ` Lukas Czerner
  0 siblings, 1 reply; 53+ messages in thread
From: Dmitry Monakhov @ 2010-07-14 10:03 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: eshishki, jmoyer, rwheeler, linux-ext4, sandeen

Lukas Czerner <lczerner@redhat.com> writes:

> On Wed, 14 Jul 2010, Dmitry Monakhov wrote:
>
>> Lukas Czerner <lczerner@redhat.com> writes:
>> 
>> > Walk through each allocation group and trim all free extents. It can be
>> > invoked through TRIM ioctl on the file system. The main idea is to
>> > provide a way to trim the whole file system if needed, since some SSD's
>> > may suffer from performance loss after the whole device was filled (it
>> > does not mean that fs is full!).
>> >
>> > It search fro free extents in each allocation group. When the free
>> > extent is found, blocks are marked as used and then trimmed. Afterwards
>> > these blocks are marked as free in per-group bitmap.
>> Looks ok, except two small notes:
>> 1) trim_fs is a time consuming operation and we have to add
>>    condresced, and signal_pending checks to allow user to interrupt
>>    cmd if necessery. See patch attached.
>
> Hi, Dimitry
>
> thanks for your patch! Although I have one question:
>
>
>  	for (group = 0; group < ngroups; group++) {
> -		int err;
> -
> -		err = ext4_mb_load_buddy(sb, group, &e4b);
> -		if (err) {
> +		ret = ext4_mb_load_buddy(sb, group, &e4b);
> +		if (ret) {
>  			ext4_error(sb, "Error in loading buddy "
>  					"information for %u", group);
> -			continue;
> +			break;
>  		}
>
> Is there really need to jump out from the loop and exit in the case of
> load_buddy failure ? Next group may very well succeed in loading buddy,
> or am I missing something ?
Well, it may fail due to -ENOMEM which is not scary but in some places
it may fail due to EIO which is a very bad sign. So i think it is
slightly dangerous to continue if we have found a same group.
>
>> 2) IMHO runtime trim support is useful sometimes, for example when
>>    user really care about data security i.e. unlinked file should be
>>    trimmed ASAP. I think we have to provide 'secdel' mount option
>>    similar to secdeletion flag for inode, but this is another story
>>    not directly connected with the patch.
>
> I like the idea, but IMO this should work for any underlying storage,
> not just for SSDs.
Off course. We may use blkdev_issue_zeroout() if disk does not support
discard with zeroing.
>
> Thanks!
> -Lukas
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 2/2] Add batched discard support for ext4
  2010-07-14 10:03       ` Dmitry Monakhov
@ 2010-07-14 11:43         ` Lukas Czerner
  0 siblings, 0 replies; 53+ messages in thread
From: Lukas Czerner @ 2010-07-14 11:43 UTC (permalink / raw)
  To: Dmitry Monakhov
  Cc: Lukas Czerner, eshishki, jmoyer, rwheeler, linux-ext4, sandeen

On Wed, 14 Jul 2010, Dmitry Monakhov wrote:

> Lukas Czerner <lczerner@redhat.com> writes:
> 
> > On Wed, 14 Jul 2010, Dmitry Monakhov wrote:
> >
> >> Lukas Czerner <lczerner@redhat.com> writes:
> >> 
> >> > Walk through each allocation group and trim all free extents. It can be
> >> > invoked through TRIM ioctl on the file system. The main idea is to
> >> > provide a way to trim the whole file system if needed, since some SSD's
> >> > may suffer from performance loss after the whole device was filled (it
> >> > does not mean that fs is full!).
> >> >
> >> > It search fro free extents in each allocation group. When the free
> >> > extent is found, blocks are marked as used and then trimmed. Afterwards
> >> > these blocks are marked as free in per-group bitmap.
> >> Looks ok, except two small notes:
> >> 1) trim_fs is a time consuming operation and we have to add
> >>    condresced, and signal_pending checks to allow user to interrupt
> >>    cmd if necessery. See patch attached.
> >
> > Hi, Dimitry
> >
> > thanks for your patch! Although I have one question:
> >
> >
> >  	for (group = 0; group < ngroups; group++) {
> > -		int err;
> > -
> > -		err = ext4_mb_load_buddy(sb, group, &e4b);
> > -		if (err) {
> > +		ret = ext4_mb_load_buddy(sb, group, &e4b);
> > +		if (ret) {
> >  			ext4_error(sb, "Error in loading buddy "
> >  					"information for %u", group);
> > -			continue;
> > +			break;
> >  		}
> >
> > Is there really need to jump out from the loop and exit in the case of
> > load_buddy failure ? Next group may very well succeed in loading buddy,
> > or am I missing something ?
> Well, it may fail due to -ENOMEM which is not scary but in some places
> it may fail due to EIO which is a very bad sign. So i think it is
> slightly dangerous to continue if we have found a same group.

Ok, it seems reasonable.

> >
> >> 2) IMHO runtime trim support is useful sometimes, for example when
> >>    user really care about data security i.e. unlinked file should be
> >>    trimmed ASAP. I think we have to provide 'secdel' mount option
> >>    similar to secdeletion flag for inode, but this is another story
> >>    not directly connected with the patch.
> >
> > I like the idea, but IMO this should work for any underlying storage,
> > not just for SSDs.
> Off course. We may use blkdev_issue_zeroout() if disk does not support
> discard with zeroing.
> >

-Lukas

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: Ext4: batched discard support - simplified version
  2010-07-07  7:53 Ext4: batched discard support - simplified version Lukas Czerner
  2010-07-07  7:53 ` [PATCH 1/2] Add ioctl FITRIM Lukas Czerner
  2010-07-07  7:53 ` [PATCH 2/2] Add batched discard support for ext4 Lukas Czerner
@ 2010-07-23 14:36 ` Ted Ts'o
  2010-07-23 15:13   ` Jeff Moyer
  2010-07-26 10:30   ` Lukas Czerner
  2 siblings, 2 replies; 53+ messages in thread
From: Ted Ts'o @ 2010-07-23 14:36 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: eshishki, jmoyer, rwheeler, linux-ext4, sandeen

On Wed, Jul 07, 2010 at 09:53:30AM +0200, Lukas Czerner wrote:
> 
> Hi all,
> 
> since my last post I have done some more testing with various SSD's and the
> trend is clear. Trim performance is getting better and the performance loss
> without trim is getting lower. So I have decided to abandon the initial idea
> to track free blocks within some internal data structure - it takes time and
> memory.

Do you have some numbers about how bad trim actually might be on
various devices?  I can imagine some devices where it might be better
(for wear levelling and better write endurance if nothing else) where
it's better to do the trim right away instead of batching things.

So what I'm thinking about doing is keeping the "discard" mount option
to mean non-batched discard.  If you want to use the explicit FITRIM
ioctl, I don't think we need to test to see if the dicard mount option
is set; if the user issues the ioctl, then we should do the batched
discard, and if we don't trust the user to do that, then well, the
ioctl should be restricted to privileged users only --- especially if
it could take up to a minute.

						- Ted

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: Ext4: batched discard support - simplified version
  2010-07-23 14:36 ` Ext4: batched discard support - simplified version Ted Ts'o
@ 2010-07-23 15:13   ` Jeff Moyer
  2010-07-23 15:19     ` Ted Ts'o
  2010-07-23 15:30     ` Greg Freemyer
  2010-07-26 10:30   ` Lukas Czerner
  1 sibling, 2 replies; 53+ messages in thread
From: Jeff Moyer @ 2010-07-23 15:13 UTC (permalink / raw)
  To: Ted Ts'o; +Cc: Lukas Czerner, eshishki, rwheeler, linux-ext4, sandeen

"Ted Ts'o" <tytso@mit.edu> writes:

> On Wed, Jul 07, 2010 at 09:53:30AM +0200, Lukas Czerner wrote:
>> 
>> Hi all,
>> 
>> since my last post I have done some more testing with various SSD's and the
>> trend is clear. Trim performance is getting better and the performance loss
>> without trim is getting lower. So I have decided to abandon the initial idea
>> to track free blocks within some internal data structure - it takes time and
>> memory.
>
> Do you have some numbers about how bad trim actually might be on
> various devices?

I'll let Lukas answer that when he gets back to the office next week.
The performance of the trim command itself varies by vendor, of course.

> I can imagine some devices where it might be better (for wear
> levelling and better write endurance if nothing else) where it's
> better to do the trim right away instead of batching things.

I don't think so.  In all of the configurations tested, I'm pretty sure
we saw a performance hit from doing the TRIMs right away.  The queue
flush really hurts.  Of course, I have no idea what you had in mind for
the amount of time in between batched discards.

> So what I'm thinking about doing is keeping the "discard" mount option
> to mean non-batched discard.  If you want to use the explicit FITRIM
> ioctl, I don't think we need to test to see if the dicard mount option
> is set; if the user issues the ioctl, then we should do the batched
> discard, and if we don't trust the user to do that, then well, the
> ioctl should be restricted to privileged users only --- especially if
> it could take up to a minute.

That sounds reasonable to me.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: Ext4: batched discard support - simplified version
  2010-07-23 15:13   ` Jeff Moyer
@ 2010-07-23 15:19     ` Ted Ts'o
  2010-07-23 15:40       ` Jeff Moyer
  2010-07-24 16:31       ` Ric Wheeler
  2010-07-23 15:30     ` Greg Freemyer
  1 sibling, 2 replies; 53+ messages in thread
From: Ted Ts'o @ 2010-07-23 15:19 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Lukas Czerner, eshishki, rwheeler, linux-ext4, sandeen

On Fri, Jul 23, 2010 at 11:13:52AM -0400, Jeff Moyer wrote:
> 
> I don't think so.  In all of the configurations tested, I'm pretty sure
> we saw a performance hit from doing the TRIMs right away.  The queue
> flush really hurts.  Of course, I have no idea what you had in mind for
> the amount of time in between batched discards.

Sure, but not all the world is SATA-attached SSD's.  I'm thinking in
particular of PCIe-attached SSD's, where the TRIM command might be
very fast indeed...  I believe Ric Wheeler tells me you have TMS
RamSan SSD's in house that you are testing?  And of course those
aren't the only PCIe-attached flash devices out there....

       	   		      	    	    	- Ted

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: Ext4: batched discard support - simplified version
  2010-07-23 15:13   ` Jeff Moyer
  2010-07-23 15:19     ` Ted Ts'o
@ 2010-07-23 15:30     ` Greg Freemyer
  1 sibling, 0 replies; 53+ messages in thread
From: Greg Freemyer @ 2010-07-23 15:30 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: Ted Ts'o, Lukas Czerner, eshishki, rwheeler, linux-ext4,
	sandeen

On Fri, Jul 23, 2010 at 11:13 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
> "Ted Ts'o" <tytso@mit.edu> writes:
>
>> On Wed, Jul 07, 2010 at 09:53:30AM +0200, Lukas Czerner wrote:
>>>
>>> Hi all,
>>>
>>> since my last post I have done some more testing with various SSD's and the
>>> trend is clear. Trim performance is getting better and the performance loss
>>> without trim is getting lower. So I have decided to abandon the initial idea
>>> to track free blocks within some internal data structure - it takes time and
>>> memory.
>>
>> Do you have some numbers about how bad trim actually might be on
>> various devices?
>
> I'll let Lukas answer that when he gets back to the office next week.
> The performance of the trim command itself varies by vendor, of course.
>
>> I can imagine some devices where it might be better (for wear
>> levelling and better write endurance if nothing else) where it's
>> better to do the trim right away instead of batching things.
>
> I don't think so.  In all of the configurations tested, I'm pretty sure
> we saw a performance hit from doing the TRIMs right away.  The queue
> flush really hurts.  Of course, I have no idea what you had in mind for
> the amount of time in between batched discards.

It was my understanding that way back when, Intel was pushing for the
TRIMs to be right away.  That may be why they never fully implemented
the TRIM command to accept more than one sectors worth of vectorized
data.  (That is still multiple ranges per discard, just not
hundreds/thousands.)

Along those lines, does this patch create multi-sector discard trim
commands when there is a large list of discard ranges (ie. thousands
of ranges to discard)?  And if so, does it have a blacklist for SSDs
like the Intel that don't implement the multi-sector payload part of
the spec?

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: Ext4: batched discard support - simplified version
  2010-07-23 15:19     ` Ted Ts'o
@ 2010-07-23 15:40       ` Jeff Moyer
  2010-07-23 17:00         ` Ted Ts'o
  2010-07-24 16:31       ` Ric Wheeler
  1 sibling, 1 reply; 53+ messages in thread
From: Jeff Moyer @ 2010-07-23 15:40 UTC (permalink / raw)
  To: Ted Ts'o; +Cc: Lukas Czerner, eshishki, rwheeler, linux-ext4, sandeen

"Ted Ts'o" <tytso@mit.edu> writes:

> On Fri, Jul 23, 2010 at 11:13:52AM -0400, Jeff Moyer wrote:
>> 
>> I don't think so.  In all of the configurations tested, I'm pretty sure
>> we saw a performance hit from doing the TRIMs right away.  The queue
>> flush really hurts.  Of course, I have no idea what you had in mind for
>> the amount of time in between batched discards.
>
> Sure, but not all the world is SATA-attached SSD's.  I'm thinking in
> particular of PCIe-attached SSD's, where the TRIM command might be
> very fast indeed...  I believe Ric Wheeler tells me you have TMS
> RamSan SSD's in house that you are testing?  And of course those
> aren't the only PCIe-attached flash devices out there....

You are right, and we have to consider thinly provisioned luns, as well.
The only case I can think of where it makes sense to issue those
discards immediately is if you are running tight on allocated space in
your thinly provisioned lun.  Aside from that, I'm not sure why you
would want to send those commands down with every journal commit,
instead of batched daily, for example.  But, I can certainly understand
wanting to allow this flexibility.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: Ext4: batched discard support - simplified version
  2010-07-23 15:40       ` Jeff Moyer
@ 2010-07-23 17:00         ` Ted Ts'o
  0 siblings, 0 replies; 53+ messages in thread
From: Ted Ts'o @ 2010-07-23 17:00 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Lukas Czerner, eshishki, rwheeler, linux-ext4, sandeen

On Fri, Jul 23, 2010 at 11:40:58AM -0400, Jeff Moyer wrote:
> 
> You are right, and we have to consider thinly provisioned luns, as well.
> The only case I can think of where it makes sense to issue those
> discards immediately is if you are running tight on allocated space in
> your thinly provisioned lun.  Aside from that, I'm not sure why you
> would want to send those commands down with every journal commit,
> instead of batched daily, for example.  But, I can certainly understand
> wanting to allow this flexibility.

The two reasons I could imagine is to give more flexibility to the
wear leveling algorithms (depending on how often you are turning over
files --- i.e., deleting blocks and then reusing them), and to
minimize latency (it might be nicer for the system to send down the
deleted blocks on a continuing basis rather than to send them down all
at once).  

The other issue is that by sending TRIM commands for all free extents,
even those that haven't been recently been released, the flash
translation layer needs to look up a large number of blocks in its
translation table to see if it needs to update it.  This can end up
burning CPU unnecessarily, especially for those flash devices (such as
FusionIO, for example) manage their FTL using the host CPU.

So this is one of the reasons why I want to leave some flexibility
here; BTW, for some systems, it may make sense for the FITRIM ioctl to
throttle the rate at which it locks block groups and sends down TRIM
requests so it doesn't end up causing performance hiccups for live
applications while the FITRIM ioctl is running.

	     	     	       	      	       - Ted

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: Ext4: batched discard support - simplified version
  2010-07-23 15:19     ` Ted Ts'o
  2010-07-23 15:40       ` Jeff Moyer
@ 2010-07-24 16:31       ` Ric Wheeler
  1 sibling, 0 replies; 53+ messages in thread
From: Ric Wheeler @ 2010-07-24 16:31 UTC (permalink / raw)
  To: Ted Ts'o; +Cc: Jeff Moyer, Lukas Czerner, eshishki, linux-ext4, sandeen

On 07/23/2010 11:19 AM, Ted Ts'o wrote:
> On Fri, Jul 23, 2010 at 11:13:52AM -0400, Jeff Moyer wrote:
>    
>> I don't think so.  In all of the configurations tested, I'm pretty sure
>> we saw a performance hit from doing the TRIMs right away.  The queue
>> flush really hurts.  Of course, I have no idea what you had in mind for
>> the amount of time in between batched discards.
>>      
> Sure, but not all the world is SATA-attached SSD's.  I'm thinking in
> particular of PCIe-attached SSD's, where the TRIM command might be
> very fast indeed...  I believe Ric Wheeler tells me you have TMS
> RamSan SSD's in house that you are testing?  And of course those
> aren't the only PCIe-attached flash devices out there....
>
>         	   		      	    	    	- Ted
>    

I think that some of the PCI-e cards might want that information right 
away, a lot of high end arrays actually prefer fewer larger chunks.

One other user might be virtual devices or some remote replication 
mechanism. I wonder if the drbd people for example might (do?) use these?

ric


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: Ext4: batched discard support - simplified version
  2010-07-23 14:36 ` Ext4: batched discard support - simplified version Ted Ts'o
  2010-07-23 15:13   ` Jeff Moyer
@ 2010-07-26 10:30   ` Lukas Czerner
  1 sibling, 0 replies; 53+ messages in thread
From: Lukas Czerner @ 2010-07-26 10:30 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: Lukas Czerner, eshishki, jmoyer, rwheeler, linux-ext4, sandeen

On Fri, 23 Jul 2010, Ted Ts'o wrote:

> On Wed, Jul 07, 2010 at 09:53:30AM +0200, Lukas Czerner wrote:
> > 
> > Hi all,
> > 
> > since my last post I have done some more testing with various SSD's and the
> > trend is clear. Trim performance is getting better and the performance loss
> > without trim is getting lower. So I have decided to abandon the initial idea
> > to track free blocks within some internal data structure - it takes time and
> > memory.
> 
> Do you have some numbers about how bad trim actually might be on
> various devices?  I can imagine some devices where it might be better
> (for wear levelling and better write endurance if nothing else) where
> it's better to do the trim right away instead of batching things.

Hi,

Yes, I have those numbers.

http://people.redhat.com/jmoyer/discard/ext4_batched_discard/ext4_discard.html
This page presents my test results on three different devices. I have
tested the current ext4 discard implementation (do the trim right away).
However, one tested device is still not on that page. With this
(Vendor4) device I have got only about 1.83% performance loss, which is
very good.

http://people.redhat.com/jmoyer/discard/ext4_batched_discard/ext4_ioctltrim.html
This page provides test results with my batched discard implementation.
Take those numbers with discretion, because the patch still does not
involve journaling and I have tested the "worst case" scenario, which
involves issuing FITRIM in endless loop without any sleep.

Generally the FITRIM ioctl can take from 2 seconds on fast devices to
several (2-4) minutes on very slow devices, or under heavy load.

> 
> So what I'm thinking about doing is keeping the "discard" mount option
> to mean non-batched discard.  If you want to use the explicit FITRIM
> ioctl, I don't think we need to test to see if the dicard mount option
> is set; if the user issues the ioctl, then we should do the batched
> discard, and if we don't trust the user to do that, then well, the
> ioctl should be restricted to privileged users only --- especially if
> it could take up to a minute.

I agree.

> 
> 						- Ted
> 

Thanks.

-Lukas

^ permalink raw reply	[flat|nested] 53+ messages in thread

end of thread, other threads:[~2010-07-26 10:30 UTC | newest]

Thread overview: 53+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-07-07  7:53 Ext4: batched discard support - simplified version Lukas Czerner
2010-07-07  7:53 ` [PATCH 1/2] Add ioctl FITRIM Lukas Czerner
2010-07-07  7:53 ` [PATCH 2/2] Add batched discard support for ext4 Lukas Czerner
2010-07-14  8:33   ` Dmitry Monakhov
2010-07-14  9:40     ` Lukas Czerner
2010-07-14 10:03       ` Dmitry Monakhov
2010-07-14 11:43         ` Lukas Czerner
2010-07-23 14:36 ` Ext4: batched discard support - simplified version Ted Ts'o
2010-07-23 15:13   ` Jeff Moyer
2010-07-23 15:19     ` Ted Ts'o
2010-07-23 15:40       ` Jeff Moyer
2010-07-23 17:00         ` Ted Ts'o
2010-07-24 16:31       ` Ric Wheeler
2010-07-23 15:30     ` Greg Freemyer
2010-07-26 10:30   ` Lukas Czerner
  -- strict thread matches above, loose matches on Subject: below --
2010-04-19 10:55 Ext4: batched discard support Lukas Czerner
2010-04-19 10:55 ` [PATCH 1/2] Add ioctl FITRIM Lukas Czerner
2010-04-19 10:55   ` [PATCH 2/2] Add batched discard support for ext4 Lukas Czerner
2010-04-20 21:21     ` Greg Freemyer
2010-04-21  2:26       ` Mark Lord
2010-04-21  2:45         ` Eric Sandeen
2010-04-21 18:59           ` Greg Freemyer
2010-04-21 19:04             ` Ric Wheeler
2010-04-21 19:22               ` Jeff Moyer
2010-04-21 20:44                 ` Greg Freemyer
2010-04-21 20:53                   ` Greg Freemyer
2010-04-21 21:01                   ` Eric Sandeen
2010-04-21 21:03                     ` Ric Wheeler
2010-04-21 21:47                       ` Greg Freemyer
2010-04-21 21:56                         ` James Bottomley
2010-04-21 21:59                         ` Mark Lord
2010-04-23  8:23                   ` Lukas Czerner
2010-04-24 13:24                     ` Greg Freemyer
2010-04-24 13:48                       ` Ric Wheeler
2010-04-24 14:30                         ` Greg Freemyer
2010-04-24 14:43                           ` Eric Sandeen
2010-04-24 15:03                             ` Greg Freemyer
2010-04-24 17:04                               ` Ric Wheeler
2010-04-24 18:30                                 ` Greg Freemyer
2010-04-24 18:41                                   ` Ric Wheeler
2010-04-26 14:00                                     ` Mark Lord
2010-04-26 14:42                                       ` Martin K. Petersen
2010-04-26 15:27                                         ` Greg Freemyer
2010-04-26 15:51                                           ` Lukas Czerner
2010-04-28  1:25                                           ` Mark Lord
2010-04-26 15:48                                         ` Ric Wheeler
2010-04-24 19:06                                   ` Martin K. Petersen
2010-04-26 14:03                                     ` Mark Lord
2010-04-24 18:39                       ` Martin K. Petersen
2010-04-26 16:55                     ` Jan Kara
2010-04-26 17:46                       ` Lukas Czerner
2010-04-26 17:52                         ` Ric Wheeler
2010-04-26 18:14                           ` Lukas Czerner
2010-04-26 18:28                             ` Jeff Moyer
2010-04-21 20:52                 ` Greg Freemyer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).