Re: [PATCH 2/2] Add batched discard support for ext4.

linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Greg Freemyer <greg.freemyer@gmail.com>
To: Lukas Czerner <lczerner@redhat.com>
Cc: linux-ext4@vger.kernel.org, Jeff Moyer <jmoyer@redhat.com>,
	Edward Shishkin <eshishki@redhat.com>,
	Eric Sandeen <esandeen@redhat.com>,
	Ric Wheeler <rwheeler@redhat.com>,
	Mark Lord <kernel@teksavvy.com>
Subject: Re: [PATCH 2/2] Add batched discard support for ext4.
Date: Tue, 20 Apr 2010 17:21:03 -0400	[thread overview]
Message-ID: <r2k87f94c371004201421se796c0ekbae28495e62a9eb7@mail.gmail.com> (raw)
In-Reply-To: <1271674527-2977-3-git-send-email-lczerner@redhat.com>

Mark,

This is the patch implementing the new discard logic.

You did the benchmarking last year, but I thought you found calling
trim one contiguous sector range at a time was too inefficient.

See my highlight below:

On Mon, Apr 19, 2010 at 6:55 AM, Lukas Czerner <lczerner@redhat.com> wrote:
> Create an ioctl which walks through all the free extents in each
> allocating group and discard those extents. As an addition to improve
> its performance one can specify minimum free extent length, so ioctl
> will not bother with shorter extents.
>
> This of course means, that with each invocation the ioctl must walk
> through whole file system, checking and discarding free extents, which
> is not very efficient. The best way to avoid this is to keep track of
> deleted (freed) blocks. Then the ioctl have to trim just those free
> extents which were recently freed.
>
> In order to implement this I have added new bitmap into ext4_group_info
> (bb_bitmap_deleted) which stores recently freed blocks. The ioctl then
> walk through bb_bitmap_deleted, compare deleted extents with free
> extents trim them and then removes it from the bb_bitmap_deleted.
>
> But you may notice, that there is one problem. bb_bitmap_deleted does
> not survive umount. To bypass the problem the first ioctl call have to
> walk through whole file system trimming all free extents.
>
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> ---
>  fs/ext4/ext4.h    |    4 +
>  fs/ext4/mballoc.c |  207 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  fs/ext4/super.c   |    1 +
>  3 files changed, 202 insertions(+), 10 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index bf938cf..e25f672 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1437,6 +1437,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
>  extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
>  extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
>                                                ext4_group_t, int);
> +extern int ext4_trim_fs(unsigned int, struct super_block *);
> +
>  /* inode.c */
>  struct buffer_head *ext4_getblk(handle_t *, struct inode *,
>                                                ext4_lblk_t, int, int *);
> @@ -1682,6 +1684,8 @@ struct ext4_group_info {
>  #ifdef DOUBLE_CHECK
>        void            *bb_bitmap;
>  #endif
> +       void            *bb_bitmap_deleted;
> +       ext4_grpblk_t   bb_deleted;
>        struct rw_semaphore alloc_sem;
>        ext4_grpblk_t   bb_counters[];  /* Nr of free power-of-two-block
>                                         * regions, index is order.
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index bde9d0b..fbc83fe 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2255,6 +2255,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
>        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
>        init_rwsem(&meta_group_info[i]->alloc_sem);
>        meta_group_info[i]->bb_free_root = RB_ROOT;
> +       meta_group_info[i]->bb_deleted = -1;
> +
> +
>
>  #ifdef DOUBLE_CHECK
>        {
> @@ -2469,6 +2472,7 @@ int ext4_mb_release(struct super_block *sb)
>  #ifdef DOUBLE_CHECK
>                        kfree(grinfo->bb_bitmap);
>  #endif
> +                       kfree(grinfo->bb_bitmap_deleted);
>                        ext4_lock_group(sb, i);
>                        ext4_mb_cleanup_pa(grinfo);
>                        ext4_unlock_group(sb, i);
> @@ -2528,6 +2532,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
>        int err, count = 0, count2 = 0;
>        struct ext4_free_data *entry;
>        struct list_head *l, *ltmp;
> +       void *bitmap_deleted = NULL;
>
>        list_for_each_safe(l, ltmp, &txn->t_private_list) {
>                entry = list_entry(l, struct ext4_free_data, list);
> @@ -2543,6 +2548,14 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
>                /* there are blocks to put in buddy to make them really free */
>                count += entry->count;
>                count2++;
> +
> +               if (test_opt(sb, DISCARD) &&
> +                       (db->bb_bitmap_deleted == NULL) &&
> +                       (db->bb_deleted >= 0)) {
> +                       bitmap_deleted =
> +                               kmalloc(sb->s_blocksize, GFP_KERNEL);
> +               }
> +
>                ext4_lock_group(sb, entry->group);
>                /* Take it out of per group rb tree */
>                rb_erase(&entry->node, &(db->bb_free_root));
> @@ -2555,17 +2568,24 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
>                        page_cache_release(e4b.bd_buddy_page);
>                        page_cache_release(e4b.bd_bitmap_page);
>                }
> -               ext4_unlock_group(sb, entry->group);
> -               if (test_opt(sb, DISCARD)) {
> -                       ext4_fsblk_t discard_block;
> -
> -                       discard_block = entry->start_blk +
> -                               ext4_group_first_block_no(sb, entry->group);
> -                       trace_ext4_discard_blocks(sb,
> -                                       (unsigned long long)discard_block,
> -                                       entry->count);
> -                       sb_issue_discard(sb, discard_block, entry->count);
> +               if (test_opt(sb, DISCARD) && (db->bb_deleted >= 0)) {
> +                       if (db->bb_bitmap_deleted == NULL) {
> +                               db->bb_bitmap_deleted = bitmap_deleted;
> +                               BUG_ON(db->bb_bitmap_deleted == NULL);
> +
> +                               bitmap_deleted = NULL;
> +                               mb_clear_bits(db->bb_bitmap_deleted,
> +                                       0, EXT4_BLOCKS_PER_GROUP(sb));
> +                       }
> +
> +                       db->bb_deleted += entry->count;
> +                       mb_set_bits(db->bb_bitmap_deleted, entry->start_blk,
> +                                entry->count);
>                }
> +               ext4_unlock_group(sb, entry->group);
> +
> +               kfree(bitmap_deleted);
> +
>                kmem_cache_free(ext4_free_ext_cachep, entry);
>                ext4_mb_release_desc(&e4b);
>        }
> @@ -4639,3 +4659,170 @@ error_return:
>                kmem_cache_free(ext4_ac_cachep, ac);
>        return;
>  }
> +
> +/* Trim "count" blocks starting at "start" in "group"
> + * This must be called under group lock
> + */
> +void ext4_trim_extent(struct super_block *sb, int start, int count,
> +               ext4_group_t group, struct ext4_buddy *e4b)
> +{
> +       ext4_fsblk_t discard_block;
> +       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> +       struct ext4_free_extent ex;
> +
> +       assert_spin_locked(ext4_group_lock_ptr(sb, group));
> +
> +       ex.fe_start = start;
> +       ex.fe_group = group;
> +       ex.fe_len = count;
> +
> +       mb_mark_used(e4b, &ex);
> +       ext4_unlock_group(sb, group);
> +
> +       discard_block = (ext4_fsblk_t)group *
> +                       EXT4_BLOCKS_PER_GROUP(sb)
> +                       + start
> +                       + le32_to_cpu(es->s_first_data_block);
> +       trace_ext4_discard_blocks(sb,
> +                       (unsigned long long)discard_block,
> +                       count);
> +       sb_issue_discard(sb, discard_block, count);
> +
> +       ext4_lock_group(sb, group);
> +       mb_free_blocks(NULL, e4b, start, ex.fe_len);
> +}

Mark, unless I'm missing something, sb_issue_discard() above is going
to trigger a trim command for just the one range.  I thought the
benchmarks you did showed that a collection of ranges needed to be
built, then a single trim command invoked that trimmed that group of
ranges.

Greg

> +
> +/* Trim all free blocks, which have at least minlen length */
> +ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
> +               ext4_grpblk_t minblocks)
> +{
> +       void *bitmap;
> +       ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
> +       ext4_grpblk_t start, next, count = 0;
> +       ext4_group_t group;
> +
> +       BUG_ON(e4b == NULL);
> +
> +       bitmap = e4b->bd_bitmap;
> +       group = e4b->bd_group;
> +       start = 0;
> +       ext4_lock_group(sb, group);
> +
> +       while (start < max) {
> +
> +               start = mb_find_next_zero_bit(bitmap, max, start);
> +               if (start >= max)
> +                       break;
> +               next = mb_find_next_bit(bitmap, max, start);
> +
> +               if ((next - start) >= minblocks) {
> +
> +                       count += next - start;
> +                       ext4_trim_extent(sb, start,
> +                               next - start, group, e4b);
> +
> +               }
> +               start = next + 1;
> +       }
> +
> +       e4b->bd_info->bb_deleted = 0;
> +       ext4_unlock_group(sb, group);
> +
> +       return count;
> +}
> +
> +/* Trim only blocks which is free and in bb_bitmap_deleted */
> +ext4_grpblk_t ext4_trim_deleted(struct super_block *sb, struct ext4_buddy *e4b,
> +               ext4_grpblk_t minblocks)
> +{
> +       void *bitmap;
> +       struct ext4_group_info *grp;
> +       ext4_group_t group;
> +       ext4_grpblk_t max, next, count = 0, start = 0;
> +
> +       BUG_ON(e4b == NULL);
> +
> +       bitmap = e4b->bd_bitmap;
> +       group = e4b->bd_group;
> +       grp = ext4_get_group_info(sb, group);
> +
> +       if (grp->bb_deleted < minblocks)
> +               return 0;
> +
> +       ext4_lock_group(sb, group);
> +
> +       while (start < EXT4_BLOCKS_PER_GROUP(sb)) {
> +               start = mb_find_next_bit(grp->bb_bitmap_deleted,
> +                       EXT4_BLOCKS_PER_GROUP(sb), start);
> +               max = mb_find_next_zero_bit(grp->bb_bitmap_deleted,
> +                       EXT4_BLOCKS_PER_GROUP(sb), start);
> +
> +               while (start < max) {
> +                       start = mb_find_next_zero_bit(bitmap, max, start);
> +                       if (start >= max)
> +                               break;
> +                       next = mb_find_next_bit(bitmap, max, start);
> +                       if (next > max)
> +                               next = max;
> +
> +                       if ((next - start) >= minblocks) {
> +                               count += next - start;
> +
> +                               ext4_trim_extent(sb, start,
> +                                       next - start, group, e4b);
> +
> +                               mb_clear_bits(grp->bb_bitmap_deleted,
> +                                       start, next - start);
> +                       }
> +                       start = next + 1;
> +               }
> +       }
> +       grp->bb_deleted -= count;
> +
> +       ext4_unlock_group(sb, group);
> +
> +       ext4_debug("trimmed %d blocks in the group %d\n",
> +               count, group);
> +
> +       return count;
> +}
> +
> +int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
> +{
> +       struct ext4_buddy e4b;
> +       struct ext4_group_info *grp;
> +       ext4_group_t group;
> +       ext4_group_t ngroups = ext4_get_groups_count(sb);
> +       ext4_grpblk_t minblocks;
> +
> +       if (!test_opt(sb, DISCARD))
> +               return 0;
> +
> +       minblocks = DIV_ROUND_UP(minlen, sb->s_blocksize);
> +       if (unlikely(minblocks > EXT4_BLOCKS_PER_GROUP(sb)))
> +               return -EINVAL;
> +
> +       for (group = 0; group < ngroups; group++) {
> +               int err;
> +
> +               err = ext4_mb_load_buddy(sb, group, &e4b);
> +               if (err) {
> +                       ext4_error(sb, "Error in loading buddy "
> +                                       "information for %u", group);
> +                       continue;
> +               }
> +               grp = ext4_get_group_info(sb, group);
> +
> +               if (grp->bb_deleted < 0) {
> +                       /* First run after mount */
> +                       ext4_trim_all_free(sb, &e4b, minblocks);
> +               } else if (grp->bb_deleted >= minblocks) {
> +                       /* Trim only blocks deleted since first run */
> +                       ext4_trim_deleted(sb, &e4b, minblocks);
> +               }
> +
> +               ext4_mb_release_desc(&e4b);
> +       }
> +
> +       return 0;
> +}
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index e14d22c..253eb98 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1109,6 +1109,7 @@ static const struct super_operations ext4_sops = {
>        .quota_write    = ext4_quota_write,
>  #endif
>        .bdev_try_to_free_page = bdev_try_to_free_page,
> +       .trim_fs        = ext4_trim_fs
>  };
>
>  static const struct super_operations ext4_nojournal_sops = {
> --
> 1.6.6.1
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

next prev parent reply	other threads:[~2010-04-20 21:28 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-04-19 10:55 Ext4: batched discard support Lukas Czerner
2010-04-19 10:55 ` [PATCH 1/2] Add ioctl FITRIM Lukas Czerner
2010-04-19 10:55   ` [PATCH 2/2] Add batched discard support for ext4 Lukas Czerner
2010-04-20 21:21     ` Greg Freemyer [this message]
2010-04-21  2:26       ` Mark Lord
2010-04-21  2:45         ` Eric Sandeen
2010-04-21 18:59           ` Greg Freemyer
2010-04-21 19:04             ` Ric Wheeler
2010-04-21 19:22               ` Jeff Moyer
2010-04-21 20:44                 ` Greg Freemyer
2010-04-21 20:53                   ` Greg Freemyer
2010-04-21 21:01                   ` Eric Sandeen
2010-04-21 21:03                     ` Ric Wheeler
2010-04-21 21:47                       ` Greg Freemyer
2010-04-21 21:56                         ` James Bottomley
2010-04-21 21:59                         ` Mark Lord
2010-04-23  8:23                   ` Lukas Czerner
2010-04-24 13:24                     ` Greg Freemyer
2010-04-24 13:48                       ` Ric Wheeler
2010-04-24 14:30                         ` Greg Freemyer
2010-04-24 14:43                           ` Eric Sandeen
2010-04-24 15:03                             ` Greg Freemyer
2010-04-24 17:04                               ` Ric Wheeler
2010-04-24 18:30                                 ` Greg Freemyer
2010-04-24 18:41                                   ` Ric Wheeler
2010-04-26 14:00                                     ` Mark Lord
2010-04-26 14:42                                       ` Martin K. Petersen
2010-04-26 15:27                                         ` Greg Freemyer
2010-04-26 15:51                                           ` Lukas Czerner
2010-04-28  1:25                                           ` Mark Lord
2010-04-26 15:48                                         ` Ric Wheeler
2010-04-24 19:06                                   ` Martin K. Petersen
2010-04-26 14:03                                     ` Mark Lord
2010-04-24 18:39                       ` Martin K. Petersen
2010-04-26 16:55                     ` Jan Kara
2010-04-26 17:46                       ` Lukas Czerner
2010-04-26 17:52                         ` Ric Wheeler
2010-04-26 18:14                           ` Lukas Czerner
2010-04-26 18:28                             ` Jeff Moyer
2010-04-26 18:38                               ` [PATCH 2/2] Add batched discard support for ext4 - using rbtree Lukas Czerner
2010-04-26 18:42                                 ` Lukas Czerner
2010-04-27 15:29                                   ` Edward Shishkin
2010-04-21 20:52                 ` [PATCH 2/2] Add batched discard support for ext4 Greg Freemyer
2010-04-19 16:20 ` Ext4: batched discard support Greg Freemyer
2010-04-19 16:30   ` Eric Sandeen
2010-04-19 17:58     ` Greg Freemyer
2010-04-19 18:04       ` Ric Wheeler
2010-04-20 20:24   ` Mark Lord
2010-04-20 20:34     ` Mark Lord
  -- strict thread matches above, loose matches on Subject: below --
2010-07-07  7:53 Ext4: batched discard support - simplified version Lukas Czerner
2010-07-07  7:53 ` [PATCH 2/2] Add batched discard support for ext4 Lukas Czerner
2010-07-14  8:33   ` Dmitry Monakhov
2010-07-14  9:40     ` Lukas Czerner
2010-07-14 10:03       ` Dmitry Monakhov
2010-07-14 11:43         ` Lukas Czerner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=r2k87f94c371004201421se796c0ekbae28495e62a9eb7@mail.gmail.com \
    --to=greg.freemyer@gmail.com \
    --cc=esandeen@redhat.com \
    --cc=eshishki@redhat.com \
    --cc=jmoyer@redhat.com \
    --cc=kernel@teksavvy.com \
    --cc=lczerner@redhat.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=rwheeler@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).