From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org
Cc: Justin Forbes <jmforbes@linuxtx.org>,
Zwane Mwaikambo <zwane@arm.linux.org.uk>,
Theodore Ts'o <tytso@mit.edu>,
Randy Dunlap <rdunlap@xenotime.net>,
Dave Jones <davej@redhat.com>,
Chuck Wolber <chuckw@quantumlinux.com>,
Chris Wedgwood <reviews@ml.cw.f00f.org>,
Michael Krufky <mkrufky@linuxtv.org>,
Chuck Ebbert <cebbert@redhat.com>,
Domenico Andreoli <cavokz@gmail.com>, Willy Tarreau <w@1wt.eu>,
Rodrigo Rubira Branco <rbranco@la.checkpoint.com>,
Jake Edge <jake@lwn.net>, Eugene Teo <eteo@redhat.com>,
torvalds@linux-foundation.org, akpm@linux-foundation.org,
alan@lxorguk.ukuu.org.uk, linux-ext4@vger.kernel.org,
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Subject: [patch 23/36] ext4: Use an rbtree for tracking blocks freed during transaction.
Date: Wed, 18 Feb 2009 14:29:39 -0800 [thread overview]
Message-ID: <20090218222939.GX10668@kroah.com> (raw)
In-Reply-To: <20090218222841.GA10668@kroah.com>
[-- Attachment #1: ext4-use-an-rbtree-for-tracking-blocks-freed-during-transaction.patch --]
[-- Type: text/plain, Size: 10278 bytes --]
2.6.27-stable review patch. If anyone has any objections, please let us know.
------------------
From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
(cherry picked from commit c894058d66637c7720569fbe12957f4de64d9991 to allow
commit e21675d4 to be included in 2.6.27.y)
With this patch we track the blocks freed during a transaction using a
red-black tree. We also make sure that contiguous freed blocks are collected
in a single node of the tree.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
fs/ext4/mballoc.c | 186 ++++++++++++++++++++++++++++++++++--------------------
fs/ext4/mballoc.h | 25 ++++---
2 files changed, 134 insertions(+), 77 deletions(-)
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -332,6 +332,7 @@
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
@@ -2506,6 +2507,7 @@ int ext4_mb_add_groupinfo(struct super_b
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
+ meta_group_info[i]->bb_free_root.rb_node = NULL;;
#ifdef DOUBLE_CHECK
{
@@ -2819,13 +2821,11 @@ int ext4_mb_release(struct super_block *
static noinline_for_stack void
ext4_mb_free_committed_blocks(struct super_block *sb)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
- int i;
- int count = 0;
- int count2 = 0;
- struct ext4_free_metadata *md;
struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err, count = 0, count2 = 0;
+ struct ext4_free_data *entry;
if (list_empty(&sbi->s_committed_transaction))
return;
@@ -2833,44 +2833,46 @@ ext4_mb_free_committed_blocks(struct sup
/* there is committed blocks to be freed yet */
do {
/* get next array of blocks */
- md = NULL;
+ entry = NULL;
spin_lock(&sbi->s_md_lock);
if (!list_empty(&sbi->s_committed_transaction)) {
- md = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_metadata, list);
- list_del(&md->list);
+ entry = list_entry(sbi->s_committed_transaction.next,
+ struct ext4_free_data, list);
+ list_del(&entry->list);
}
spin_unlock(&sbi->s_md_lock);
- if (md == NULL)
+ if (entry == NULL)
break;
mb_debug("gonna free %u blocks in group %lu (0x%p):",
- md->num, md->group, md);
+ entry->count, entry->group, entry);
- err = ext4_mb_load_buddy(sb, md->group, &e4b);
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
+ db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
- count += md->num;
+ count += entry->count;
count2++;
- ext4_lock_group(sb, md->group);
- for (i = 0; i < md->num; i++) {
- mb_debug(" %u", md->blocks[i]);
- mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- }
- mb_debug("\n");
- ext4_unlock_group(sb, md->group);
-
- /* balance refcounts from ext4_mb_free_metadata() */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
+ ext4_lock_group(sb, entry->group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
+ }
+ ext4_unlock_group(sb, entry->group);
- kfree(md);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
-
- } while (md);
+ } while (1);
mb_debug("freed %u blocks in %u structures\n", count, count2);
}
@@ -3025,6 +3027,16 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
+
+ ext4_free_ext_cachep =
+ kmem_cache_create("ext4_free_block_extents",
+ sizeof(struct ext4_free_data),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ if (ext4_free_ext_cachep == NULL) {
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ return -ENOMEM;
+ }
#ifdef CONFIG_PROC_FS
proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
if (proc_root_ext4 == NULL)
@@ -3041,6 +3053,7 @@ void exit_ext4_mballoc(void)
#ifdef CONFIG_PROC_FS
remove_proc_entry("fs/ext4", NULL);
#endif
+ kmem_cache_destroy(ext4_free_ext_cachep);
}
@@ -3561,6 +3574,7 @@ ext4_mb_use_preallocated(struct ext4_all
ac->ac_criteria = 20;
return 1;
}
+
return 0;
}
@@ -4678,6 +4692,21 @@ static void ext4_mb_poll_new_transaction
ext4_mb_free_committed_blocks(sb);
}
+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ if ((entry1->t_tid == entry2->t_tid) &&
+ (entry1->group == entry2->group) &&
+ ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ return 1;
+ return 0;
+}
+
static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_t group, ext4_grpblk_t block, int count)
@@ -4685,57 +4714,80 @@ ext4_mb_free_metadata(handle_t *handle,
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_metadata *md;
- int i;
+ struct ext4_free_data *entry, *new_entry;
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = block;
+ new_entry->group = group;
+ new_entry->count = count;
+ new_entry->t_tid = handle->h_transaction->t_tid;
+ new_node = &new_entry->node;
+
ext4_lock_group(sb, group);
- for (i = 0; i < count; i++) {
- md = db->bb_md_cur;
- if (md && db->bb_tid != handle->h_transaction->t_tid) {
- db->bb_md_cur = NULL;
- md = NULL;
+ if (!*n) {
+ /* first free block exent. We need to
+ protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, node);
+ if (block < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (block >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ ext4_error(sb, __func__,
+ "Double free of blocks %d (%d %d)\n",
+ block, entry->start_blk, entry->count);
+ return 0;
}
+ }
- if (md == NULL) {
- ext4_unlock_group(sb, group);
- md = kmalloc(sizeof(*md), GFP_NOFS);
- if (md == NULL)
- return -ENOMEM;
- md->num = 0;
- md->group = group;
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);
- ext4_lock_group(sb, group);
- if (db->bb_md_cur == NULL) {
- spin_lock(&sbi->s_md_lock);
- list_add(&md->list, &sbi->s_active_transaction);
- spin_unlock(&sbi->s_md_lock);
- /* protect buddy cache from being freed,
- * otherwise we'll refresh it from
- * on-disk bitmap and lose not-yet-available
- * blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
- db->bb_md_cur = md;
- db->bb_tid = handle->h_transaction->t_tid;
- mb_debug("new md 0x%p for group %lu\n",
- md, md->group);
- } else {
- kfree(md);
- md = db->bb_md_cur;
- }
+ /* Now try to see the extent can be merged to left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(entry, new_entry)) {
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
+ }
- BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- md->blocks[md->num] = block + i;
- md->num++;
- if (md->num == EXT4_BB_MAX_BLOCKS) {
- /* no more space, put full container on a sb's list */
- db->bb_md_cur = NULL;
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(new_entry, entry)) {
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
+ /* Add the extent to active_transaction list */
+ spin_lock(&sbi->s_md_lock);
+ list_add(&new_entry->list, &sbi->s_active_transaction);
+ spin_unlock(&sbi->s_md_lock);
ext4_unlock_group(sb, group);
return 0;
}
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -97,22 +97,27 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512
-#ifdef EXT4_BB_MAX_BLOCKS
-#undef EXT4_BB_MAX_BLOCKS
-#endif
-#define EXT4_BB_MAX_BLOCKS 30
+struct ext4_free_data {
+ /* this links the free block information from group_info */
+ struct rb_node node;
-struct ext4_free_metadata {
- ext4_group_t group;
- unsigned short num;
- ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+ /* this links the free block information from ext4_sb_info */
struct list_head list;
+
+ /* group which free block extent belongs */
+ ext4_group_t group;
+
+ /* free block extent */
+ ext4_grpblk_t start_blk;
+ ext4_grpblk_t count;
+
+ /* transaction which freed this extent */
+ tid_t t_tid;
};
struct ext4_group_info {
unsigned long bb_state;
- unsigned long bb_tid;
- struct ext4_free_metadata *bb_md_cur;
+ struct rb_root bb_free_root;
unsigned short bb_first_free;
unsigned short bb_free;
unsigned short bb_fragments;
next prev parent reply other threads:[~2009-02-18 22:32 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20090218222447.432108614@mini.kroah.org>
2009-02-18 22:28 ` [patch 00/36] 2.6.27.19-stable review Greg KH
2009-02-18 22:28 ` [patch 01/36] pid: implement ns_of_pid Greg KH
2009-02-18 22:28 ` [patch 02/36] mqueue: fix si_pid value in mqueue do_notify() Greg KH
2009-02-18 22:29 ` [patch 03/36] Fix Intel IOMMU write-buffer flushing Greg KH
2009-02-18 23:02 ` Thomas Backlund
2009-02-19 23:59 ` Thomas Backlund
2009-02-20 3:35 ` Greg KH
2009-02-20 13:57 ` Thomas Backlund
2009-02-21 10:34 ` David Woodhouse
2009-02-18 22:29 ` [patch 04/36] powerpc/vsx: Fix VSX alignment handler for regs 32-63 Greg KH
2009-02-18 22:29 ` [patch 05/36] sata_nv: give up hardreset on nf2 Greg KH
2009-02-18 22:29 ` [patch 06/36] 3c505: do not set pcb->data.raw beyond its size Greg KH
2009-02-18 22:29 ` [patch 07/36] Add support for VT6415 PCIE PATA IDE Host Controller Greg KH
2009-02-18 22:29 ` [patch 08/36] Bluetooth: Fix TX error path in btsdio driver Greg KH
2009-02-18 22:29 ` [patch 09/36] btsdio: free sk_buff with kfree_skb Greg KH
2009-02-18 22:29 ` [patch 10/36] ext2/xip: refuse to change xip flag during remount with busy inodes Greg KH
2009-02-18 22:29 ` [patch 11/36] SCSI: libiscsi: fix iscsi pool leak Greg KH
2009-02-18 22:29 ` [patch 12/36] x86/cpa: make sure cpa is safe to call in lazy mmu mode Greg KH
2009-02-18 22:29 ` [patch 13/36] ext4: Add support for non-native signed/unsigned htree hash algorithms Greg KH
2009-02-18 22:29 ` [patch 14/36] ext4: tone down ext4_da_writepages warnings Greg KH
2009-02-18 22:29 ` [patch 15/36] ext4: Fix the delalloc writepages to allocate blocks at the right offset Greg KH
2009-02-18 22:29 ` [patch 16/36] ext4: avoid ext4_error when mounting a fs with a single bg Greg KH
2009-02-18 22:29 ` Greg KH
2009-02-18 22:29 ` [patch 17/36] ext4: Widen type of ext4_sb_info.s_mb_maxs[] Greg KH
2009-02-18 22:29 ` [patch 18/36] jbd2: Add barrier not supported test to journal_wait_on_commit_record Greg KH
2009-02-18 22:29 ` [patch 19/36] ext4: Dont overwrite allocation_context ac_status Greg KH
2009-02-18 22:29 ` [patch 20/36] ext4: Add blocks added during resize to bitmap Greg KH
2009-02-18 22:29 ` Greg KH
2009-02-18 22:29 ` [patch 21/36] ext4: Use EXT4_GROUP_INFO_NEED_INIT_BIT during resize Greg KH
2009-02-18 22:29 ` Greg KH
2009-02-18 22:29 ` [patch 22/36] ext4: cleanup mballoc header files Greg KH
2009-02-18 22:29 ` Greg KH
2009-02-18 22:29 ` Greg KH [this message]
2009-02-18 22:29 ` [patch 24/36] ext4: dont use blocks freed but not yet committed in buddy cache init Greg KH
2009-02-18 22:29 ` [patch 25/36] ext4: Fix race between read_block_bitmap() and mark_diskspace_used() Greg KH
2009-02-18 22:29 ` Greg KH
2009-02-18 22:29 ` [patch 26/36] ext4: Fix the race between read_inode_bitmap() and ext4_new_inode() Greg KH
2009-02-18 22:29 ` Greg KH
2009-02-18 22:29 ` [patch 27/36] jbd2: Add BH_JBDPrivateStart Greg KH
2009-02-18 22:29 ` [patch 28/36] ext4: Use new buffer_head flag to check uninit group bitmaps initialization Greg KH
2009-02-18 22:29 ` [patch 29/36] ext4: mark the blocks/inode bitmap beyond end of group as used Greg KH
2009-02-18 22:29 ` Greg KH
2009-02-18 22:29 ` [patch 30/36] ext4: Dont allow new groups to be added during block allocation Greg KH
2009-02-18 22:29 ` [patch 31/36] ext4: Init the complete page while building buddy cache Greg KH
2009-02-18 22:29 ` [patch 32/36] ext4: Add sanity checks for the superblock before mounting the filesystem Greg KH
2009-02-18 22:29 ` [patch 33/36] ext4: only use i_size_high for regular files Greg KH
2009-02-18 22:29 ` [patch 34/36] ext4: Add sanity check to make_indexed_dir Greg KH
2009-02-18 22:30 ` [patch 35/36] jbd2: On a __journal_expect() assertion failure printk "JBD2", not "EXT3-fs" Greg KH
2009-02-18 22:30 ` [patch 36/36] ext4: Initialize the new group descriptor when resizing the filesystem Greg KH
2009-02-19 20:48 ` [patch 00/36] 2.6.27.19-stable review Jörg-Volker Peetz
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090218222939.GX10668@kroah.com \
--to=gregkh@suse.de \
--cc=akpm@linux-foundation.org \
--cc=alan@lxorguk.ukuu.org.uk \
--cc=aneesh.kumar@linux.vnet.ibm.com \
--cc=cavokz@gmail.com \
--cc=cebbert@redhat.com \
--cc=chuckw@quantumlinux.com \
--cc=davej@redhat.com \
--cc=eteo@redhat.com \
--cc=jake@lwn.net \
--cc=jmforbes@linuxtx.org \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mkrufky@linuxtv.org \
--cc=rbranco@la.checkpoint.com \
--cc=rdunlap@xenotime.net \
--cc=reviews@ml.cw.f00f.org \
--cc=stable@kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=tytso@mit.edu \
--cc=w@1wt.eu \
--cc=zwane@arm.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.