linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] EXT3 extents against 2.6.0-test7
@ 2003-10-13 18:27 Alex Tomas
  2003-10-13 21:39 ` Ed Sweetman
  0 siblings, 1 reply; 9+ messages in thread
From: Alex Tomas @ 2003-10-13 18:27 UTC (permalink / raw)
  To: linux-kernel; +Cc: Stephen Hemminger, Alex Tomas


changes since last publication:
1) few bugs fixed: they caused extents tree corruption
2) several asserts added
3) binary search used to find an extent
4) last found entry is cached: this allows to skip tree
   traversal and saves cpu a bit
5) truncate_sem is used to serialize get_block()/trucate()


with best wishes, Alex




%patch
Index: linux-2.6.0-test7/fs/ext3/extents.c
===================================================================
--- linux-2.6.0-test7.orig/fs/ext3/extents.c	2003-10-11 15:59:36.000000000 +0400
+++ linux-2.6.0-test7/fs/ext3/extents.c	2003-10-11 16:13:43.000000000 +0400
@@ -0,0 +1,1766 @@
+/*
+ *
+ * linux/fs/ext3/extents.c
+ *
+ * Extents support for EXT3
+ *
+ * 07/08/2003    Alex Tomas <alex@clusterfs.com>
+ * 
+ * TODO:
+ *   - ext3*_error() should be used in some situations
+ *   - find_goal() [to be tested and improved]
+ *   - tree reduction
+ *   - arch-independent
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/ext3_jbd.h>
+#include <linux/jbd.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+
+/*
+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks
+ * become very little, so index split, in-depth growing and
+ * other hard changes happens much more often
+ * this is for debug purposes only
+ */
+#define AGRESSIVE_TEST_
+
+/*
+ * if CHECK_BINSEARCH defined, then results of binary search
+ * will be checked by linear search
+ */
+#define CHECK_BINSEARCH_
+
+/*
+ * if EXT_DEBUG defined you can use 'extdebug' mount option
+ * to get lots of info what's going on
+ */
+#define EXT_DEBUG
+#ifdef EXT_DEBUG
+#define ext_debug(inode,fmt,a...) 		\
+do {						\
+	if (test_opt((inode)->i_sb, EXTDEBUG))	\
+		printk(fmt, ##a);		\
+} while (0);
+#else
+#define ext_debug(inode,fmt,a...)
+#endif
+
+#define EXT3_ALLOC_NEEDED	2	/* block bitmap + group descriptor */
+
+/*
+ * ext3_inode has i_block array (total 60 bytes)
+ * first 4 bytes are used to store:
+ *  - tree depth (0 mean there is no tree yet. all extents in the inode)
+ *  - number of alive extents in the inode
+ */
+
+/*
+ * this is extent on-disk structure
+ * it's used at the bottom of the tree
+ */
+struct ext3_extent {
+	__u32	e_block;	/* first logical block extent covers */
+	__u32	e_start;	/* first physical block extents lives */
+	__u32	e_num;		/* number of blocks covered by extent */
+};
+
+/*
+ * this is index on-disk structure
+ * it's used at all the levels, but the bottom
+ */
+struct ext3_extent_idx {
+	__u32	e_block;	/* index covers logical blocks from 'block' */
+	__u32	e_leaf;		/* pointer to the physical block of the next *
+				 * level. leaf or next index could bet here */
+};
+
+/*
+ * each block (leaves and indexes), even inode-stored has header
+ */
+struct ext3_extent_header {	
+	__u16	e_num;		/* number of valid entries */
+	__u16	e_max;		/* capacity of store in entries */
+};
+
+/*
+ * array of ext3_ext_path contains path to some extent
+ * creation/lookup routines use it for traversal/splitting/etc
+ * truncate uses it to simulate recursive walking
+ */
+struct ext3_ext_path {
+	__u32				p_block;
+	__u16				p_depth;
+	struct ext3_extent		*p_ext;
+	struct ext3_extent_idx		*p_idx;
+	struct ext3_extent_header	*p_hdr;
+	struct buffer_head		*p_bh;
+};
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+	((struct ext3_extent *) (((char *) (__hdr__)) +		\
+				 sizeof(struct ext3_extent_header)))
+#define EXT_FIRST_INDEX(__hdr__) \
+	((struct ext3_extent_idx *) (((char *) (__hdr__)) +	\
+				     sizeof(struct ext3_extent_header)))
+#define EXT_HAS_FREE_INDEX(__path__) \
+	((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max)
+#define EXT_LAST_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1)
+#define EXT_LAST_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1)
+
+
+#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ */
+static int ext3_ext_get_access(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	if (path->p_bh) {
+		/* path points to block */
+		return ext3_journal_get_write_access(handle, path->p_bh);
+	}
+
+	/* path points to leaf/index in inode body */
+	return 0;
+}
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ *  - EIO
+ */
+static int ext3_ext_dirty(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	if (path->p_bh) {
+		/* path points to block */
+		return ext3_journal_dirty_metadata(handle, path->p_bh);
+	}
+
+	/* path points to leaf/index in inode body */
+	return ext3_mark_inode_dirty(handle, inode);
+}
+
+static inline int ext3_ext_space_block(struct inode *inode)
+{
+	int size;
+
+	size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header))
+		/ sizeof(struct ext3_extent);
+#ifdef AGRESSIVE_TEST
+	size = 6; /* FIXME: for debug, remove this line */
+#endif
+	return size;
+}
+
+static inline int ext3_ext_space_inode(struct inode *inode)
+{
+	int size;
+
+	size = (sizeof(EXT3_I(inode)->i_data) -
+			sizeof(struct ext3_extent_header))
+			/ sizeof(struct ext3_extent);
+#ifdef AGRESSIVE_TEST
+	size = 3; /* FIXME: for debug, remove this line */
+#endif
+	return size;
+}
+
+static inline int ext3_ext_space_inode_idx(struct inode *inode)
+{
+	int size;
+
+	size = (sizeof(EXT3_I(inode)->i_data) -
+			sizeof(struct ext3_extent_header))
+			/ sizeof(struct ext3_extent_idx);
+#ifdef AGRESSIVE_TEST
+	size = 4; /* FIXME: for debug, remove this line */
+#endif
+	return size;
+}
+
+static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path)
+{
+#ifdef EXT_DEBUG
+	int k, l = path->p_depth;
+
+	ext_debug(inode, "path:");
+	for (k = 0; k <= l; k++, path++) {
+		if (path->p_idx) {
+			ext_debug(inode, "  %d->%d", path->p_idx->e_block,
+					path->p_idx->e_leaf);
+		} else if (path->p_ext) {
+			ext_debug(inode, "  %d:%d:%d",
+					path->p_ext->e_block,
+					path->p_ext->e_start,
+					path->p_ext->e_num);
+		} else
+			ext_debug(inode, "  []");
+	}
+	ext_debug(inode, "\n");
+#endif
+}
+
+static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path)
+{
+#ifdef EXT_DEBUG
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_extent_header *eh;
+	struct ext3_extent *ex;
+	int i;
+
+	if (!path)
+		return;
+
+	eh = path[depth].p_hdr;
+	ex = EXT_FIRST_EXTENT(eh);
+
+	for (i = 0; i < eh->e_num; i++, ex++) {
+		ext_debug(inode, "%d:%d:%d ",
+				ex->e_block, ex->e_start, ex->e_num);
+	}
+	ext_debug(inode, "\n");
+#endif
+}
+
+static void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path)
+{
+	int depth = path->p_depth;
+	int i;
+
+	for (i = 0; i <= depth; i++, path++)
+		if (path->p_bh) {
+			brelse(path->p_bh);
+			path->p_bh = NULL;
+		}
+}
+
+static inline void ext3_ext_invalidate_cache(struct inode *inode)
+{
+	EXT3_I(inode)->i_ce_num = 0;
+}
+
+static inline void ext3_ext_put_in_cache(struct inode *inode,
+						struct ext3_extent *ex)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+
+	EXT_ASSERT(ex);
+	EXT_ASSERT(ex->e_num);
+	
+	ei->i_ce_block = ex->e_block;
+	ei->i_ce_start = ex->e_start;
+	ei->i_ce_num = ex->e_num;
+}
+
+static inline __u32 ext3_ext_is_in_cache(struct inode *inode, __u32 block)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+
+	if (ei->i_ce_num == 0) {
+		/* cache is empty */
+		return 0;
+	}
+
+	if (block >= ei->i_ce_block && block < ei->i_ce_block + ei->i_ce_num)
+		return block - ei->i_ce_block + ei->i_ce_start;
+
+	/* not in cache */
+	return 0;
+}
+
+static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	unsigned long bg_start;
+	unsigned long colour;
+	int depth;
+	
+	if (path) {
+		depth = path->p_depth;
+		/* try to find previous block */
+		if (path[depth].p_ext)
+			return path[depth].p_ext->e_start +
+				path[depth].p_ext->e_num - 1;
+		
+		/* it looks index is empty
+		 * try to find starting from index itself */
+		if (path[depth].p_bh)
+			return path[depth].p_bh->b_blocknr;
+	}
+
+	/* OK. use inode's group */
+	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	colour = (current->pid % 16) *
+			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	return bg_start + colour;
+}
+
+/*
+ * binary search for closest index by given block
+ */
+static inline void ext3_ext_binsearch_idx(struct inode *inode,
+					struct ext3_ext_path *path,
+					int block)
+{
+	struct ext3_extent_header *eh = path->p_hdr;
+	struct ext3_extent_idx *ix;
+	int l = 0, k, r;
+
+	EXT_ASSERT(eh->e_num <= eh->e_max);
+	EXT_ASSERT(eh->e_num > 0);
+
+	ext_debug(inode, "binsearch for %d(idx):  ", block);
+
+	path->p_idx = ix = EXT_FIRST_INDEX(eh);
+
+	r = k = eh->e_num;
+	while (k > 1) {
+		k = (r - l) / 2;
+		if (block < ix[l + k].e_block)
+			r -= k;
+		else
+			l += k;
+		ext_debug(inode, "%d:%d:%d ", k, l, r);
+	}
+
+	ix += l;
+	path->p_idx = ix;
+	ext_debug(inode, "  -> %d->%d ", path->p_idx->e_block,
+			path->p_idx->e_leaf);
+
+	while (l++ < r) {
+		if (block < ix->e_block) 
+			break;
+		path->p_idx = ix++;
+	}
+	ext_debug(inode, "  -> %d->%d\n", path->p_idx->e_block,
+			path->p_idx->e_leaf);
+
+#ifdef CHECK_BINSEARCH 
+	{
+		struct ext3_extent_idx *chix;
+
+		chix = ix = EXT_FIRST_INDEX(eh);
+		for (k = 0; k < eh->e_num; k++, ix++) {
+			if (block < ix->e_block) 
+				break;
+			chix = ix;
+		}
+		EXT_ASSERT(chix == path->p_idx);
+	}
+#endif
+
+}
+
+/*
+ * binary search for closest extent by given block
+ */
+static inline void ext3_ext_binsearch(struct inode *inode,
+					struct ext3_ext_path *path,
+					int block)
+{
+	struct ext3_extent_header *eh = path->p_hdr;
+	struct ext3_extent *ex;
+	int l = 0, k, r;
+
+	EXT_ASSERT(eh->e_num <= eh->e_max);
+
+	if (eh->e_num == 0) {
+		/*
+		 * this leaf is empty yet:
+		 *  we get such a leaf in split/add case
+		 */
+		return;
+	}
+	
+	ext_debug(inode, "binsearch for %d:  ", block);
+
+	path->p_ext = ex = EXT_FIRST_EXTENT(eh);
+
+	r = k = eh->e_num;
+	while (k > 1) {
+		k = (r - l) / 2;
+		if (block < ex[l + k].e_block)
+			r -= k;
+		else
+			l += k;
+		ext_debug(inode, "%d:%d:%d ", k, l, r);
+	}
+
+	ex += l;
+	path->p_ext = ex;
+	ext_debug(inode, "  -> %d:%d:%d ", path->p_ext->e_block,
+			path->p_ext->e_start, path->p_ext->e_num);
+
+	while (l++ < r) {
+		EXT_ASSERT(ex->e_num < EXT3_BLOCKS_PER_GROUP(inode->i_sb));
+		if (block < ex->e_block) 
+			break;
+		path->p_ext = ex++;
+	}
+	ext_debug(inode, "  -> %d:%d:%d\n", path->p_ext->e_block,
+			path->p_ext->e_start, path->p_ext->e_num);
+
+#ifdef CHECK_BINSEARCH 
+	{
+		struct ext3_extent *chex;
+
+		chex = ex = EXT_FIRST_EXTENT(eh);
+		for (k = 0; k < eh->e_num; k++, ex++) {
+			if (block < ex->e_block) 
+				break;
+			chex = ex;
+		}
+		EXT_ASSERT(chex == path->p_ext);
+	}
+#endif
+
+}
+
+static struct ext3_ext_path *
+ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_extent_header *eh = (void *) ei->i_data;
+	struct buffer_head *bh;
+	int depth, i, ppos = 0;
+	
+	eh = (struct ext3_extent_header *) ei->i_data;
+
+	/* initialize capacity of leaf in inode for first time */
+	if (eh->e_max == 0)
+		eh->e_max = ext3_ext_space_inode(inode);
+	i = depth = ei->i_depth;
+	EXT_ASSERT(i == 0 || eh->e_num > 0);
+	
+	/* account possible depth increase */
+	if (!path) {
+		path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
+				GFP_NOFS);
+		if (!path)
+			return ERR_PTR(-ENOMEM);
+	}
+	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
+	path[0].p_hdr = eh;
+
+	/* walk through the tree */
+	while (i) {
+		ext_debug(inode, "depth %d: num %d, max %d\n",
+				ppos, eh->e_num, eh->e_max);
+		ext3_ext_binsearch_idx(inode, path + ppos, block);
+		path[ppos].p_block = path[ppos].p_idx->e_leaf;
+		path[ppos].p_depth = i;
+		path[ppos].p_ext = NULL;
+
+		bh = sb_bread(inode->i_sb, path[ppos].p_block);
+		if (!bh) {
+			ext3_ext_drop_refs(inode, path);
+			kfree(path);
+			return ERR_PTR(-EIO);
+		}
+		eh = (struct ext3_extent_header *) bh->b_data;
+		ppos++;
+		EXT_ASSERT(ppos <= depth);
+		path[ppos].p_bh = bh;
+		path[ppos].p_hdr = eh;
+		i--;
+	}
+
+	path[ppos].p_depth = i;
+	path[ppos].p_hdr = eh;
+	path[ppos].p_ext = NULL;
+	
+	/* find extent */
+	ext3_ext_binsearch(inode, path + ppos, block);
+
+	ext3_ext_show_path(inode, path);
+
+	return path;
+}
+
+static void ext3_ext_check_boundary(struct inode *inode,
+					struct ext3_ext_path *curp,
+					void *addr, int len)
+{
+	void *end;
+
+	if (!len)
+		return;
+	if (curp->p_bh)
+		end = (void *) curp->p_hdr + inode->i_sb->s_blocksize;
+	else
+		end = (void *) curp->p_hdr + sizeof(EXT3_I(inode)->i_data);
+	if (((unsigned long) addr) + len > (unsigned long) end) {
+		printk("overflow! 0x%p > 0x%p\n", addr + len, end);
+		BUG();
+	}
+	if ((unsigned long) addr < (unsigned long) curp->p_hdr) {
+		printk("underflow! 0x%p < 0x%p\n", addr, curp->p_hdr);
+		BUG();
+	}
+}
+
+/*
+ * insert new index [logical;ptr] into the block at cupr
+ * it check where to insert: before curp or after curp
+ */
+static int ext3_ext_insert_index(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *curp, int logical,
+				int ptr)
+{
+	struct ext3_extent_idx *ix;
+	int len, err;
+
+	if ((err = ext3_ext_get_access(handle, inode, curp)))
+		return err;
+
+	EXT_ASSERT(logical != curp->p_idx->e_block);
+	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
+	if (logical > curp->p_idx->e_block) {
+		/* insert after */
+		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
+			len = (len - 1) * sizeof(struct ext3_extent_idx);
+			len = len < 0 ? 0 : len;
+			ext_debug(inode, "insert new index %d after: %d. "
+					"move %d from 0x%p to 0x%p\n",
+					logical, ptr, len,
+					(curp->p_idx + 1), (curp->p_idx + 2));
+
+			ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len);
+			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
+		}
+		ix = curp->p_idx + 1;
+	} else {
+		/* insert before */
+		len = len * sizeof(struct ext3_extent_idx);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert new index %d before: %d. "
+				"move %d from 0x%p to 0x%p\n",
+				logical, ptr, len,
+				curp->p_idx, (curp->p_idx + 1));
+
+		ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len);
+		memmove(curp->p_idx + 1, curp->p_idx, len);
+		ix = curp->p_idx;
+	}
+
+	ix->e_block = logical;
+	ix->e_leaf = ptr;
+	curp->p_hdr->e_num++;
+
+	err = ext3_ext_dirty(handle, inode, curp);
+	ext3_std_error(inode->i_sb, err);
+
+	return err;
+}
+
+/*
+ * routine inserts new subtree into the path, using free index entry
+ * at depth 'at:
+ *  - allocates all needed blocks (new leaf and all intermediate index blocks)
+ *  - makes decision where to split
+ *  - moves remaining extens and index entries (right to the split point)
+ *    into the newly allocated blocks
+ *  - initialize subtree
+ */
+static int ext3_ext_split(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path,
+				struct ext3_extent *newext, int at)
+{
+	struct buffer_head *bh = NULL;
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_extent_header *neh;
+	struct ext3_extent_idx *fidx;
+	struct ext3_extent *ex;
+	int i = at, k, m, a;
+	sector_t newblock, oldblock, border;
+	int *ablocks = NULL; /* array of allocated blocks */
+	int err = 0;
+
+	/* make decision: where to split? */
+	/* FIXME: now desicion is simplest: at current extent */
+
+	/* if current leaf will be splitted, then we should use 
+	 * border from split point */
+	
+	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		border = path[depth].p_ext[1].e_block;
+		ext_debug(inode, "leaf will be splitted."
+				" next leaf starts at %d\n",
+				(int)border);
+	} else {
+		border = newext->e_block;
+		ext_debug(inode, "leaf will be added."
+				" next leaf starts at %d\n",
+				(int)border);
+	}
+
+	/* 
+	 * if error occurs, then we break processing
+	 * and turn filesystem read-only. so, index won't
+	 * be inserted and tree will be in consistent
+	 * state. next mount will repair buffers too
+	 */
+
+	/*
+	 * get array to track all allocated blocks
+	 * we need this to handle errors and free blocks
+	 * upon them
+	 */
+	ablocks = kmalloc(sizeof(sector_t) * depth, GFP_NOFS);
+	if (!ablocks)
+		return -ENOMEM;
+	memset(ablocks, 0, sizeof(sector_t) * depth);
+
+	/* allocate all needed blocks */
+	ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
+	newblock = 0; /* FIXME: something more sophisticated needed here */ 
+	for (a = 0; newext->e_num > 0 && a < depth - at; a++) {
+		newblock = ablocks[a] = newext->e_start++;
+		newext->e_num--;
+	}
+	for (; a < depth - at; a++) {
+		newblock = ext3_new_block(handle, inode,
+						newblock + 1, 0, 0, &err);
+		if (newblock == 0)
+			goto cleanup;
+		ablocks[a] = newblock;
+	}
+
+	/* initialize new leaf */
+	newblock = ablocks[--a];
+	EXT_ASSERT(newblock);
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (!bh) {
+		err = -EIO;
+		goto cleanup;
+	}
+	lock_buffer(bh);
+
+	if ((err = ext3_journal_get_create_access(handle, bh)))
+		goto cleanup;
+
+	neh = (struct ext3_extent_header *) bh->b_data;
+	neh->e_num = 0;
+	neh->e_max = ext3_ext_space_block(inode);
+	ex = EXT_FIRST_EXTENT(neh);
+
+	/* move remain of path[depth] to the new leaf */
+	EXT_ASSERT(path[depth].p_hdr->e_num ==
+			path[depth].p_hdr->e_max);
+	/* start copy from next extent */
+	/* TODO: we could do it by single memmove */
+	m = 0;
+	path[depth].p_ext++;
+	while (path[depth].p_ext <=
+			EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug(inode, "move %d:%d:%d in new leaf\n",
+				path[depth].p_ext->e_block,
+				path[depth].p_ext->e_start,
+				path[depth].p_ext->e_num);
+		memmove(ex++, path[depth].p_ext++,
+				sizeof(struct ext3_extent));
+		neh->e_num++;
+		m++;
+	}
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	if ((err = ext3_journal_dirty_metadata(handle, bh)))
+		goto cleanup;	
+	brelse(bh);
+	bh = NULL;
+
+	/* correct old leaf */
+	if (m) {
+		if ((err = ext3_ext_get_access(handle, inode, path)))
+			goto cleanup;
+		path[depth].p_hdr->e_num -= m;
+		if ((err = ext3_ext_dirty(handle, inode, path)))
+			goto cleanup;
+		
+	}
+
+	/* create intermediate indexes */
+	k = depth - at - 1;
+	EXT_ASSERT(k >= 0);
+	if (k)
+		ext_debug(inode,
+				"create %d intermediate indices\n", k);
+	/* insert new index into current index block */
+	/* current depth stored in i var */
+	i = depth - 1;
+	while (k--) {
+		oldblock = newblock;
+		newblock = ablocks[--a];
+		bh = sb_getblk(inode->i_sb, newblock);
+		if (!bh) {
+			err = -EIO;
+			goto cleanup;
+		}
+		lock_buffer(bh);
+
+		if ((err = ext3_journal_get_create_access(handle, bh)))
+			goto cleanup;
+
+		neh = (struct ext3_extent_header *) bh->b_data;
+		neh->e_num = 1;
+		neh->e_max = ext3_ext_space_block(inode);
+		fidx = EXT_FIRST_INDEX(neh);
+		fidx->e_block = border;
+		fidx->e_leaf = oldblock;
+
+		ext_debug(inode,
+				"int.index at %d (block %u): %d -> %d\n",
+				i, (unsigned) newblock,
+				(int) border,
+				(int) oldblock);
+		/* copy indexes */
+		m = 0;
+		path[i].p_idx++;
+		ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
+				EXT_LAST_INDEX(path[i].p_hdr));
+		while (path[i].p_idx <=
+				EXT_MAX_INDEX(path[i].p_hdr)) {
+			ext_debug(inode, "%d: move %d:%d in new index\n",
+					i, path[i].p_idx->e_block,
+					path[i].p_idx->e_leaf);
+			memmove(++fidx, path[i].p_idx++,
+					sizeof(struct ext3_extent_idx));
+			neh->e_num++;
+			m++;
+		}
+
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		if ((err = ext3_journal_dirty_metadata(handle, bh)))
+			goto cleanup;
+		brelse(bh);
+		bh = NULL;
+
+		/* correct old index */
+		if (m) {
+			err = ext3_ext_get_access(handle,inode,path+i);
+			if (err)
+				goto cleanup;
+			path[i].p_hdr->e_num -= m;
+			err = ext3_ext_dirty(handle, inode, path + i);
+			if (err)
+				goto cleanup;
+		}
+
+		i--;
+	}
+
+	/* insert new index */
+	if (!err)
+		err = ext3_ext_insert_index(handle, inode, path + at,
+						border, newblock);
+
+cleanup:
+	if (bh) {
+		if (buffer_locked(bh))
+			unlock_buffer(bh);
+		brelse(bh);
+	}
+
+	if (err) {
+		/* free all allocated blocks in error case */
+		for (i = 0; i < depth; i++)
+			if (!ablocks[i])
+				continue;
+			ext3_free_blocks(handle, inode, ablocks[i], 1);
+	}
+	kfree(ablocks);
+
+	return err;
+}
+
+/*
+ * routine implements tree growing procedure:
+ *  - allocates new block
+ *  - moves top-level data (index block or leaf) into the new block
+ *  - initialize new top-level, creating index that points to the
+ *    just created block
+ */
+static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode,
+					struct ext3_ext_path *path,
+					struct ext3_extent *newext)
+{
+	struct buffer_head *bh;
+	struct ext3_ext_path *curp = path;
+	struct ext3_extent_header *neh;
+	struct ext3_extent_idx *fidx;
+	int len, err = 0;
+	sector_t newblock;
+
+	/*
+	 * use already allocated by the called block for new root block
+	 */
+	newblock = newext->e_start++;
+	if (newext->e_num == 0) {
+		/* 
+		 * FIXME: if this may happen, then we have to handle
+		 * possible error and free allocated block
+		 */
+		printk("grow_indepth with zero blocks\n");
+		newblock = ext3_new_block(handle, inode,
+						newblock, 0, 0, &err);
+	} else
+		newext->e_num--;
+	
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (!bh) {
+		err = -EIO;
+		ext3_std_error(inode->i_sb, err);
+		return err;
+	}
+	lock_buffer(bh);
+
+	if ((err = ext3_journal_get_create_access(handle, bh))) {
+		unlock_buffer(bh);
+		goto out;	
+	}
+
+	/* move top-level index/leaf into new block */
+	len = sizeof(struct ext3_extent_header) +
+		sizeof(struct ext3_extent) * curp->p_hdr->e_max;
+	EXT_ASSERT(len >= 0 && len < 4096);
+	memmove(bh->b_data, curp->p_hdr, len);
+
+	/* set size of new block */
+	neh = (struct ext3_extent_header *) bh->b_data;
+	neh->e_max = ext3_ext_space_block(inode);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	if ((err = ext3_journal_dirty_metadata(handle, bh)))
+		goto out;
+
+	/* create index in new top-level index: num,max,pointer */
+	if ((err = ext3_ext_get_access(handle, inode, curp)))
+		goto out;
+
+	curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode);
+	curp->p_hdr->e_num = 1;
+	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
+	curp->p_idx->e_block = EXT_FIRST_EXTENT(path[0].p_hdr)->e_block;
+	curp->p_idx->e_leaf = newblock;
+
+	neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data;
+	fidx = EXT_FIRST_INDEX(neh);
+	ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n",
+			neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); 
+
+	EXT3_I(inode)->i_depth++;
+	err = ext3_ext_dirty(handle, inode, curp);
+out:
+	brelse(bh);
+
+	return err;
+}
+
+/*
+ * routine finds empty index and adds new leaf. if no free index found
+ * then it requests in-depth growing
+ */
+static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+					struct ext3_ext_path *path,
+					struct ext3_extent *newext)
+{
+	long newblock = newext->e_start;
+	struct ext3_ext_path *curp;
+	int depth, i, err = 0;
+
+repeat:
+	i = depth = EXT3_I(inode)->i_depth;
+	
+	/* walk up to the tree and look for free index entry */
+	curp = path + depth;
+	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
+		i--;
+		curp--;
+	}
+
+	/* we use already allocated block for index block
+	 * so, subsequent data blocks should be contigoues */
+	if (EXT_HAS_FREE_INDEX(curp)) {
+		/* if we found index with free entry, then use that
+		 * entry: create all needed subtree and add new leaf */
+		err = ext3_ext_split(handle, inode, path, newext, i);
+
+		/* refill path */
+		ext3_ext_drop_refs(inode, path);
+		path = ext3_ext_find_extent(inode, newext->e_block, path);
+		if (IS_ERR(path))
+			err = PTR_ERR(path);
+	} else {
+		/* tree is full, time to grow in depth */
+		err = ext3_ext_grow_indepth(handle, inode, path, newext);
+
+		/* refill path */
+		ext3_ext_drop_refs(inode, path);
+		path = ext3_ext_find_extent(inode, newext->e_block, path);
+		if (IS_ERR(path))
+			err = PTR_ERR(path);
+	
+		/*
+		 * only first (depth 0 -> 1) produces free space
+		 * in all other cases we have to split growed tree
+		 */
+		depth = EXT3_I(inode)->i_depth;
+		if (path[depth].p_hdr->e_num == path[depth].p_hdr->e_max) {
+			/* now we need split */
+			goto repeat;
+		}
+	}
+
+	if (err)
+		return err;
+
+	/*
+	 * probably we've used some blocks from extent
+	 * let's allocate new block for it
+	 */
+	if (newext->e_num == 0 && !err) {
+		newext->e_start =
+			ext3_new_block(handle, inode, newblock,
+					0, 0, &err);
+		if (newext->e_start != 0)
+			newext->e_num = 1;
+	}
+
+	return 0;
+}
+
+/*
+ * returns next allocated block or 0xffffffff
+ * NOTE: it consider block number from index entry as
+ * allocated block. thus, index entries have to be consistent
+ * with leafs
+ */
+static inline unsigned ext3_ext_next_allocated_block(struct inode *inode,
+                                               struct ext3_ext_path *path)
+{
+	int depth;
+
+	EXT_ASSERT(path != NULL);
+	depth = path->p_depth;
+
+	if (depth == 0 && path->p_ext == NULL)
+		return 0xffffffff;
+
+	/* FIXME: what if index isn't full ?! */
+	while (depth >= 0) {
+		if (depth == path->p_depth) {
+			/* leaf */
+			if (path[depth].p_ext !=
+					EXT_LAST_EXTENT(path[depth].p_hdr))
+				return path[depth].p_ext[1].e_block;
+		} else {
+			/* index */
+			if (path[depth].p_idx !=
+					EXT_LAST_INDEX(path[depth].p_hdr))
+				return path[depth].p_idx[1].e_block;
+		}
+		depth--;        
+	}
+
+	return 0xffffffff;
+}
+
+/*
+ * returns first allocated block from next leaf or 0xffffffff
+ */
+static unsigned ext3_ext_next_leaf_block(struct inode *inode,
+                                               struct ext3_ext_path *path)
+{
+	int depth;
+
+	EXT_ASSERT(path != NULL);
+	depth = path->p_depth;
+
+	/* zero-tree has no leaf blocks at all */
+	if (depth == 0)
+		return 0xffffffff;
+
+	/* go to index block */
+	depth--;
+	
+	while (depth >= 0) {
+		if (path[depth].p_idx !=
+				EXT_LAST_INDEX(path[depth].p_hdr))
+			return path[depth].p_idx[1].e_block;
+		depth--;        
+	}
+
+	return 0xffffffff;
+}
+
+/*
+ * if leaf gets modified and modified extent is first in the leaf
+ * then we have to correct all indexes above
+ * TODO: do we need to correct tree in all cases?
+ */
+int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	int depth = EXT3_I(inode)->i_depth;	
+	struct ext3_extent_header *eh;
+	struct ext3_extent *ex;
+	sector_t border;
+	int k, err = 0;
+	
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	EXT_ASSERT(ex);
+	EXT_ASSERT(eh);
+	
+	if (depth == 0) {
+		/* there is no tree at all */
+		return 0;
+	}
+	
+	if (ex != EXT_FIRST_EXTENT(eh)) {
+		/* we correct tree if first leaf got modified only */
+		return 0;
+	}
+	
+	/*
+	 * TODO: we need correction if border is smaller then current one
+	 */
+	k = depth - 1;
+	border = path[depth].p_ext->e_block;
+	if ((err = ext3_ext_get_access(handle, inode, path + k)))
+		return err;
+	path[k].p_idx->e_block = border;
+	if ((err = ext3_ext_dirty(handle, inode, path + k)))
+		return err;
+
+	while (k--) {
+		/* change all left-side indexes */
+		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
+			break;
+		if ((err = ext3_ext_get_access(handle, inode, path + k)))
+			break;
+		path[k].p_idx->e_block = border;
+		if ((err = ext3_ext_dirty(handle, inode, path + k)))
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * this routine tries to merge requsted extent into the existing
+ * extent or inserts requested extent as new one into the tree,
+ * creating new leaf in no-space case
+ */
+int ext3_ext_insert_extent(handle_t *handle, struct inode *inode,
+				struct ext3_ext_path *path,
+				struct ext3_extent *newext)
+{
+	int depth, len;
+	struct ext3_extent_header * eh;
+	struct ext3_extent *ex;
+	struct ext3_extent *nearex; /* nearest extent */
+	struct ext3_ext_path *npath = NULL;
+	int err;
+
+	depth = EXT3_I(inode)->i_depth;	
+	if ((ex = path[depth].p_ext)) {
+		/* try to insert block into found extent and return */
+		if (ex->e_block + ex->e_num == newext->e_block &&
+				ex->e_start + ex->e_num == newext->e_start) {
+#ifdef AGRESSIVE_TEST
+			if (ex->e_num >= 2)
+				goto repeat;
+#endif
+			if ((err = ext3_ext_get_access(handle, inode,
+							path + depth)))
+				return err;
+			ext_debug(inode, "append %d block to %d:%d (from %d)\n",
+					newext->e_num, ex->e_block, ex->e_num,
+					ex->e_start);
+			ex->e_num += newext->e_num;
+			err = ext3_ext_dirty(handle, inode, path + depth);
+			return err;
+		}
+	}
+
+repeat:
+	depth = EXT3_I(inode)->i_depth;	
+	eh = path[depth].p_hdr;
+	if (eh->e_num == eh->e_max) {
+		/* probably next leaf has space for us? */
+		int next = ext3_ext_next_leaf_block(inode, path);
+		if (next != 0xffffffff) {
+			ext_debug(inode, "next leaf block - %d\n", next);
+			EXT_ASSERT(!npath);
+			npath = ext3_ext_find_extent(inode, next, NULL);
+			if (IS_ERR(npath))
+				return PTR_ERR(npath);
+			EXT_ASSERT(npath->p_depth == path->p_depth);
+			eh = npath[depth].p_hdr;
+			if (eh->e_num < eh->e_max) {
+				ext_debug(inode,
+						"next leaf has free ext(%d)\n",
+						eh->e_num);
+				path = npath;
+				goto repeat;
+			}
+			ext_debug(inode, "next leaf hasno free space(%d,%d)\n",
+					eh->e_num, eh->e_max);
+		}
+		/*
+		 * there is no free space in found leaf
+		 * we're gonna add new leaf in the tree
+		 */
+		err = ext3_ext_create_new_leaf(handle, inode, path, newext);
+		if (err)
+			goto cleanup;
+		goto repeat;
+	}
+
+	nearex = path[depth].p_ext;
+
+	if ((err = ext3_ext_get_access(handle, inode, path + depth)))
+		goto cleanup;
+
+	if (!nearex) {
+		/* there is no extent in this leaf, create first one */
+		ext_debug(inode, "first extent in the leaf: %d:%d:%d\n",
+				newext->e_block, newext->e_start,
+				newext->e_num);
+		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
+	} else if (newext->e_block > nearex->e_block) {
+		EXT_ASSERT(newext->e_block != nearex->e_block);
+		if (nearex != EXT_LAST_EXTENT(eh)) {
+			len = EXT_MAX_EXTENT(eh) - nearex;
+			len = (len - 1) * sizeof(struct ext3_extent);
+			len = len < 0 ? 0 : len;
+			ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, "
+					"move %d from 0x%p to 0x%p\n",
+					newext->e_block, newext->e_start, newext->e_num,
+					nearex, len, nearex + 1, nearex + 2);
+			ext3_ext_check_boundary(inode, path + depth, nearex + 2, len);
+			memmove(nearex + 2, nearex + 1, len);
+		}
+		path[depth].p_ext = nearex + 1;
+	} else {
+		EXT_ASSERT(newext->e_block != nearex->e_block);
+		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
+		len = len < 0 ? 0 : len;
+		ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, "
+				"move %d from 0x%p to 0x%p\n",
+				newext->e_block, newext->e_start, newext->e_num,
+				nearex, len, nearex + 1, nearex + 2);
+		
+		memmove(nearex + 1, nearex, len);
+		path[depth].p_ext = nearex;
+	}
+
+	if (!err) {
+		eh->e_num++;
+		nearex = path[depth].p_ext;
+		nearex->e_block = newext->e_block;
+		nearex->e_start = newext->e_start;
+		nearex->e_num = newext->e_num;
+		EXT_ASSERT(nearex->e_num < EXT3_BLOCKS_PER_GROUP(inode->i_sb) &&
+				nearex->e_num > 0);
+
+		/* time to correct all indexes above */
+		err = ext3_ext_correct_indexes(handle, inode, path);
+	}
+
+	err = ext3_ext_dirty(handle, inode, path + depth);
+
+cleanup:
+	if (npath) {
+		ext3_ext_drop_refs(inode, npath);
+		kfree(npath);
+	}
+		
+	return err;
+}
+
+int ext3_ext_get_block(handle_t *handle, struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create,
+			int extend_disksize)
+{
+	struct ext3_ext_path *path = NULL;
+	int depth = EXT3_I(inode)->i_depth;
+	struct ext3_extent newex;
+	struct ext3_extent *ex;
+	int goal, newblock, err = 0;
+
+	ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n",
+			(int) iblock, (unsigned) inode->i_ino, bh_result);
+	clear_buffer_new(bh_result);
+
+	down(&EXT3_I(inode)->truncate_sem);
+
+	/* check in cache */
+	newblock = ext3_ext_is_in_cache(inode, iblock);
+	if (newblock)
+		goto out;
+
+	/* find extent for this block */
+	path = ext3_ext_find_extent(inode, iblock, NULL);
+	if (IS_ERR(path)) {
+		err = PTR_ERR(path);
+		goto out2;
+	}
+
+	if ((ex = path[depth].p_ext)) {
+		/* if found exent covers block, simple return it */
+		if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) {
+			newblock = iblock - ex->e_block + ex->e_start;
+			ext_debug(inode, "%d fit into %d:%d -> %d\n",
+					(int) iblock, ex->e_block, ex->e_num,
+					newblock);
+			ext3_ext_put_in_cache(inode, ex);
+			goto out;
+		}
+	}
+
+	/*
+	 * we couldn't try to create block if create flag is zero 
+	 */
+	if (!create) 
+		goto out2;
+
+	/* allocate new block */
+	goal = ext3_ext_find_goal(inode, path);
+	newblock = ext3_new_block(handle, inode, goal, 0, 0, &err);
+	if (!newblock)
+		goto out2;
+	ext_debug(inode, "allocate new block: goal %d, found %d\n",
+			goal, newblock);
+
+	/* try to insert new extent into found leaf and return */
+	newex.e_block = iblock;
+	newex.e_start = newblock;
+	newex.e_num = 1;
+	err = ext3_ext_insert_extent(handle, inode, path, &newex);
+	if (err)
+		goto out2;
+	
+	/* previous routine could use block we allocated */
+	newblock = newex.e_start;
+	set_buffer_new(bh_result);
+
+	ext3_ext_put_in_cache(inode, &newex);
+out:
+	ext3_ext_show_leaf(inode, path);
+	map_bh(bh_result, inode->i_sb, newblock);
+out2:
+	if (path) {
+		ext3_ext_drop_refs(inode, path);
+		kfree(path);
+	}
+	up(&EXT3_I(inode)->truncate_sem);
+
+	return err;	
+}
+
+/*
+ * returns 1 if current index have to be freed (even partial)
+ */
+static int ext3_ext_more_to_truncate(struct inode *inode,
+				struct ext3_ext_path *path)
+{
+	EXT_ASSERT(path->p_idx);
+
+	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
+		return 0;
+
+	/*
+	 * if truncate on deeper level happened it it wasn't partial
+	 * so we have to consider current index for truncation
+	 */
+	if (path->p_hdr->e_num == path->p_block)
+		return 0;
+
+	/*
+	 * put actual number of indexes to know is this number got
+	 * changed at the next iteration
+	 */
+	path->p_block = path->p_hdr->e_num;
+	
+	return 1;
+}
+
+/*
+ * routine removes index from the index block
+ * it's used in truncate case only. thus all requests are for
+ * last index in the block only
+ */
+int ext3_ext_remove_index(handle_t *handle, struct inode *inode,
+					struct ext3_ext_path *path)
+{
+	struct buffer_head *bh;
+	int err;
+	
+	/* free index block */
+	path--;
+	EXT_ASSERT(path->p_hdr->e_num);
+	if ((err = ext3_ext_get_access(handle, inode, path)))
+		return err;
+	path->p_hdr->e_num--;
+	if ((err = ext3_ext_dirty(handle, inode, path)))
+		return err;
+	bh = sb_find_get_block(inode->i_sb, path->p_idx->e_leaf);
+	ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf);
+	ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1);
+
+	ext_debug(inode, "index is empty, remove it, free block %d\n",
+			path->p_idx->e_leaf);
+	return err;
+}
+
+/*
+ * returns 1 if current extent needs to be freed (even partial)
+ * instead, returns 0
+ */
+int ext3_ext_more_leaves_to_truncate(struct inode *inode,
+					struct ext3_ext_path *path)
+{
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	struct ext3_extent *ex = path->p_ext;
+	int last_block; 
+
+	EXT_ASSERT(ex);
+
+	/* is there leave in the current leaf? */
+	if (ex < EXT_FIRST_EXTENT(path->p_hdr))
+		return 0;
+	
+	last_block = (inode->i_size + blocksize-1)
+			>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+
+	if (last_block >= ex->e_block + ex->e_num)
+		return 0;
+
+	/* seems it extent have to be freed */
+	return 1;
+}
+
+handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
+{
+	int err;
+
+	if (handle->h_buffer_credits > needed)
+		return handle;
+	if (!ext3_journal_extend(handle, needed))
+		return handle;
+	err = ext3_journal_restart(handle, needed);
+	
+	return handle;
+}
+
+/*
+ * this routine calculate max number of blocks to be modified
+ * while freeing extent and is intended to be used in truncate path
+ */
+static int ext3_ext_calc_credits(struct inode *inode,
+					struct ext3_ext_path *path,
+					int num)
+{
+	int depth = EXT3_I(inode)->i_depth;
+	int needed;
+	
+	/*
+	 * extent couldn't cross group, so we will modify
+	 * single bitmap block and single group descriptor
+	 */
+	needed = 2;
+
+	/*
+	 * if this is last extent in a leaf, then we have to
+	 * free leaf block and remove pointer from index above.
+	 * that pointer could be last in index block, so we'll
+	 * have to remove it too. this way we could modify/free
+	 * the whole path + root index (inode stored) will be
+	 * modified
+	 */
+	if (!path || (num == path->p_ext->e_num &&
+				path->p_ext == EXT_FIRST_EXTENT(path->p_hdr)))
+		needed += (depth * EXT3_ALLOC_NEEDED) + 1;
+
+	return needed;
+}
+
+/*
+ * core of the truncate procedure:
+ * - calculated what part of each extent in the requested leaf
+ *   need to be freed
+ * - frees and forgets these blocks
+ *
+ * TODO: we could optimize and free several extents during
+ *       single journal_restart()-journal_restart() cycle
+ */
+static int ext3_ext_truncate_leaf(handle_t *handle,
+					struct inode *inode,
+					struct ext3_ext_path *path,
+					int depth)
+{
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	int last_block; 
+	int i, err = 0, sf, num;
+
+	ext_debug(inode, "level %d - leaf\n", depth);
+	if (!path->p_hdr)
+		path->p_hdr =
+			(struct ext3_extent_header *) path->p_bh->b_data;
+
+	EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max);
+	
+	last_block = (inode->i_size + blocksize-1)
+					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+	path->p_ext = EXT_LAST_EXTENT(path->p_hdr);
+	while (ext3_ext_more_leaves_to_truncate(inode, path)) {
+
+		/* what part of extent have to be freed? */
+		sf = last_block > path->p_ext->e_block ?
+			last_block : path->p_ext->e_block;
+
+		/* number of blocks from extent to be freed */
+		num = path->p_ext->e_block + path->p_ext->e_num - sf;
+
+		/* calc physical first physical block to be freed */
+		sf = path->p_ext->e_start + (sf - path->p_ext->e_block);
+
+		i = ext3_ext_calc_credits(inode, path, num);
+		handle = ext3_ext_journal_restart(handle, i);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		
+		ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n",
+				path->p_ext->e_block, path->p_ext->e_start,
+				path->p_ext->e_num, sf, num);
+		for (i = 0; i < num; i++) {
+			struct buffer_head *bh =
+				sb_find_get_block(inode->i_sb, sf + i);
+			ext3_forget(handle, 0, inode, bh, sf + i);
+		}
+		ext3_free_blocks(handle, inode, sf, num);
+
+		/* collect extents usage stats */
+		spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock);
+		EXT3_SB(inode->i_sb)->s_ext_extents++;
+		EXT3_SB(inode->i_sb)->s_ext_blocks += num;
+		spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock);
+
+		/* reduce extent */
+		if ((err = ext3_ext_get_access(handle, inode, path)))
+			return err;
+		path->p_ext->e_num -= num;
+		if (path->p_ext->e_num == 0)
+			path->p_hdr->e_num--;
+		if ((err = ext3_ext_dirty(handle, inode, path)))
+			return err;
+
+		path->p_ext--;
+	}
+	
+	/* if this leaf is free, then we should
+	 * remove it from index block above */
+	if (path->p_hdr->e_num == 0 && depth > 0) 
+		err = ext3_ext_remove_index(handle, inode, path);
+
+	return err;
+}
+
+static void ext3_ext_collect_stats(struct inode *inode)
+{
+	int depth;
+	
+	/* skip inodes with old good bitmap */
+	if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
+		return;
+	
+	/* collect on full truncate only */
+	if (inode->i_size)
+		return;
+
+	depth = EXT3_I(inode)->i_depth;
+	if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth)
+		 EXT3_SB(inode->i_sb)->s_ext_mindepth = depth;
+	if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth)
+		 EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth;
+	EXT3_SB(inode->i_sb)->s_ext_sum += depth;
+	EXT3_SB(inode->i_sb)->s_ext_count++;
+	
+}
+
+void ext3_ext_truncate(struct inode * inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct ext3_ext_path *path;
+	struct page * page;
+	handle_t *handle;
+	int i, depth, err = 0;
+
+	ext3_ext_collect_stats(inode);
+	ext3_ext_invalidate_cache(inode);
+
+	/*
+	 * We have to lock the EOF page here, because lock_page() nests
+	 * outside journal_start().
+	 */
+	if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) {
+		/* Block boundary? Nothing to do */
+		page = NULL;
+	} else {
+		page = grab_cache_page(mapping,
+				inode->i_size >> PAGE_CACHE_SHIFT);
+		if (!page)
+			return;
+	}
+
+	/*
+	 * probably first extent we're gonna free will be last in block
+	 */
+	i = ext3_ext_calc_credits(inode, NULL, 0);
+	handle = ext3_journal_start(inode, i);
+	if (IS_ERR(handle)) {
+		if (page) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+		return;
+	}
+
+	if (page)
+		ext3_block_truncate_page(handle, page, mapping, inode->i_size);
+
+	down(&EXT3_I(inode)->truncate_sem);
+	/* 
+	 * TODO: optimization is possible here
+	 * probably we need not scaning at all,
+	 * because page truncation is enough
+	 */
+	if (ext3_orphan_add(handle, inode))
+		goto out_stop;
+
+	/* we have to know where to truncate from in crash case */
+	EXT3_I(inode)->i_disksize = inode->i_size;
+	ext3_mark_inode_dirty(handle, inode);
+
+	/*
+	 * we start scanning from right side freeing all the blocks
+	 * after i_size and walking into the deep
+	 */
+	i = 0;
+	depth = EXT3_I(inode)->i_depth;
+	path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
+	if (IS_ERR(path)) {
+		ext3_error(inode->i_sb, "ext3_ext_truncate",
+				"Can't allocate path array");
+		goto out_stop;
+	}
+	memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
+
+	path[i].p_hdr = (struct ext3_extent_header *) EXT3_I(inode)->i_data;
+	while (i >= 0 && err == 0) {
+		if (i == depth) {
+			/* this is leaf block */
+			err = ext3_ext_truncate_leaf(handle, inode,
+							path + i, i);
+			/* root level have p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			i--;
+			continue;
+		}
+		
+		/* this is index block */
+		if (!path[i].p_hdr) {
+			path[i].p_hdr =
+				(struct ext3_extent_header *) path[i].p_bh->b_data;
+			ext_debug(inode, "initialize header\n");
+		}
+
+		EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max);
+		
+		if (!path[i].p_idx) {
+			/* this level hasn't touched yet */
+			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
+			path[i].p_block = path[i].p_hdr->e_num + 1;
+			ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
+					path[i].p_hdr, path[i].p_hdr->e_num);
+		} else {
+			/* we've already was here, see at next index */
+			path[i].p_idx--;
+		}
+
+		ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
+				i, EXT_FIRST_INDEX(path[i].p_hdr),
+				path[i].p_idx);
+		if (ext3_ext_more_to_truncate(inode, path + i)) {
+			/* go to the next level */
+			ext_debug(inode, "move to level %d (block %d)\n", i+1,
+					path[i].p_idx->e_leaf);
+			memset(path + i + 1, 0, sizeof(*path));
+			path[i+1].p_bh = sb_bread(inode->i_sb,
+							path[i].p_idx->e_leaf);
+			if (!path[i+1].p_bh) {
+				/* should we reset i_size? */
+				err = -EIO;
+				break;
+			}
+			i++;
+		} else {
+			/* we finish processing this index, go up */
+			if (path[i].p_hdr->e_num == 0 && i > 0) {
+				/* index is empty, remove it
+				 * handle must be already prepared by the
+				 * truncate_leaf()
+				 */
+				err = ext3_ext_remove_index(handle, inode,
+								path + i);
+			}
+			/* root level have p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			i--;
+			ext_debug(inode, "return to level %d\n", i);
+		}
+	}
+
+	/* TODO: flexible tree reduction should be here */
+	if (path->p_hdr->e_num == 0) {
+		/*
+		 * truncate to zero freed all the tree
+		 * so, we need to correct i_depth
+		 */
+		EXT3_I(inode)->i_depth = 0;
+		path->p_hdr->e_max = 0;
+		ext3_mark_inode_dirty(handle, inode);
+	}
+
+	kfree(path);
+
+	/* In a multi-transaction truncate, we only make the final
+	 * transaction synchronous */
+	if (IS_SYNC(inode))
+		handle->h_sync = 1;
+
+out_stop:
+	/*
+	 * If this was a simple ftruncate(), and the file will remain alive
+	 * then we need to clear up the orphan record which we created above.
+	 * However, if this was a real unlink then we were called by
+	 * ext3_delete_inode(), and we allow that function to clean up the
+	 * orphan info for us.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(handle, inode);
+
+	up(&EXT3_I(inode)->truncate_sem);
+	ext3_journal_stop(handle);
+}
+
+/*
+ * this routine calculate max number of blocks we could modify
+ * in order to allocate new block for an inode
+ */
+int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
+{
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	int depth = ei->i_depth + 1;
+	int needed;
+	
+	/*
+	 * the worste case we're expecting is creation of the
+	 * new root (growing in depth) with index splitting
+	 * for splitting we have to consider depth + 1 because
+	 * previous growing could increase it
+	 */
+
+	/* 
+	 * growing in depth:
+	 * block allocation + new root + old root
+	 */
+	needed = EXT3_ALLOC_NEEDED + 2;
+
+	/* index split. we may need:
+	 *   allocate intermediate indexes and new leaf
+	 *   change two blocks at each level, but root
+	 *   modify root block (inode)
+	 */
+	needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
+
+	/* caller want to allocate num blocks */
+	needed *= num;
+	
+#ifdef CONFIG_QUOTA
+	/* 
+	 * FIXME: real calculation should be here
+	 * it depends on blockmap format of qouta file
+	 */
+	needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
+#endif
+
+	return needed;
+}
+
+/*
+ * called at mount time
+ */
+void ext3_ext_init(struct super_block *sb)
+{
+	/*
+	 * possible initialization would be here
+	 */
+
+	if (test_opt(sb, EXTENTS))
+		printk("EXT3-fs: file extents enabled\n");
+	spin_lock_init(&EXT3_SB(sb)->s_ext_lock);
+}
+
+/*
+ * called at umount time
+ */
+void ext3_ext_release(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	/* show collected stats */
+	if (sbi->s_ext_count && sbi->s_ext_extents)
+		printk("EXT3-fs: min depth - %d, max depth - %d, "
+				"ave. depth - %d, ave. blocks/extent - %d\n",
+				sbi->s_ext_mindepth,
+				sbi->s_ext_maxdepth,
+				sbi->s_ext_sum / sbi->s_ext_count,
+				sbi->s_ext_blocks / sbi->s_ext_extents);
+}
+
Index: linux-2.6.0-test7/fs/ext3/ialloc.c
===================================================================
--- linux-2.6.0-test7.orig/fs/ext3/ialloc.c	2003-06-24 18:05:25.000000000 +0400
+++ linux-2.6.0-test7/fs/ext3/ialloc.c	2003-10-11 15:59:36.000000000 +0400
@@ -600,6 +600,8 @@
 		DQUOT_FREE_INODE(inode);
 		goto fail2;
   	}
+	if (test_opt(sb, EXTENTS))
+		EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
 	err = ext3_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext3_std_error(sb, err);
Index: linux-2.6.0-test7/fs/ext3/inode.c
===================================================================
--- linux-2.6.0-test7.orig/fs/ext3/inode.c	2003-10-10 16:36:37.000000000 +0400
+++ linux-2.6.0-test7/fs/ext3/inode.c	2003-10-11 15:59:36.000000000 +0400
@@ -858,6 +858,15 @@
 	goto reread;
 }
 
+static inline int
+ext3_get_block_wrap(handle_t *handle, struct inode *inode, sector_t block,
+		struct buffer_head *bh, int create, int extend_disksize)
+{
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_get_block(handle, inode, block, bh, create, 1);
+	return ext3_get_block_handle(handle, inode, block, bh, create, 1);
+}
+
 static int ext3_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
@@ -868,7 +877,7 @@
 		handle = ext3_journal_current_handle();
 		J_ASSERT(handle != 0);
 	}
-	ret = ext3_get_block_handle(handle, inode, iblock,
+	ret = ext3_get_block_wrap(handle, inode, iblock,
 				bh_result, create, 1);
 	return ret;
 }
@@ -895,7 +904,7 @@
 		}
 	}
 	if (ret == 0)
-		ret = ext3_get_block_handle(handle, inode, iblock,
+		ret = ext3_get_block_wrap(handle, inode, iblock,
 					bh_result, create, 0);
 	if (ret == 0)
 		bh_result->b_size = (1 << inode->i_blkbits);
@@ -917,7 +926,7 @@
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
-	*errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
+	*errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
 	if (!*errp && buffer_mapped(&dummy)) {
 		struct buffer_head *bh;
 		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
@@ -1659,7 +1668,7 @@
  * This required during truncate. We need to physically zero the tail end
  * of that block so it doesn't yield old data if the file is later grown.
  */
-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
+int ext3_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from)
 {
 	unsigned long index = from >> PAGE_CACHE_SHIFT;
@@ -2141,6 +2150,9 @@
 
 	ext3_discard_prealloc(inode);
 
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_truncate(inode);
+
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
 	 * outside journal_start().
@@ -2536,6 +2548,7 @@
 	ei->i_prealloc_count = 0;
 #endif
 	ei->i_block_group = iloc.block_group;
+	ei->i_depth = raw_inode->osd2.linux2.l_i_depth;
 
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
@@ -2636,6 +2649,7 @@
 	raw_inode->i_frag = ei->i_frag_no;
 	raw_inode->i_fsize = ei->i_frag_size;
 #endif
+ 	raw_inode->osd2.linux2.l_i_depth = ei->i_depth;
 	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
 	if (!S_ISREG(inode->i_mode)) {
 		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
@@ -2848,6 +2862,9 @@
 	int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
 	int ret;
 
+	if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
+		return ext3_ext_writepage_trans_blocks(inode, bpp);
+
 	if (ext3_should_journal_data(inode))
 		ret = 3 * (bpp + indirects) + 2;
 	else
Index: linux-2.6.0-test7/fs/ext3/Makefile
===================================================================
--- linux-2.6.0-test7.orig/fs/ext3/Makefile	2003-09-19 18:00:24.000000000 +0400
+++ linux-2.6.0-test7/fs/ext3/Makefile	2003-10-11 15:59:36.000000000 +0400
@@ -5,7 +5,7 @@
 obj-$(CONFIG_EXT3_FS) += ext3.o
 
 ext3-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-	   ioctl.o namei.o super.o symlink.o hash.o
+	   ioctl.o namei.o super.o symlink.o hash.o extents.o
 
 ext3-$(CONFIG_EXT3_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
 ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
Index: linux-2.6.0-test7/fs/ext3/super.c
===================================================================
--- linux-2.6.0-test7.orig/fs/ext3/super.c	2003-10-10 16:02:22.000000000 +0400
+++ linux-2.6.0-test7/fs/ext3/super.c	2003-10-11 16:16:55.000000000 +0400
@@ -385,6 +385,7 @@
 	struct ext3_super_block *es = sbi->s_es;
 	int i;
 
+	ext3_ext_release(sb);
 	ext3_xattr_put_super(sb);
 	journal_destroy(sbi->s_journal);
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -442,6 +443,8 @@
 	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
 #endif
 	ei->vfs_inode.i_version = 1;
+	ei->i_depth = 0;
+	ei->i_ce_num = 0;
 	return &ei->vfs_inode;
 }
 
@@ -534,7 +537,7 @@
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload,
 	Opt_commit, Opt_journal_update, Opt_journal_inum,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
-	Opt_ignore, Opt_err,
+	Opt_ignore, Opt_err, Opt_extents, Opt_extdebug
 };
 
 static match_table_t tokens = {
@@ -573,6 +576,8 @@
 	{Opt_ignore, "noquota"},
 	{Opt_ignore, "quota"},
 	{Opt_ignore, "usrquota"},
+	{Opt_extents, "extents"},
+	{Opt_extdebug, "extdebug"},
 	{Opt_err, NULL}
 };
 
@@ -760,6 +765,12 @@
 		case Opt_abort:
 			set_opt(sbi->s_mount_opt, ABORT);
 			break;
+		case Opt_extents:
+			set_opt(sbi->s_mount_opt, EXTENTS);
+			break;
+		case Opt_extdebug:
+			set_opt(sbi->s_mount_opt, EXTDEBUG);
+			break;
 		case Opt_ignore:
 			break;
 		default:
@@ -1390,6 +1401,8 @@
 	percpu_counter_mod(&sbi->s_dirs_counter,
 		ext3_count_dirs(sb));
 
+	ext3_ext_init(sb);
+
 	return 0;
 
 failed_mount3:
Index: linux-2.6.0-test7/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.0-test7.orig/include/linux/ext3_fs.h	2003-09-19 18:01:10.000000000 +0400
+++ linux-2.6.0-test7/include/linux/ext3_fs.h	2003-10-11 15:59:36.000000000 +0400
@@ -186,6 +186,7 @@
 #define EXT3_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
 #define EXT3_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT3_RESERVED_FL		0x80000000 /* reserved for ext3 lib */
+#define EXT3_EXTENTS_FL			0x00080000 /* Inode uses extents */
 
 #define EXT3_FL_USER_VISIBLE		0x0003DFFF /* User visible flags */
 #define EXT3_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
@@ -244,7 +245,7 @@
 		struct {
 			__u8	l_i_frag;	/* Fragment number */
 			__u8	l_i_fsize;	/* Fragment size */
-			__u16	i_pad1;
+			__u16	l_i_depth;
 			__u16	l_i_uid_high;	/* these 2 fields    */
 			__u16	l_i_gid_high;	/* were reserved2[0] */
 			__u32	l_i_reserved2;
@@ -324,6 +325,8 @@
 #define EXT3_MOUNT_NO_UID32		0x2000  /* Disable 32-bit UIDs */
 #define EXT3_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
 #define EXT3_MOUNT_POSIX_ACL		0x8000	/* POSIX Access Control Lists */
+#define EXT3_MOUNT_EXTENTS		0x10000	/* Extents support */
+#define EXT3_MOUNT_EXTDEBUG		0x20000	/* Extents debug */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -733,6 +736,11 @@
 extern void ext3_truncate (struct inode *);
 extern void ext3_set_inode_flags(struct inode *);
 extern void ext3_set_aops(struct inode *inode);
+extern int ext3_block_truncate_page(handle_t *handle, struct page *page,
+					struct address_space *mapping, loff_t from);
+extern int ext3_forget(handle_t *handle, int is_metadata,
+		       struct inode *inode, struct buffer_head *bh,
+		       int blocknr);
 
 /* ioctl.c */
 extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
@@ -789,6 +797,13 @@
 extern struct inode_operations ext3_symlink_inode_operations;
 extern struct inode_operations ext3_fast_symlink_inode_operations;
 
+/* extents.c */
+extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext3_ext_get_block(handle_t *, struct inode *, sector_t,
+				struct buffer_head *, int, int);
+extern void ext3_ext_truncate(struct inode *);
+extern void ext3_ext_init(struct super_block *);
+extern void ext3_ext_release(struct super_block *);
 
 #endif	/* __KERNEL__ */
 
Index: linux-2.6.0-test7/include/linux/ext3_fs_i.h
===================================================================
--- linux-2.6.0-test7.orig/include/linux/ext3_fs_i.h	2003-10-10 16:02:24.000000000 +0400
+++ linux-2.6.0-test7/include/linux/ext3_fs_i.h	2003-10-11 16:00:31.000000000 +0400
@@ -108,6 +108,14 @@
 	 */
 	struct semaphore truncate_sem;
 	struct inode vfs_inode;
+
+	/* extents-related data */
+	__u16 i_depth;
+
+	/* last found extent */
+	__u32 i_ce_block;
+	__u32 i_ce_start;
+	__u16 i_ce_num;		/* 0 - invalided cache */
 };
 
 #endif	/* _LINUX_EXT3_FS_I */
Index: linux-2.6.0-test7/include/linux/ext3_fs_sb.h
===================================================================
--- linux-2.6.0-test7.orig/include/linux/ext3_fs_sb.h	2003-06-24 18:05:26.000000000 +0400
+++ linux-2.6.0-test7/include/linux/ext3_fs_sb.h	2003-10-11 15:59:36.000000000 +0400
@@ -68,6 +68,16 @@
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
 	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
 #endif
+
+	/* extents */
+	int s_ext_debug;
+	int s_ext_mindepth;
+	int s_ext_maxdepth;
+	int s_ext_sum;
+	int s_ext_count;
+	spinlock_t s_ext_lock;
+	int s_ext_extents;
+	int s_ext_blocks;
 };
 
 #endif	/* _LINUX_EXT3_FS_SB */

%diffstat
 fs/ext3/Makefile           |    2 
 fs/ext3/extents.c          | 1766 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ext3/ialloc.c           |    2 
 fs/ext3/inode.c            |   25 
 fs/ext3/super.c            |   15 
 include/linux/ext3_fs.h    |   17 
 include/linux/ext3_fs_i.h  |    8 
 include/linux/ext3_fs_sb.h |   10 
 8 files changed, 1838 insertions(+), 7 deletions(-)


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] EXT3 extents against 2.6.0-test7
  2003-10-13 18:27 [PATCH] EXT3 extents against 2.6.0-test7 Alex Tomas
@ 2003-10-13 21:39 ` Ed Sweetman
       [not found]   ` <20031014212359.42243025.alex@clusterfs.com>
  0 siblings, 1 reply; 9+ messages in thread
From: Ed Sweetman @ 2003-10-13 21:39 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, Stephen Hemminger

Alex Tomas wrote:
> changes since last publication:
> 1) few bugs fixed: they caused extents tree corruption
> 2) several asserts added
> 3) binary search used to find an extent
> 4) last found entry is cached: this allows to skip tree
>    traversal and saves cpu a bit
> 5) truncate_sem is used to serialize get_block()/trucate()
> 
> 
> with best wishes, Alex
> 
> 


I've been using extents since the patch was introduced.  I haven't seen 
any corruption when using it with large files (ie. multimedia only).  In 
short, has there been any progress with getting fsck support? I'm 100% 
in support of this patch going into mainline kernel as a non-default 
option.  It's perfect for partitions that deal with large files, but 
still want ext3's fs corruption protection.  I average 10700 blocks an 
extent and see an extreme increase in performance over non-extents ext3 
doing operations on files of the same size.   I guess it's about time to 
update.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] EXT3 extents against 2.6.0-test7
       [not found]   ` <20031014212359.42243025.alex@clusterfs.com>
@ 2003-10-17 19:32     ` Ed Sweetman
  2003-10-17 20:10       ` Alex Tomas
  0 siblings, 1 reply; 9+ messages in thread
From: Ed Sweetman @ 2003-10-17 19:32 UTC (permalink / raw)
  To: Alex Tomas, linux-kernel

kernel BUG at fs/ext3/extents.c:389!
invalid operand: 0000 [#1]
CPU:    0
EIP:    0060:[<c0198127>]    Not tainted
EFLAGS: 00010282
EIP is at ext3_ext_find_extent+0x277/0x570
eax: 00009ac6   ebx: e5257580   ecx: 00000000   edx: e2184940
esi: 00000000   edi: 00000000   ebp: 00000000   esp: ca6adddc
ds: 007b   es: 007b   ss: 0068
Process find (pid: 10373, threadinfo=ca6ac000 task=ca97b380)
Stack: 00000030 00000050 e7db0200 00458006 00000400 00000000 e2184940 
e5257580
        00000000 00000000 00000000 00000000 e5257614 e5257614 e5257580 
c0199f02
        e5257614 00000000 e2184940 e5257580 e5257614 c018fac1 e5257604 
00000000
Call Trace:
  [<c0199f02>] ext3_ext_get_block+0xb2/0x320
  [<c018fac1>] ext3_read_inode+0x221/0x2d0
  [<c016b74f>] d_splice_alias+0x4f/0x130
  [<c018d57d>] ext3_getblk+0x25d/0x2b0
  [<c018d603>] ext3_bread+0x33/0xb0
  [<c018a3e1>] ext3_readdir+0x141/0x4e0
  [<c0165a7a>] vfs_readdir+0x7a/0x80
  [<c0165db0>] filldir64+0x0/0x140
  [<c0165f5f>] sys_getdents64+0x6f/0xa9
  [<c0165db0>] filldir64+0x0/0x140
  [<c01092e7>] syscall_call+0x7/0xb

Code: 0f 0b 85 01 51 fe 2e c0 66 85 c0 0f 84 ee 00 00 00 8b 7c 24


I'm not sure why this is happening.  Perhaps due to these ext3 locking 
fixes that have been going into the kernel or what?

Upon rebooting my last kernel for the first time in a couple of weeks it 
crashed due to fs errors.  Now i'm getting this.  I only use extents on 
a couple non-system partitions, so if i lose anything it's not a huge 
deal but I'd like to find out why these errors are suddenly creeping up 
so Any other info that's needed just ask and i'll give it.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] EXT3 extents against 2.6.0-test7
  2003-10-17 19:32     ` Ed Sweetman
@ 2003-10-17 20:10       ` Alex Tomas
  2003-10-17 20:13         ` Ed Sweetman
  0 siblings, 1 reply; 9+ messages in thread
From: Alex Tomas @ 2003-10-17 20:10 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel

uhuh!

is it possible to know size of that directory ?

thanks!

On Fri, 17 Oct 2003 15:32:55 -0400
Ed Sweetman <ed.sweetman@wmich.edu> wrote:

> kernel BUG at fs/ext3/extents.c:389!
> invalid operand: 0000 [#1]
> CPU:    0
> EIP:    0060:[<c0198127>]    Not tainted
> EFLAGS: 00010282
> EIP is at ext3_ext_find_extent+0x277/0x570
> eax: 00009ac6   ebx: e5257580   ecx: 00000000   edx: e2184940
> esi: 00000000   edi: 00000000   ebp: 00000000   esp: ca6adddc
> ds: 007b   es: 007b   ss: 0068
> Process find (pid: 10373, threadinfo=ca6ac000 task=ca97b380)
> Stack: 00000030 00000050 e7db0200 00458006 00000400 00000000 e2184940 
> e5257580
>         00000000 00000000 00000000 00000000 e5257614 e5257614 e5257580 
> c0199f02
>         e5257614 00000000 e2184940 e5257580 e5257614 c018fac1 e5257604 
> 00000000
> Call Trace:
>   [<c0199f02>] ext3_ext_get_block+0xb2/0x320
>   [<c018fac1>] ext3_read_inode+0x221/0x2d0
>   [<c016b74f>] d_splice_alias+0x4f/0x130
>   [<c018d57d>] ext3_getblk+0x25d/0x2b0
>   [<c018d603>] ext3_bread+0x33/0xb0
>   [<c018a3e1>] ext3_readdir+0x141/0x4e0
>   [<c0165a7a>] vfs_readdir+0x7a/0x80
>   [<c0165db0>] filldir64+0x0/0x140
>   [<c0165f5f>] sys_getdents64+0x6f/0xa9
>   [<c0165db0>] filldir64+0x0/0x140
>   [<c01092e7>] syscall_call+0x7/0xb
> 
> Code: 0f 0b 85 01 51 fe 2e c0 66 85 c0 0f 84 ee 00 00 00 8b 7c 24
> 
> 
> I'm not sure why this is happening.  Perhaps due to these ext3 locking 
> fixes that have been going into the kernel or what?
> 
> Upon rebooting my last kernel for the first time in a couple of weeks it 
> crashed due to fs errors.  Now i'm getting this.  I only use extents on 
> a couple non-system partitions, so if i lose anything it's not a huge 
> deal but I'd like to find out why these errors are suddenly creeping up 
> so Any other info that's needed just ask and i'll give it.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] EXT3 extents against 2.6.0-test7
  2003-10-17 20:10       ` Alex Tomas
@ 2003-10-17 20:13         ` Ed Sweetman
  2003-10-17 20:41           ` Alex Tomas
  0 siblings, 1 reply; 9+ messages in thread
From: Ed Sweetman @ 2003-10-17 20:13 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel


How am i supposed to know which directory in the fs this corruption 
takes place in?   I can tell you the size of the partitions that have 
extents enabled, From that error message i dont even know which 
partition it was.  And judging by the dmesg last modified time, this 
happened 2 days ago

Filesystem            Size  Used Avail Use% Mounted on
                       101G   85G   16G  85% dir1
                        92G   38G   50G  44% dir2

Filesystem            Inodes   IUsed   IFree IUse% Mounted on
                       103040    3619   99421    4% dir1
                      12222464      61 12222403    1% dir2

Isn't it possible though that this happened in one of the non-extents 
enabled partitions though?  Since they still have the ability to read 
extents in files, they have to try and look them up every time for 
everything dont they?  Anyways, the two partitions above are the only 
ones i actually enable extents on.

Alex Tomas wrote:
> uhuh!
> 
> is it possible to know size of that directory ?
> 
> thanks!
> 
> On Fri, 17 Oct 2003 15:32:55 -0400
> Ed Sweetman <ed.sweetman@wmich.edu> wrote:
> 
> 
>>kernel BUG at fs/ext3/extents.c:389!
>>invalid operand: 0000 [#1]
>>CPU:    0
>>EIP:    0060:[<c0198127>]    Not tainted
>>EFLAGS: 00010282
>>EIP is at ext3_ext_find_extent+0x277/0x570
>>eax: 00009ac6   ebx: e5257580   ecx: 00000000   edx: e2184940
>>esi: 00000000   edi: 00000000   ebp: 00000000   esp: ca6adddc
>>ds: 007b   es: 007b   ss: 0068
>>Process find (pid: 10373, threadinfo=ca6ac000 task=ca97b380)
>>Stack: 00000030 00000050 e7db0200 00458006 00000400 00000000 e2184940 
>>e5257580
>>        00000000 00000000 00000000 00000000 e5257614 e5257614 e5257580 
>>c0199f02
>>        e5257614 00000000 e2184940 e5257580 e5257614 c018fac1 e5257604 
>>00000000
>>Call Trace:
>>  [<c0199f02>] ext3_ext_get_block+0xb2/0x320
>>  [<c018fac1>] ext3_read_inode+0x221/0x2d0
>>  [<c016b74f>] d_splice_alias+0x4f/0x130
>>  [<c018d57d>] ext3_getblk+0x25d/0x2b0
>>  [<c018d603>] ext3_bread+0x33/0xb0
>>  [<c018a3e1>] ext3_readdir+0x141/0x4e0
>>  [<c0165a7a>] vfs_readdir+0x7a/0x80
>>  [<c0165db0>] filldir64+0x0/0x140
>>  [<c0165f5f>] sys_getdents64+0x6f/0xa9
>>  [<c0165db0>] filldir64+0x0/0x140
>>  [<c01092e7>] syscall_call+0x7/0xb
>>
>>Code: 0f 0b 85 01 51 fe 2e c0 66 85 c0 0f 84 ee 00 00 00 8b 7c 24
>>
>>
>>I'm not sure why this is happening.  Perhaps due to these ext3 locking 
>>fixes that have been going into the kernel or what?
>>
>>Upon rebooting my last kernel for the first time in a couple of weeks it 
>>crashed due to fs errors.  Now i'm getting this.  I only use extents on 
>>a couple non-system partitions, so if i lose anything it's not a huge 
>>deal but I'd like to find out why these errors are suddenly creeping up 
>>so Any other info that's needed just ask and i'll give it.
> 
> 



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] EXT3 extents against 2.6.0-test7
  2003-10-17 20:13         ` Ed Sweetman
@ 2003-10-17 20:41           ` Alex Tomas
  2003-10-17 21:22             ` Ed Sweetman
  0 siblings, 1 reply; 9+ messages in thread
From: Alex Tomas @ 2003-10-17 20:41 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel

On Fri, 17 Oct 2003 16:13:51 -0400
Ed Sweetman <ed.sweetman@wmich.edu> wrote:

> How am i supposed to know which directory in the fs this corruption 
> takes place in?   I can tell you the size of the partitions that have 
> extents enabled, From that error message i dont even know which 
> partition it was.  And judging by the dmesg last modified time, this 
> happened 2 days ago

OK. the question wasn't clear.

1) could you _estimate_ max directory size or number of entries in single
   directory on your filesystems, please? had you large directories?
   100, 300, 500 or more entries?

2) did you use 2.6.0-test7+extents or some another patches?

3) could you describe workload. knowing it I'd try to reproduce this


> Isn't it possible though that this happened in one of the non-extents 
> enabled partitions though?  Since they still have the ability to read 
> extents in files, they have to try and look them up every time for 
> everything dont they?  Anyways, the two partitions above are the only 
> ones i actually enable extents on.
> 

extents take place only if flag in inode->i_flags is set. that flag can
be set only during inode creation on extents-enabled filesystem.

with best wishes, Alex

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] EXT3 extents against 2.6.0-test7
  2003-10-17 20:41           ` Alex Tomas
@ 2003-10-17 21:22             ` Ed Sweetman
  2003-10-17 23:05               ` Alex Tomas
  0 siblings, 1 reply; 9+ messages in thread
From: Ed Sweetman @ 2003-10-17 21:22 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel

Alex Tomas wrote:
> On Fri, 17 Oct 2003 16:13:51 -0400
> Ed Sweetman <ed.sweetman@wmich.edu> wrote:
> 
> 
>>How am i supposed to know which directory in the fs this corruption 
>>takes place in?   I can tell you the size of the partitions that have 
>>extents enabled, From that error message i dont even know which 
>>partition it was.  And judging by the dmesg last modified time, this 
>>happened 2 days ago
> 
> 
> OK. the question wasn't clear.
> 
> 1) could you _estimate_ max directory size or number of entries in single
>    directory on your filesystems, please? had you large directories?
>    100, 300, 500 or more entries?

none of my directories have more than 60 or so entries.  I keep 
everything very organized on my hdds.  The largest directories would be 
the ones holding the largest files but that maxes out at around 60 file 
entries.  i formatted those partitions with a 4KB inode size.


outside of the two partitions with extents enabled though....  I'm not 
sure if i have any seriously large directories in my other partitions. 
And their inode size varies from 1KB to 4Kb depending on what type of 
content they're expected to have .

> 2) did you use 2.6.0-test7+extents or some another patches?

The only other patches i have are related to fbdev and directfb. 
Otherwise it's a vanilla 2.6.0-test7 + extents patch that you posted for it.


> 3) could you describe workload. knowing it I'd try to reproduce this


Workload on those partitions at the time?  It cant be anything more than 
mplayer reading a movie or writing a movie to disk.  And the writes 
would be at about 20MB/sec avg (ext3 to ext3 both with extents) from one 
drive (the partitions happen to be on separate drives) to the other. The 
transferrate spikes at 30MB/sec at start and stays at around 20MB/sec 
for upwards up 1GB for a file.

Nothing else is done on those partitions.  System wise though, what 
caused the crash to occur was updatedb, which does a find on every 
filesystem off of /.  This is what was running when the error occured, 
and it didn't happen this morning when it happened again, the error i 
mean.  I have dma enabled so updatedb doesn't cause significant 
schedular issues due to cpu usage. That's all that was going on at the 
time.


> 
> 
>>Isn't it possible though that this happened in one of the non-extents 
>>enabled partitions though?  Since they still have the ability to read 
>>extents in files, they have to try and look them up every time for 
>>everything dont they?  Anyways, the two partitions above are the only 
>>ones i actually enable extents on.
>>
> 
> 
> extents take place only if flag in inode->i_flags is set. that flag can
> be set only during inode creation on extents-enabled filesystem.
> 
> with best wishes, Alex
> 



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] EXT3 extents against 2.6.0-test7
  2003-10-17 21:22             ` Ed Sweetman
@ 2003-10-17 23:05               ` Alex Tomas
  2003-10-18  0:07                 ` Ed Sweetman
  0 siblings, 1 reply; 9+ messages in thread
From: Alex Tomas @ 2003-10-17 23:05 UTC (permalink / raw)
  To: Ed Sweetman; +Cc: linux-kernel

On Fri, 17 Oct 2003 17:22:05 -0400
Ed Sweetman <ed.sweetman@wmich.edu> wrote:

> none of my directories have more than 60 or so entries.  I keep 
> everything very organized on my hdds.  The largest directories would be 
> the ones holding the largest files but that maxes out at around 60 file 
> entries.  i formatted those partitions with a 4KB inode size.

oh. this seems very confusing for me. extents crashed during readdir() syscall.
4k block may contain upto 60 entries with 60chars length. even if your dir was
larger I don't think it was >16k. so, I really do believe all the extents were
placed in inode body (zero tree depth). also, directory grows in linear manner
only. so, this code patch is very very simple and quite good tested. thus it 
really seems like a corruption, not an error in logic. let me cook a patch that
will show more info.  

also, it's very interesting how is it difficult to reproduce on your box?

thanks!

--
with best regards, Alex

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] EXT3 extents against 2.6.0-test7
  2003-10-17 23:05               ` Alex Tomas
@ 2003-10-18  0:07                 ` Ed Sweetman
  0 siblings, 0 replies; 9+ messages in thread
From: Ed Sweetman @ 2003-10-18  0:07 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel

Alex Tomas wrote:
> On Fri, 17 Oct 2003 17:22:05 -0400
> Ed Sweetman <ed.sweetman@wmich.edu> wrote:
> 
> 
>>none of my directories have more than 60 or so entries.  I keep 
>>everything very organized on my hdds.  The largest directories would be 
>>the ones holding the largest files but that maxes out at around 60 file 
>>entries.  i formatted those partitions with a 4KB inode size.
> 
> 
> oh. this seems very confusing for me. extents crashed during readdir() syscall.
> 4k block may contain upto 60 entries with 60chars length. even if your dir was
> larger I don't think it was >16k. so, I really do believe all the extents were
> placed in inode body (zero tree depth). also, directory grows in linear manner
> only. so, this code patch is very very simple and quite good tested. thus it 
> really seems like a corruption, not an error in logic. let me cook a patch that
> will show more info.  
> 
> also, it's very interesting how is it difficult to reproduce on your box?
> 
> thanks!
> 
> --
> with best regards, Alex
> -

I've been getting dma errors and rtc irq's being missed since moving to 
test7.  I'm really not impressed with it at all.  Whatever happened 
between it and andrew morton's 4th patch to test4 really sucked things 
up with the kernel.  I'm getting zombie processes now that access the 
filesystem (non-extent partitions).  I'm inclined to believe that there 
is corruption, but i really have no idea where it's coming from.  Too 
many things have been patched at the same time to get a feel for which 
one could be causing the problem.  Andrew Morton's branch has been nicer 
to me than linus's for a while so i'm gonna go and patch to that one. 
Even if i wanted to stop using extents,  It would be really hard to 
since i've already started using it and the process is not 
reversable....or is it?


Would i reverse the effect of using extents by copying a file that was 
written when extents were enabled to another location and then back 
again with all partitions mounted with extents not enabled but supported 
by the patch?  Would i be able to fsck the partition without fear of 
fsck completely destroying the partition? and my other ones too for that 
matter.

has there been an ata commit or something?


And it's not so much that it's hard to replicate it's that the processes 
that cause it zombify and dont usually allow the program to be run a 
second time due to it still holding locks.


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2003-10-18  0:07 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2003-10-13 18:27 [PATCH] EXT3 extents against 2.6.0-test7 Alex Tomas
2003-10-13 21:39 ` Ed Sweetman
     [not found]   ` <20031014212359.42243025.alex@clusterfs.com>
2003-10-17 19:32     ` Ed Sweetman
2003-10-17 20:10       ` Alex Tomas
2003-10-17 20:13         ` Ed Sweetman
2003-10-17 20:41           ` Alex Tomas
2003-10-17 21:22             ` Ed Sweetman
2003-10-17 23:05               ` Alex Tomas
2003-10-18  0:07                 ` Ed Sweetman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).