* [RFC][Patch 1/1] Persistent preallocation in ext4
@ 2006-12-05 13:43 Amit K. Arora
2006-12-06 5:58 ` Amit K. Arora
2006-12-15 12:39 ` [RFC][Patch 2/2] " Amit K. Arora
0 siblings, 2 replies; 26+ messages in thread
From: Amit K. Arora @ 2006-12-05 13:43 UTC (permalink / raw)
To: linux-ext4
This patch implements persistent preallocation using ioctl. Other
options under consideration are posix_fallocate and ftruncate. The code
can be ported to other approach, if ioctl is not the favored one. The
patch takes care of marking/checking uninitialized extents.
Signed-off-by: Amit Arora <aarora@in.ibm.com>
---
fs/ext4/extents.c | 120 ++++++++++++++++++++++++++++++++----------------
fs/ext4/ioctl.c | 41 ++++++++++++++++
include/linux/ext4_fs.h | 21 ++++++++
3 files changed, 144 insertions(+), 38 deletions(-)
Index: linux-2.6.19/fs/ext4/extents.c
===================================================================
--- linux-2.6.19.orig/fs/ext4/extents.c 2006-12-05 16:03:40.000000000 +0530
+++ linux-2.6.19/fs/ext4/extents.c 2006-12-05 16:03:53.000000000 +0530
@@ -282,7 +282,7 @@
} else if (path->p_ext) {
ext_debug(" %d:%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
- le16_to_cpu(path->p_ext->ee_len),
+ ext4_ext_get_actual_len(path->p_ext),
ext_pblock(path->p_ext));
} else
ext_debug(" []");
@@ -305,7 +305,7 @@
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug("\n");
}
@@ -425,7 +425,7 @@
ext_debug(" -> %d:%llu:%d ",
le32_to_cpu(path->p_ext->ee_block),
ext_pblock(path->p_ext),
- le16_to_cpu(path->p_ext->ee_len));
+ ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
{
@@ -686,7 +686,7 @@
ext_debug("move %d:%llu:%d in new leaf %llu\n",
le32_to_cpu(path[depth].p_ext->ee_block),
ext_pblock(path[depth].p_ext),
- le16_to_cpu(path[depth].p_ext->ee_len),
+ ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
sizeof(struct ext4_extent));
@@ -1097,7 +1097,23 @@
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
+ unsigned short ext1_ee_len, ext2_ee_len;
+
+
+ /*
+ * Make sure that either both extents are uninitialized, or
+ * both are _not_.
+ */
+ if ((ext4_ext_is_uninitialized(ex1)
+ && !ext4_ext_is_uninitialized(ex2))
+ || (!ext4_ext_is_uninitialized(ex1)
+ && ext4_ext_is_uninitialized(ex2)))
+ return 0;
+
+ ext1_ee_len = ext4_ext_get_actual_len(ex1);
+ ext2_ee_len = ext4_ext_get_actual_len(ex2);
+
+ if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
le32_to_cpu(ex2->ee_block))
return 0;
@@ -1106,14 +1122,14 @@
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
+ if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN)
return 0;
#ifdef AGRESSIVE_TEST
if (le16_to_cpu(ex1->ee_len) >= 4)
return 0;
#endif
- if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
+ if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
return 1;
return 0;
}
@@ -1132,9 +1148,9 @@
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
struct ext4_ext_path *npath = NULL;
- int depth, len, err, next;
+ int depth, len, err, next, uninitialized = 0;
- BUG_ON(newext->ee_len == 0);
+ BUG_ON(ext4_ext_get_actual_len(newext) == 0);
depth = ext_depth(inode);
ex = path[depth].p_ext;
BUG_ON(path[depth].p_hdr == NULL);
@@ -1142,13 +1158,22 @@
/* try to insert block into found extent and return */
if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append %d block to %d:%d (from %llu)\n",
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
if ((err = ext4_ext_get_access(handle, inode, path + depth)))
return err;
- ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
- + le16_to_cpu(newext->ee_len));
+
+ /* ext4_can_extents_be_merged should have checked that either
+ * both extents are uninitialized, or both aren't. Thus we
+ * need to check any of them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if(uninitialized)
+ ext4_mark_uninitialized_ext(ex);
eh = path[depth].p_hdr;
nearex = ex;
goto merge;
@@ -1203,7 +1228,7 @@
ext_debug("first extent in the leaf: %d:%llu:%d\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len));
+ ext4_mark_uninitialized_ext(newext));
path[depth].p_ext = EXT_FIRST_EXTENT(eh);
} else if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
@@ -1216,7 +1241,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_mark_uninitialized_ext(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 2, nearex + 1, len);
}
@@ -1229,7 +1254,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 1, nearex, len);
path[depth].p_ext = nearex;
@@ -1248,8 +1273,13 @@
if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
break;
/* merge with next extent! */
- nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
- + le16_to_cpu(nearex[1].ee_len));
+ if (ext4_ext_is_uninitialized(nearex))
+ uninitialized = 1;
+ nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
+ + ext4_ext_get_actual_len(nearex + 1));
+ if(uninitialized)
+ ext4_mark_uninitialized_ext(nearex);
+
if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - nearex - 1)
* sizeof(struct ext4_extent);
@@ -1319,8 +1349,8 @@
end = le32_to_cpu(ex->ee_block);
if (block + num < end)
end = block + num;
- } else if (block >=
- le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+ } else if (block >= le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex)) {
/* need to allocate space after found extent */
start = block;
end = block + num;
@@ -1332,7 +1362,8 @@
* by found extent
*/
start = block;
- end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+ end = le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex);
if (block + num < end)
end = block + num;
exists = 1;
@@ -1348,7 +1379,7 @@
cbex.ec_type = EXT4_EXT_CACHE_GAP;
} else {
cbex.ec_block = le32_to_cpu(ex->ee_block);
- cbex.ec_len = le16_to_cpu(ex->ee_len);
+ cbex.ec_len = ext4_ext_get_actual_len(ex);
cbex.ec_start = ext_pblock(ex);
cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
}
@@ -1556,12 +1587,12 @@
unsigned long from, unsigned long to)
{
struct buffer_head *bh;
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
int i;
#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
spin_lock(&sbi->s_ext_stats_lock);
sbi->s_ext_blocks += ee_len;
sbi->s_ext_extents++;
@@ -1575,12 +1606,12 @@
}
#endif
if (from >= le32_to_cpu(ex->ee_block)
- && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
unsigned long num;
ext4_fsblk_t start;
- num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
- start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+ num = le32_to_cpu(ex->ee_block) + ee_len - from;
+ start = ext_pblock(ex) + ee_len - num;
ext_debug("free last %lu blocks starting %llu\n", num, start);
for (i = 0; i < num; i++) {
bh = sb_find_get_block(inode->i_sb, start + i);
@@ -1588,12 +1619,12 @@
}
ext4_free_blocks(handle, inode, start, num);
} else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
} else {
printk("strange request: removal(2) %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
}
return 0;
}
@@ -1607,7 +1638,7 @@
struct ext4_extent_header *eh;
unsigned a, b, block, num;
unsigned long ex_ee_block;
- unsigned short ex_ee_len;
+ unsigned short ex_ee_len, uninitialized = 0;
struct ext4_extent *ex;
ext_debug("truncate since %lu in leaf\n", start);
@@ -1622,7 +1653,9 @@
ex = EXT_LAST_EXTENT(eh);
ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ if(ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex_ee_len = ext4_ext_get_actual_len(ex);
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
@@ -1690,6 +1723,8 @@
ex->ee_block = cpu_to_le32(block);
ex->ee_len = cpu_to_le16(num);
+ if(uninitialized)
+ ext4_mark_uninitialized_ext(ex);
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
@@ -1699,7 +1734,7 @@
ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
}
if (correct_index && eh->eh_entries)
@@ -1974,7 +2009,7 @@
if ((ex = path[depth].p_ext)) {
unsigned long ee_block = le32_to_cpu(ex->ee_block);
ext4_fsblk_t ee_start = ext_pblock(ex);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
+ unsigned short ee_len;
/*
* Allow future support for preallocated extents to be added
@@ -1982,8 +2017,11 @@
* Uninitialized extents are treated as holes, except that
* we avoid (fail) allocating new blocks during a write.
*/
- if (ee_len > EXT_MAX_LEN)
+ if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN
+ && create != EXT4_CREATE_UNINITIALIZED_EXT)
goto out2;
+
+ ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
if (iblock >= ee_block && iblock < ee_block + ee_len) {
newblock = iblock - ee_block + ee_start;
@@ -1991,8 +2029,10 @@
allocated = ee_len - (iblock - ee_block);
ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
ee_block, ee_len, newblock);
- ext4_ext_put_in_cache(inode, ee_block, ee_len,
- ee_start, EXT4_EXT_CACHE_EXTENT);
+ /* Do not put uninitialized extent in the cache */
+ if(!ext4_ext_is_uninitialized(ex))
+ ext4_ext_put_in_cache(inode, ee_block, ee_len,
+ ee_start, EXT4_EXT_CACHE_EXTENT);
goto out;
}
}
@@ -2027,6 +2067,8 @@
newex.ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(allocated);
+ if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
+ ext4_mark_uninitialized_ext(&newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex);
if (err)
goto out2;
@@ -2038,8 +2080,10 @@
newblock = ext_pblock(&newex);
__set_bit(BH_New, &bh_result->b_state);
- ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
- EXT4_EXT_CACHE_EXTENT);
+ /* Cache only when it is _not_ an uninitialized extent */
+ if(create!=EXT4_CREATE_UNINITIALIZED_EXT)
+ ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+ EXT4_EXT_CACHE_EXTENT);
out:
if (allocated > max_blocks)
allocated = max_blocks;
Index: linux-2.6.19/fs/ext4/ioctl.c
===================================================================
--- linux-2.6.19.orig/fs/ext4/ioctl.c 2006-12-05 16:03:40.000000000 +0530
+++ linux-2.6.19/fs/ext4/ioctl.c 2006-12-05 16:03:43.000000000 +0530
@@ -248,6 +248,47 @@
return err;
}
+ case EXT4_IOC_PREALLOCATE: {
+ struct ext4_falloc_input input;
+ handle_t *handle;
+ ext4_fsblk_t block, max_blocks;
+ int ret = 0;
+ struct buffer_head map_bh;
+ unsigned int blkbits = inode->i_blkbits;
+
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if (copy_from_user(&input,
+ (struct ext4_falloc_input __user *) arg, sizeof(input)))
+ return -EFAULT;
+
+ if (input.len == 0)
+ return -EINVAL;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ return -ENOTTY;
+
+ block = EXT4_BLOCK_ALIGN(input.offset, blkbits) >> blkbits;
+ max_blocks = EXT4_BLOCK_ALIGN(input.len, blkbits) >> blkbits;
+ handle=ext4_journal_start(inode,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ while(ret>=0 && ret<max_blocks)
+ {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ ret = ext4_ext_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_CREATE_UNINITIALIZED_EXT, 1);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ return ret>0?0:ret;
+ }
+
default:
return -ENOTTY;
}
Index: linux-2.6.19/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.19.orig/include/linux/ext4_fs.h 2006-12-05 16:03:40.000000000 +0530
+++ linux-2.6.19/include/linux/ext4_fs.h 2006-12-05 16:03:53.000000000 +0530
@@ -102,6 +102,8 @@
EXT4_GOOD_OLD_FIRST_INO : \
(s)->s_first_ino)
#endif
+#define EXT4_BLOCK_ALIGN(size, blkbits) (((size)+(1 << blkbits)-1) & \
+ (~((1 << blkbits)-1)))
/*
* Macro-instructions used to manage fragments
@@ -118,6 +120,19 @@
#endif
/*
+ * Macro-instructions used to handle (mark/unmark/check/create) unitialized
+ * extents. Applications can issue an IOCTL for preallocation, which results
+ * in assigning unitialized extents to the file
+ */
+#define EXT4_CREATE_UNINITIALIZED_EXT 2
+#define ext4_mark_uninitialized_ext(ext) ((ext)->ee_len |= \
+ cpu_to_le16(0x8000))
+#define ext4_ext_is_uninitialized(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x8000)
+#define ext4_ext_get_actual_len(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x7FFF)
+
+/*
* Structure of a blocks group descriptor
*/
struct ext4_group_desc
@@ -225,6 +240,11 @@
__u32 free_blocks_count;
};
+/* The struct ext4_falloc_input in kernel, for EXT4_IOC_PREALLOCATE */
+struct ext4_falloc_input {
+ __u64 offset;
+ __u64 len;
+};
/*
* ioctl commands
@@ -242,6 +262,7 @@
#endif
#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
+#define EXT4_IOC_PREALLOCATE _IOW('f', 9, struct ext4_falloc_input)
/*
* ioctl commands in 32 bit emulation
^ permalink raw reply [flat|nested] 26+ messages in thread* Re: [RFC][Patch 1/1] Persistent preallocation in ext4 2006-12-05 13:43 [RFC][Patch 1/1] Persistent preallocation in ext4 Amit K. Arora @ 2006-12-06 5:58 ` Amit K. Arora 2006-12-12 1:28 ` Mingming Cao 2006-12-15 12:35 ` [RFC][Patch 1/2] " Amit K. Arora 2006-12-15 12:39 ` [RFC][Patch 2/2] " Amit K. Arora 1 sibling, 2 replies; 26+ messages in thread From: Amit K. Arora @ 2006-12-06 5:58 UTC (permalink / raw) To: linux-ext4; +Cc: suparna, cmm, suzuki This patch takes care of few suggestions given by Suzuki. Suzuki, Thanks! Signed-off-by: Amit Arora <aarora@in.ibm.com> --- fs/ext4/extents.c | 117 ++++++++++++++++++++++++++++++++---------------- fs/ext4/ioctl.c | 41 ++++++++++++++++ include/linux/ext4_fs.h | 21 ++++++++ 3 files changed, 141 insertions(+), 38 deletions(-) Index: linux-2.6.19/fs/ext4/extents.c =================================================================== --- linux-2.6.19.orig/fs/ext4/extents.c 2006-12-06 10:18:12.000000000 +0530 +++ linux-2.6.19/fs/ext4/extents.c 2006-12-06 10:23:02.000000000 +0530 @@ -282,7 +282,7 @@ } else if (path->p_ext) { ext_debug(" %d:%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - le16_to_cpu(path->p_ext->ee_len), + ext4_ext_get_actual_len(path->p_ext), ext_pblock(path->p_ext)); } else ext_debug(" []"); @@ -305,7 +305,7 @@ for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), - le16_to_cpu(ex->ee_len), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext_pblock(ex)); } ext_debug("\n"); } @@ -425,7 +425,7 @@ ext_debug(" -> %d:%llu:%d ", le32_to_cpu(path->p_ext->ee_block), ext_pblock(path->p_ext), - le16_to_cpu(path->p_ext->ee_len)); + ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { @@ -686,7 +686,7 @@ ext_debug("move %d:%llu:%d in new leaf %llu\n", le32_to_cpu(path[depth].p_ext->ee_block), ext_pblock(path[depth].p_ext), - le16_to_cpu(path[depth].p_ext->ee_len), + ext4_ext_get_actual_len(path[depth].p_ext), newblock); /*memmove(ex++, path[depth].p_ext++, sizeof(struct ext4_extent)); @@ -1097,7 +1097,20 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { - if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) != + unsigned short ext1_ee_len, ext2_ee_len; + + + /* + * Make sure that either both extents are uninitialized, or + * both are _not_. + */ + if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + return 0; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; @@ -1106,14 +1119,14 @@ * as an RO_COMPAT feature, refuse to merge to extents if * this can result in the top bit of ee_len being set. */ - if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN) + if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN) return 0; #ifdef AGRESSIVE_TEST if (le16_to_cpu(ex1->ee_len) >= 4) return 0; #endif - if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2)) + if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) return 1; return 0; } @@ -1132,9 +1145,9 @@ struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; - int depth, len, err, next; + int depth, len, err, next, uninitialized = 0; - BUG_ON(newext->ee_len == 0); + BUG_ON(ext4_ext_get_actual_len(newext) == 0); depth = ext_depth(inode); ex = path[depth].p_ext; BUG_ON(path[depth].p_hdr == NULL); @@ -1142,13 +1155,22 @@ /* try to insert block into found extent and return */ if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append %d block to %d:%d (from %llu)\n", - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - le16_to_cpu(ex->ee_len), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext_pblock(ex)); if ((err = ext4_ext_get_access(handle, inode, path + depth))) return err; - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len) - + le16_to_cpu(newext->ee_len)); + + /* ext4_can_extents_be_merged should have checked that either + * both extents are uninitialized, or both aren't. Thus we + * need to check any of them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if(uninitialized) + ext4_mark_uninitialized_ext(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1203,7 +1225,7 @@ ext_debug("first extent in the leaf: %d:%llu:%d\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len)); + ext4_ext_get_actual_len(newext)); path[depth].p_ext = EXT_FIRST_EXTENT(eh); } else if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { @@ -1216,7 +1238,7 @@ "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 2, nearex + 1, len); } @@ -1229,7 +1251,7 @@ "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 1, nearex, len); path[depth].p_ext = nearex; @@ -1248,8 +1270,13 @@ if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1)) break; /* merge with next extent! */ - nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len) - + le16_to_cpu(nearex[1].ee_len)); + if (ext4_ext_is_uninitialized(nearex)) + uninitialized = 1; + nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex) + + ext4_ext_get_actual_len(nearex + 1)); + if(uninitialized) + ext4_mark_uninitialized_ext(nearex); + if (nearex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - nearex - 1) * sizeof(struct ext4_extent); @@ -1319,8 +1346,8 @@ end = le32_to_cpu(ex->ee_block); if (block + num < end) end = block + num; - } else if (block >= - le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) { + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { /* need to allocate space after found extent */ start = block; end = block + num; @@ -1332,7 +1359,8 @@ * by found extent */ start = block; - end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len); + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); if (block + num < end) end = block + num; exists = 1; @@ -1348,7 +1376,7 @@ cbex.ec_type = EXT4_EXT_CACHE_GAP; } else { cbex.ec_block = le32_to_cpu(ex->ee_block); - cbex.ec_len = le16_to_cpu(ex->ee_len); + cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_start = ext_pblock(ex); cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } @@ -1556,12 +1584,12 @@ unsigned long from, unsigned long to) { struct buffer_head *bh; + unsigned short ee_len = ext4_ext_get_actual_len(ex); int i; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned short ee_len = le16_to_cpu(ex->ee_len); spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; @@ -1575,12 +1603,12 @@ } #endif if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ unsigned long num; ext4_fsblk_t start; - num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from; - start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num; + num = le32_to_cpu(ex->ee_block) + ee_len - from; + start = ext_pblock(ex) + ee_len - num; ext_debug("free last %lu blocks starting %llu\n", num, start); for (i = 0; i < num; i++) { bh = sb_find_get_block(inode->i_sb, start + i); @@ -1588,12 +1616,12 @@ } ext4_free_blocks(handle, inode, start, num); } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + from, to, le32_to_cpu(ex->ee_block), ee_len); } else { printk("strange request: removal(2) %lu-%lu from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + from, to, le32_to_cpu(ex->ee_block), ee_len); } return 0; } @@ -1607,7 +1635,7 @@ struct ext4_extent_header *eh; unsigned a, b, block, num; unsigned long ex_ee_block; - unsigned short ex_ee_len; + unsigned short ex_ee_len, uninitialized = 0; struct ext4_extent *ex; ext_debug("truncate since %lu in leaf\n", start); @@ -1622,7 +1650,9 @@ ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = le16_to_cpu(ex->ee_len); + if(ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex_ee_len = ext4_ext_get_actual_len(ex); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -1690,6 +1720,8 @@ ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); + if(uninitialized) + ext4_mark_uninitialized_ext(ex); err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -1699,7 +1731,7 @@ ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = le16_to_cpu(ex->ee_len); + ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) @@ -1974,7 +2006,7 @@ if ((ex = path[depth].p_ext)) { unsigned long ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext_pblock(ex); - unsigned short ee_len = le16_to_cpu(ex->ee_len); + unsigned short ee_len; /* * Allow future support for preallocated extents to be added @@ -1982,8 +2014,11 @@ * Uninitialized extents are treated as holes, except that * we avoid (fail) allocating new blocks during a write. */ - if (ee_len > EXT_MAX_LEN) + if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN + && create != EXT4_CREATE_UNINITIALIZED_EXT) goto out2; + + ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ if (iblock >= ee_block && iblock < ee_block + ee_len) { newblock = iblock - ee_block + ee_start; @@ -1991,8 +2026,10 @@ allocated = ee_len - (iblock - ee_block); ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, ee_block, ee_len, newblock); - ext4_ext_put_in_cache(inode, ee_block, ee_len, - ee_start, EXT4_EXT_CACHE_EXTENT); + /* Do not put uninitialized extent in the cache */ + if(!ext4_ext_is_uninitialized(ex)) + ext4_ext_put_in_cache(inode, ee_block, ee_len, + ee_start, EXT4_EXT_CACHE_EXTENT); goto out; } } @@ -2027,6 +2064,8 @@ newex.ee_block = cpu_to_le32(iblock); ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(allocated); + if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + ext4_mark_uninitialized_ext(&newex); err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) goto out2; @@ -2038,8 +2077,10 @@ newblock = ext_pblock(&newex); __set_bit(BH_New, &bh_result->b_state); - ext4_ext_put_in_cache(inode, iblock, allocated, newblock, - EXT4_EXT_CACHE_EXTENT); + /* Cache only when it is _not_ an uninitialized extent */ + if(create!=EXT4_CREATE_UNINITIALIZED_EXT) + ext4_ext_put_in_cache(inode, iblock, allocated, newblock, + EXT4_EXT_CACHE_EXTENT); out: if (allocated > max_blocks) allocated = max_blocks; Index: linux-2.6.19/fs/ext4/ioctl.c =================================================================== --- linux-2.6.19.orig/fs/ext4/ioctl.c 2006-12-06 10:18:12.000000000 +0530 +++ linux-2.6.19/fs/ext4/ioctl.c 2006-12-06 10:18:15.000000000 +0530 @@ -248,6 +248,47 @@ return err; } + case EXT4_IOC_PREALLOCATE: { + struct ext4_falloc_input input; + handle_t *handle; + ext4_fsblk_t block, max_blocks; + int ret = 0; + struct buffer_head map_bh; + unsigned int blkbits = inode->i_blkbits; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (copy_from_user(&input, + (struct ext4_falloc_input __user *) arg, sizeof(input))) + return -EFAULT; + + if (input.len == 0) + return -EINVAL; + + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return -ENOTTY; + + block = EXT4_BLOCK_ALIGN(input.offset, blkbits) >> blkbits; + max_blocks = EXT4_BLOCK_ALIGN(input.len, blkbits) >> blkbits; + handle=ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks); + if (IS_ERR(handle)) + return PTR_ERR(handle); + while(ret>=0 && ret<max_blocks) + { + block = block + ret; + max_blocks = max_blocks - ret; + ret = ext4_ext_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_CREATE_UNINITIALIZED_EXT, 1); + } + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return ret>0?0:ret; + } + default: return -ENOTTY; } Index: linux-2.6.19/include/linux/ext4_fs.h =================================================================== --- linux-2.6.19.orig/include/linux/ext4_fs.h 2006-12-06 10:18:12.000000000 +0530 +++ linux-2.6.19/include/linux/ext4_fs.h 2006-12-06 10:18:16.000000000 +0530 @@ -102,6 +102,8 @@ EXT4_GOOD_OLD_FIRST_INO : \ (s)->s_first_ino) #endif +#define EXT4_BLOCK_ALIGN(size, blkbits) (((size)+(1 << blkbits)-1) & \ + (~((1 << blkbits)-1))) /* * Macro-instructions used to manage fragments @@ -118,6 +120,19 @@ #endif /* + * Macro-instructions used to handle (mark/unmark/check/create) unitialized + * extents. Applications can issue an IOCTL for preallocation, which results + * in assigning unitialized extents to the file + */ +#define EXT4_CREATE_UNINITIALIZED_EXT 2 +#define ext4_mark_uninitialized_ext(ext) ((ext)->ee_len |= \ + cpu_to_le16(0x8000)) +#define ext4_ext_is_uninitialized(ext) ((le16_to_cpu((ext)->ee_len))& \ + 0x8000) +#define ext4_ext_get_actual_len(ext) ((le16_to_cpu((ext)->ee_len))& \ + 0x7FFF) + +/* * Structure of a blocks group descriptor */ struct ext4_group_desc @@ -225,6 +240,11 @@ __u32 free_blocks_count; }; +/* The struct ext4_falloc_input in kernel, for EXT4_IOC_PREALLOCATE */ +struct ext4_falloc_input { + __u64 offset; + __u64 len; +}; /* * ioctl commands @@ -242,6 +262,7 @@ #endif #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_PREALLOCATE _IOW('f', 9, struct ext4_falloc_input) /* * ioctl commands in 32 bit emulation ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/1] Persistent preallocation in ext4 2006-12-06 5:58 ` Amit K. Arora @ 2006-12-12 1:28 ` Mingming Cao 2006-12-12 6:23 ` Amit K. Arora 2006-12-15 12:35 ` [RFC][Patch 1/2] " Amit K. Arora 1 sibling, 1 reply; 26+ messages in thread From: Mingming Cao @ 2006-12-12 1:28 UTC (permalink / raw) To: Amit K. Arora; +Cc: linux-ext4, suparna, suzuki On Wed, 2006-12-06 at 11:28 +0530, Amit K. Arora wrote: > @@ -1142,13 +1155,22 @@ > /* try to insert block into found extent and return */ > if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { > ext_debug("append %d block to %d:%d (from %llu)\n", > - le16_to_cpu(newext->ee_len), > + ext4_ext_get_actual_len(newext), > le32_to_cpu(ex->ee_block), > - le16_to_cpu(ex->ee_len), ext_pblock(ex)); > + ext4_ext_get_actual_len(ex), ext_pblock(ex)); > if ((err = ext4_ext_get_access(handle, inode, path + depth))) > return err; > - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len) > - + le16_to_cpu(newext->ee_len)); > + > + /* ext4_can_extents_be_merged should have checked that either > + * both extents are uninitialized, or both aren't. Thus we > + * need to check any of them here. > + */ > + if (ext4_ext_is_uninitialized(ex)) > + uninitialized = 1; > + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) > + + ext4_ext_get_actual_len(newext)); > + if(uninitialized) > + ext4_mark_uninitialized_ext(ex); > eh = path[depth].p_hdr; > nearex = ex; > goto merge; Hmm, I missed the point to re-mark an uninitialized extent here. If ex is an uninitialized extent, the mark(the first bit the ee_len) shall still there after the update, isn't? We already make sure that two large uninitialized extent can't get merged if the resulting length will take the first bit, which used as the mark of uninitialized extent. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/1] Persistent preallocation in ext4 2006-12-12 1:28 ` Mingming Cao @ 2006-12-12 6:23 ` Amit K. Arora 2006-12-13 0:20 ` Mingming Cao 0 siblings, 1 reply; 26+ messages in thread From: Amit K. Arora @ 2006-12-12 6:23 UTC (permalink / raw) To: Mingming Cao; +Cc: linux-ext4, suparna, suzuki Hi Mingming, On Mon, Dec 11, 2006 at 05:28:15PM -0800, Mingming Cao wrote: > On Wed, 2006-12-06 at 11:28 +0530, Amit K. Arora wrote: > > > @@ -1142,13 +1155,22 @@ > > /* try to insert block into found extent and return */ > > if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { > > ext_debug("append %d block to %d:%d (from %llu)\n", > > - le16_to_cpu(newext->ee_len), > > + ext4_ext_get_actual_len(newext), > > le32_to_cpu(ex->ee_block), > > - le16_to_cpu(ex->ee_len), ext_pblock(ex)); > > + ext4_ext_get_actual_len(ex), ext_pblock(ex)); > > if ((err = ext4_ext_get_access(handle, inode, path + depth))) > > return err; > > - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len) > > - + le16_to_cpu(newext->ee_len)); > > + > > + /* ext4_can_extents_be_merged should have checked that either > > + * both extents are uninitialized, or both aren't. Thus we > > + * need to check any of them here. > > + */ > > + if (ext4_ext_is_uninitialized(ex)) > > + uninitialized = 1; > > + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) > > + + ext4_ext_get_actual_len(newext)); Above line will "remove" the uninitialized bit from "ex", if it was set. We call ext4_ext_get_actual_len() to get the "actual" lengths of the two extents, which removes this MSB in ee_len (MSB in ee_len is used to mark an extent uninitialized). Now, we do this because if lengths of two uninitialized extents will be added as it is (i.e. without masking out the MSB in the length), it will result in removing the MSB in ee_len. For example, 0x8002 + 0x8003 => 0x10005 => 0x5 (since ee_len is 16 bit). That is why just before this line, we save the "state" of this extent, whether it was uninitialized or not. And, we restore this "state" below. > > + if(uninitialized) > > + ext4_mark_uninitialized_ext(ex); > > eh = path[depth].p_hdr; > > nearex = ex; > > goto merge; > > Hmm, I missed the point to re-mark an uninitialized extent here. If ex > is an uninitialized extent, the mark(the first bit the ee_len) shall > still there after the update, isn't? We already make sure that two > large uninitialized extent can't get merged if the resulting length will > take the first bit, which used as the mark of uninitialized extent. Please get back if you do not agree with the explanation above and if I am missing something here. Thanks! Regards, Amit Arora ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/1] Persistent preallocation in ext4 2006-12-12 6:23 ` Amit K. Arora @ 2006-12-13 0:20 ` Mingming Cao 2006-12-13 10:01 ` Amit K. Arora 0 siblings, 1 reply; 26+ messages in thread From: Mingming Cao @ 2006-12-13 0:20 UTC (permalink / raw) To: Amit K. Arora; +Cc: linux-ext4, suparna, suzuki On Tue, 2006-12-12 at 11:53 +0530, Amit K. Arora wrote: > Hi Mingming, > Hi Amit, > On Mon, Dec 11, 2006 at 05:28:15PM -0800, Mingming Cao wrote: > > On Wed, 2006-12-06 at 11:28 +0530, Amit K. Arora wrote: > > > > > @@ -1142,13 +1155,22 @@ > > > /* try to insert block into found extent and return */ > > > if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { > > > ext_debug("append %d block to %d:%d (from %llu)\n", > > > - le16_to_cpu(newext->ee_len), > > > + ext4_ext_get_actual_len(newext), > > > le32_to_cpu(ex->ee_block), > > > - le16_to_cpu(ex->ee_len), ext_pblock(ex)); > > > + ext4_ext_get_actual_len(ex), ext_pblock(ex)); > > > if ((err = ext4_ext_get_access(handle, inode, path + depth))) > > > return err; > > > - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len) > > > - + le16_to_cpu(newext->ee_len)); > > > + > > > + /* ext4_can_extents_be_merged should have checked that either > > > + * both extents are uninitialized, or both aren't. Thus we > > > + * need to check any of them here. > > > + */ > > > + if (ext4_ext_is_uninitialized(ex)) > > > + uninitialized = 1; > > > + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) > > > + + ext4_ext_get_actual_len(newext)); > > Above line will "remove" the uninitialized bit from "ex", if it was set. > We call ext4_ext_get_actual_len() to get the "actual" lengths of the two > extents, which removes this MSB in ee_len (MSB in ee_len is used to mark > an extent uninitialized). Now, we do this because if lengths of two > uninitialized extents will be added as it is (i.e. without masking out > the MSB in the length), it will result in removing the MSB in ee_len. > For example, 0x8002 + 0x8003 => 0x10005 => 0x5 (since ee_len is 16 bit). > > That is why just before this line, we save the "state" of this extent, > whether it was uninitialized or not. And, we restore this "state" below. Ah...you are right:) More comments below... > Index: linux-2.6.19/fs/ext4/ioctl.c > =================================================================== > --- linux-2.6.19.orig/fs/ext4/ioctl.c 2006-12-06 10:18:12.000000000 +0530 > +++ linux-2.6.19/fs/ext4/ioctl.c 2006-12-06 10:18:15.000000000 +0530 > @@ -248,6 +248,47 @@ > return err; > } > > + case EXT4_IOC_PREALLOCATE: { > + struct ext4_falloc_input input; > + handle_t *handle; > + ext4_fsblk_t block, max_blocks; > + int ret = 0; > + struct buffer_head map_bh; > + unsigned int blkbits = inode->i_blkbits; > + > + if (IS_RDONLY(inode)) > + return -EROFS; > + > + if (copy_from_user(&input, > + (struct ext4_falloc_input __user *) arg, sizeof(input))) > + return -EFAULT; > + > + if (input.len == 0) > + return -EINVAL; > + > + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) > + return -ENOTTY; > Supporting preallocation for extent based files seems fairly straightforward. I agree we should look at this first. After get this done, it probably worth re-consider whether to support preallocation for non-extent based files on ext4. I could imagine user upgrade from ext3 to ext4, and expecting to use preallocation on those existing files.... > + > + block = EXT4_BLOCK_ALIGN(input.offset, blkbits) >> blkbits; > + max_blocks = EXT4_BLOCK_ALIGN(input.len, blkbits) >> blkbits; > + handle=ext4_journal_start(inode, > + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks); > + if (IS_ERR(handle)) > + return PTR_ERR(handle); > + while(ret>=0 && ret<max_blocks) > + { > + block = block + ret; > + max_blocks = max_blocks - ret; > + ret = ext4_ext_get_blocks(handle, inode, block, > + max_blocks, &map_bh, > + EXT4_CREATE_UNINITIALIZED_EXT, 1); Since the interface takes offset and number of blocks to allocate, I assuming we are going to handle holes in preallocation, thus, we cannot not mark the extend_size flag to 1 when calling ext4_ext_get_blocks. I think we should update i_size and i_disksize after preallocation. Oh, to protect parallel updating i_size, we have to take i_mutex down. > + } > + ext4_mark_inode_dirty(handle, inode); > + ext4_journal_stop(handle); > + Error code returned by ext4_journal_stop() is being ignored here, is this right? Well, there are other places in ext34/ioctl.c which ignore the return returned by ext4_journal_stop(), maybe should fix this in a separate patch. > + return ret>0?0:ret; > + } > > Oh, what if we failed to allocate the full amount of blocks? i.e, the ext4_ext_get_blocks() returns -ENOSPC error and exit the loop early. Are we going to return error, or try something like if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry I wonder it might be useful to return the actual number of blocks preallocated back to the application. Mingming ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/1] Persistent preallocation in ext4 2006-12-13 0:20 ` Mingming Cao @ 2006-12-13 10:01 ` Amit K. Arora 2006-12-13 13:36 ` Dave Kleikamp 0 siblings, 1 reply; 26+ messages in thread From: Amit K. Arora @ 2006-12-13 10:01 UTC (permalink / raw) To: Mingming Cao; +Cc: linux-ext4, suparna, suzuki On Tue, Dec 12, 2006 at 04:20:38PM -0800, Mingming Cao wrote: > On Tue, 2006-12-12 at 11:53 +0530, Amit K. Arora wrote: > > + > > + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) > > + return -ENOTTY; > > > > Supporting preallocation for extent based files seems fairly > straightforward. I agree we should look at this first. After get this > done, it probably worth re-consider whether to support preallocation for > non-extent based files on ext4. I could imagine user upgrade from ext3 > to ext4, and expecting to use preallocation on those existing files.... I gave a thought on this initially. But, I was not sure how we should implement preallocation in a non-extent based file. Using extents we can mark a set of blocks as unitialized, but how will we do this for non-extent based files ? If we do not have a way to mark blocks uninitialized, when someone will try to read from a preallocated block, it will return junk/stale data instead of zeroes. But, if we can think of a solution here then it will be as simple as removing the check above and replacing "ext4_ext_get_blocks()" with "ext4_get_blocks_wrap()" in the while() loop. > > > + > > + block = EXT4_BLOCK_ALIGN(input.offset, blkbits) >> blkbits; > > + max_blocks = EXT4_BLOCK_ALIGN(input.len, blkbits) >> blkbits; I was wondering if I should change above lines to this : + block = input.offset >> blkbits; + max_blocks = (EXT4_BLOCK_ALIGN(input.len+input.offset, blkbits) >> blkbits) - block; Reason is that the block which contains the offset, should also be preallocated. And the max_blocks should be calculated accordingly. > > + while(ret>=0 && ret<max_blocks) > > + { > > + block = block + ret; > > + max_blocks = max_blocks - ret; > > + ret = ext4_ext_get_blocks(handle, inode, block, > > + max_blocks, &map_bh, > > + EXT4_CREATE_UNINITIALIZED_EXT, 1); > > Since the interface takes offset and number of blocks to allocate, I > assuming we are going to handle holes in preallocation, thus, we cannot > not mark the extend_size flag to 1 when calling ext4_ext_get_blocks. > > I think we should update i_size and i_disksize after preallocation. Oh, > to protect parallel updating i_size, we have to take i_mutex down. Okay. So, is this what you want to be done here : +retry: + ret = 0; + while(ret>=0 && ret<max_blocks) + { + block = block + ret; + max_blocks = max_blocks - ret; + ret = ext4_ext_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_CREATE_UNINITIALIZED_EXT,0); + if(ret > 0 && test_bit(BH_New, &map_bh.b_state)) + nblocks = nblocks + ret; + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, + &retries)) + goto retry; + + if(nblocks) { + mutex_lock(&inode->i_mutex); + inode->i_size = inode->i_size + (nblocks >> blkbits); + EXT4_I(inode)->i_disksize = inode->i_size; + mutex_unlock(&inode->i_mutex); + } > > > + } > > + ext4_mark_inode_dirty(handle, inode); > > + ext4_journal_stop(handle); > > + > > Error code returned by ext4_journal_stop() is being ignored here, is > this right? > Well, there are other places in ext34/ioctl.c which ignore the return > returned by ext4_journal_stop(), maybe should fix this in a separate > patch. Agreed. I think following should take care of it: + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if(ret > 0) + ret = ret2; + return ret > 0 ? nblocks : ret; > > + return ret>0?0:ret; > > + } > > > > > Oh, what if we failed to allocate the full amount of blocks? i.e, the > ext4_ext_get_blocks() returns -ENOSPC error and exit the loop early. Are > we going to return error, or try something like > > if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) > goto retry > > I wonder it might be useful to return the actual number of blocks > preallocated back to the application. Ok. Yes, makes sense. We can return the number of "new" blocks like this: + return ret > 0 ? nblocks : ret; Please let me know if you agree with the above set of changes, and any further comments you have. I will then update and test the new patch and post it again. Thanks! Regards, Amit Arora ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/1] Persistent preallocation in ext4 2006-12-13 10:01 ` Amit K. Arora @ 2006-12-13 13:36 ` Dave Kleikamp 2006-12-13 15:38 ` Suparna Bhattacharya 2006-12-13 15:54 ` Mingming Cao 0 siblings, 2 replies; 26+ messages in thread From: Dave Kleikamp @ 2006-12-13 13:36 UTC (permalink / raw) To: Amit K. Arora; +Cc: Mingming Cao, linux-ext4, suparna, suzuki On Wed, 2006-12-13 at 15:31 +0530, Amit K. Arora wrote: > On Tue, Dec 12, 2006 at 04:20:38PM -0800, Mingming Cao wrote: > > On Tue, 2006-12-12 at 11:53 +0530, Amit K. Arora wrote: > > Supporting preallocation for extent based files seems fairly > > straightforward. I agree we should look at this first. After get this > > done, it probably worth re-consider whether to support preallocation for > > non-extent based files on ext4. I could imagine user upgrade from ext3 > > to ext4, and expecting to use preallocation on those existing files.... I disagree here. Why add the complexity for what is going to be a rare case? In cases where a user is going to benefit from preallocation, she'll probably also benefit from extents, and would be better off making a copy of the file, thus converting it to extents. > I gave a thought on this initially. But, I was not sure how we should > implement preallocation in a non-extent based file. Using extents we can > mark a set of blocks as unitialized, but how will we do this for > non-extent based files ? If we do not have a way to mark blocks > uninitialized, when someone will try to read from a preallocated block, > it will return junk/stale data instead of zeroes. If anything, the block-based preallocation could initialize all of the data to zero. It would be slow, but it would still provide the correct function and result in contiguous allocation. Shaggy -- David Kleikamp IBM Linux Technology Center ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/1] Persistent preallocation in ext4 2006-12-13 13:36 ` Dave Kleikamp @ 2006-12-13 15:38 ` Suparna Bhattacharya 2006-12-13 15:54 ` Mingming Cao 1 sibling, 0 replies; 26+ messages in thread From: Suparna Bhattacharya @ 2006-12-13 15:38 UTC (permalink / raw) To: Dave Kleikamp; +Cc: Amit K. Arora, Mingming Cao, linux-ext4, suzuki On Wed, Dec 13, 2006 at 07:36:29AM -0600, Dave Kleikamp wrote: > On Wed, 2006-12-13 at 15:31 +0530, Amit K. Arora wrote: > > On Tue, Dec 12, 2006 at 04:20:38PM -0800, Mingming Cao wrote: > > > On Tue, 2006-12-12 at 11:53 +0530, Amit K. Arora wrote: > > > > Supporting preallocation for extent based files seems fairly > > > straightforward. I agree we should look at this first. After get this > > > done, it probably worth re-consider whether to support preallocation for > > > non-extent based files on ext4. I could imagine user upgrade from ext3 > > > to ext4, and expecting to use preallocation on those existing files.... > > I disagree here. Why add the complexity for what is going to be a rare > case? In cases where a user is going to benefit from preallocation, > she'll probably also benefit from extents, and would be better off > making a copy of the file, thus converting it to extents. > > > I gave a thought on this initially. But, I was not sure how we should > > implement preallocation in a non-extent based file. Using extents we can > > mark a set of blocks as unitialized, but how will we do this for > > non-extent based files ? If we do not have a way to mark blocks > > uninitialized, when someone will try to read from a preallocated block, > > it will return junk/stale data instead of zeroes. > > If anything, the block-based preallocation could initialize all of the > data to zero. It would be slow, but it would still provide the correct > function and result in contiguous allocation. And posix_fallocate does that already ... Regards Suparna > > Shaggy > -- > David Kleikamp > IBM Linux Technology Center > > - > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Suparna Bhattacharya (suparna@in.ibm.com) Linux Technology Center IBM Software Lab, India ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/1] Persistent preallocation in ext4 2006-12-13 13:36 ` Dave Kleikamp 2006-12-13 15:38 ` Suparna Bhattacharya @ 2006-12-13 15:54 ` Mingming Cao 1 sibling, 0 replies; 26+ messages in thread From: Mingming Cao @ 2006-12-13 15:54 UTC (permalink / raw) To: Dave Kleikamp; +Cc: Amit K. Arora, linux-ext4, suparna, suzuki Dave Kleikamp wrote: > > If anything, the block-based preallocation could initialize all of the > data to zero. It would be slow, but it would still provide the correct > function and result in contiguous allocation. > Agree. That seems goood enough. Mingming ^ permalink raw reply [flat|nested] 26+ messages in thread
* [RFC][Patch 1/2] Persistent preallocation in ext4 2006-12-06 5:58 ` Amit K. Arora 2006-12-12 1:28 ` Mingming Cao @ 2006-12-15 12:35 ` Amit K. Arora 2006-12-19 11:05 ` Amit K. Arora 2006-12-27 23:30 ` Mingming Cao 1 sibling, 2 replies; 26+ messages in thread From: Amit K. Arora @ 2006-12-15 12:35 UTC (permalink / raw) To: linux-ext4; +Cc: suparna, cmm, suzuki, alex This is the first patch in the set of two. It implements the ioctl which will be used for persistent preallocation. It is a respun of the previous patch which was posted earlier, and includes following changes: * Takes care of review comments by Mingming * The declaration of extent related macros are now moved to ext4_fs_extent.h (from ext4_fs.h) * Updated the logic to calculate block and max_blocks in ext4/ioctl.c, which is used to call get_blocks. It does _not_ take care of implementing persistent preallocation for non-extent based files. It is because of the following reasons: * It is being considered as a rare case * Users can/should convert their file(s) to extent format to use this feature * Moreover, posix_fallocate() can be used for this purpose, if the user does not want to convert the file(s) to the extent based format. Signed-off-by: Amit Arora (aarora@in.ibm.com) --- fs/ext4/extents.c | 116 ++++++++++++++++++++++++++-------------- fs/ext4/ioctl.c | 59 ++++++++++++++++++++ include/linux/ext4_fs.h | 13 ++++ include/linux/ext4_fs_extents.h | 13 ++++ 4 files changed, 163 insertions(+), 38 deletions(-) Index: linux-2.6.19.prealloc/fs/ext4/extents.c =================================================================== --- linux-2.6.19.prealloc.orig/fs/ext4/extents.c 2006-12-15 16:44:35.000000000 +0530 +++ linux-2.6.19.prealloc/fs/ext4/extents.c 2006-12-15 17:53:48.000000000 +0530 @@ -282,7 +282,7 @@ } else if (path->p_ext) { ext_debug(" %d:%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - le16_to_cpu(path->p_ext->ee_len), + ext4_ext_get_actual_len(path->p_ext), ext_pblock(path->p_ext)); } else ext_debug(" []"); @@ -305,7 +305,7 @@ for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), - le16_to_cpu(ex->ee_len), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext_pblock(ex)); } ext_debug("\n"); } @@ -425,7 +425,7 @@ ext_debug(" -> %d:%llu:%d ", le32_to_cpu(path->p_ext->ee_block), ext_pblock(path->p_ext), - le16_to_cpu(path->p_ext->ee_len)); + ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { @@ -686,7 +686,7 @@ ext_debug("move %d:%llu:%d in new leaf %llu\n", le32_to_cpu(path[depth].p_ext->ee_block), ext_pblock(path[depth].p_ext), - le16_to_cpu(path[depth].p_ext->ee_len), + ext4_ext_get_actual_len(path[depth].p_ext), newblock); /*memmove(ex++, path[depth].p_ext++, sizeof(struct ext4_extent)); @@ -1097,7 +1097,19 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { - if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) != + unsigned short ext1_ee_len, ext2_ee_len; + + /* + * Make sure that either both extents are uninitialized, or + * both are _not_. + */ + if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + return 0; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; @@ -1106,14 +1118,14 @@ * as an RO_COMPAT feature, refuse to merge to extents if * this can result in the top bit of ee_len being set. */ - if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN) + if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN) return 0; #ifdef AGRESSIVE_TEST if (le16_to_cpu(ex1->ee_len) >= 4) return 0; #endif - if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2)) + if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) return 1; return 0; } @@ -1132,9 +1144,9 @@ struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; - int depth, len, err, next; + int depth, len, err, next, uninitialized = 0; - BUG_ON(newext->ee_len == 0); + BUG_ON(ext4_ext_get_actual_len(newext) == 0); depth = ext_depth(inode); ex = path[depth].p_ext; BUG_ON(path[depth].p_hdr == NULL); @@ -1142,13 +1154,22 @@ /* try to insert block into found extent and return */ if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append %d block to %d:%d (from %llu)\n", - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - le16_to_cpu(ex->ee_len), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext_pblock(ex)); if ((err = ext4_ext_get_access(handle, inode, path + depth))) return err; - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len) - + le16_to_cpu(newext->ee_len)); + + /* ext4_can_extents_be_merged should have checked that either + * both extents are uninitialized, or both aren't. Thus we + * need to check any of them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if(uninitialized) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1203,7 +1224,7 @@ ext_debug("first extent in the leaf: %d:%llu:%d\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len)); + ext4_ext_get_actual_len(newext)); path[depth].p_ext = EXT_FIRST_EXTENT(eh); } else if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { @@ -1216,7 +1237,7 @@ "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 2, nearex + 1, len); } @@ -1229,7 +1250,7 @@ "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 1, nearex, len); path[depth].p_ext = nearex; @@ -1248,8 +1269,13 @@ if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1)) break; /* merge with next extent! */ - nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len) - + le16_to_cpu(nearex[1].ee_len)); + if (ext4_ext_is_uninitialized(nearex)) + uninitialized = 1; + nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex) + + ext4_ext_get_actual_len(nearex + 1)); + if(uninitialized) + ext4_ext_mark_uninitialized(nearex); + if (nearex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - nearex - 1) * sizeof(struct ext4_extent); @@ -1319,8 +1345,8 @@ end = le32_to_cpu(ex->ee_block); if (block + num < end) end = block + num; - } else if (block >= - le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) { + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { /* need to allocate space after found extent */ start = block; end = block + num; @@ -1332,7 +1358,8 @@ * by found extent */ start = block; - end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len); + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); if (block + num < end) end = block + num; exists = 1; @@ -1348,7 +1375,7 @@ cbex.ec_type = EXT4_EXT_CACHE_GAP; } else { cbex.ec_block = le32_to_cpu(ex->ee_block); - cbex.ec_len = le16_to_cpu(ex->ee_len); + cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_start = ext_pblock(ex); cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } @@ -1556,12 +1583,12 @@ unsigned long from, unsigned long to) { struct buffer_head *bh; + unsigned short ee_len = ext4_ext_get_actual_len(ex); int i; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned short ee_len = le16_to_cpu(ex->ee_len); spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; @@ -1575,12 +1602,12 @@ } #endif if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ unsigned long num; ext4_fsblk_t start; - num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from; - start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num; + num = le32_to_cpu(ex->ee_block) + ee_len - from; + start = ext_pblock(ex) + ee_len - num; ext_debug("free last %lu blocks starting %llu\n", num, start); for (i = 0; i < num; i++) { bh = sb_find_get_block(inode->i_sb, start + i); @@ -1588,12 +1615,12 @@ } ext4_free_blocks(handle, inode, start, num); } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + from, to, le32_to_cpu(ex->ee_block), ee_len); } else { printk("strange request: removal(2) %lu-%lu from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + from, to, le32_to_cpu(ex->ee_block), ee_len); } return 0; } @@ -1607,7 +1634,7 @@ struct ext4_extent_header *eh; unsigned a, b, block, num; unsigned long ex_ee_block; - unsigned short ex_ee_len; + unsigned short ex_ee_len, uninitialized = 0; struct ext4_extent *ex; ext_debug("truncate since %lu in leaf\n", start); @@ -1622,7 +1649,9 @@ ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = le16_to_cpu(ex->ee_len); + if(ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex_ee_len = ext4_ext_get_actual_len(ex); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -1690,6 +1719,8 @@ ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); + if(uninitialized) + ext4_ext_mark_uninitialized(ex); err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -1699,7 +1730,7 @@ ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = le16_to_cpu(ex->ee_len); + ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) @@ -1974,7 +2005,7 @@ if ((ex = path[depth].p_ext)) { unsigned long ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext_pblock(ex); - unsigned short ee_len = le16_to_cpu(ex->ee_len); + unsigned short ee_len; /* * Allow future support for preallocated extents to be added @@ -1982,8 +2013,11 @@ * Uninitialized extents are treated as holes, except that * we avoid (fail) allocating new blocks during a write. */ - if (ee_len > EXT_MAX_LEN) + if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN + && create != EXT4_CREATE_UNINITIALIZED_EXT) goto out2; + + ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ if (iblock >= ee_block && iblock < ee_block + ee_len) { newblock = iblock - ee_block + ee_start; @@ -1991,8 +2025,10 @@ allocated = ee_len - (iblock - ee_block); ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, ee_block, ee_len, newblock); - ext4_ext_put_in_cache(inode, ee_block, ee_len, - ee_start, EXT4_EXT_CACHE_EXTENT); + /* Do not put uninitialized extent in the cache */ + if(!ext4_ext_is_uninitialized(ex)) + ext4_ext_put_in_cache(inode, ee_block, ee_len, + ee_start, EXT4_EXT_CACHE_EXTENT); goto out; } } @@ -2027,6 +2063,8 @@ newex.ee_block = cpu_to_le32(iblock); ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(allocated); + if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + ext4_ext_mark_uninitialized(&newex); err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) goto out2; @@ -2038,8 +2076,10 @@ newblock = ext_pblock(&newex); __set_bit(BH_New, &bh_result->b_state); - ext4_ext_put_in_cache(inode, iblock, allocated, newblock, - EXT4_EXT_CACHE_EXTENT); + /* Cache only when it is _not_ an uninitialized extent */ + if(create!=EXT4_CREATE_UNINITIALIZED_EXT) + ext4_ext_put_in_cache(inode, iblock, allocated, newblock, + EXT4_EXT_CACHE_EXTENT); out: if (allocated > max_blocks) allocated = max_blocks; Index: linux-2.6.19.prealloc/fs/ext4/ioctl.c =================================================================== --- linux-2.6.19.prealloc.orig/fs/ext4/ioctl.c 2006-12-15 16:44:35.000000000 +0530 +++ linux-2.6.19.prealloc/fs/ext4/ioctl.c 2006-12-15 17:47:00.000000000 +0530 @@ -248,6 +248,65 @@ return err; } + case EXT4_IOC_PREALLOCATE: { + struct ext4_falloc_input input; + handle_t *handle; + ext4_fsblk_t block, max_blocks; + int ret, ret2, nblocks = 0, retries = 0; + struct buffer_head map_bh; + unsigned int blkbits = inode->i_blkbits; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (copy_from_user(&input, + (struct ext4_falloc_input __user *) arg, sizeof(input))) + return -EFAULT; + + if (input.len == 0) + return -EINVAL; + + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return -ENOTTY; + + block = input.offset >> blkbits; + max_blocks = (EXT4_BLOCK_ALIGN(input.len + input.offset, + blkbits) >> blkbits) - block; + handle=ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks); + if (IS_ERR(handle)) + return PTR_ERR(handle); +retry: + ret = 0; + while(ret>=0 && ret<max_blocks) + { + block = block + ret; + max_blocks = max_blocks - ret; + ret = ext4_ext_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_CREATE_UNINITIALIZED_EXT, 0); + if(ret > 0 && test_bit(BH_New, &map_bh.b_state)) + nblocks = nblocks + ret; + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, + &retries)) + goto retry; + + if(nblocks) { + mutex_lock(&inode->i_mutex); + inode->i_size = inode->i_size + (nblocks >> blkbits); + EXT4_I(inode)->i_disksize = inode->i_size; + mutex_unlock(&inode->i_mutex); + } + + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if(ret > 0) + ret = ret2; + + return ret > 0 ? nblocks : ret; + } + default: return -ENOTTY; } Index: linux-2.6.19.prealloc/include/linux/ext4_fs.h =================================================================== --- linux-2.6.19.prealloc.orig/include/linux/ext4_fs.h 2006-12-15 16:44:35.000000000 +0530 +++ linux-2.6.19.prealloc/include/linux/ext4_fs.h 2006-12-15 16:52:18.000000000 +0530 @@ -102,6 +102,8 @@ EXT4_GOOD_OLD_FIRST_INO : \ (s)->s_first_ino) #endif +#define EXT4_BLOCK_ALIGN(size, blkbits) (((size)+(1 << blkbits)-1) & \ + (~((1 << blkbits)-1))) /* * Macro-instructions used to manage fragments @@ -225,6 +227,16 @@ __u32 free_blocks_count; }; +/* The struct ext4_falloc_input in kernel, for EXT4_IOC_PREALLOCATE */ +struct ext4_falloc_input { + __u64 offset; + __u64 len; +}; + +/* Following is used by preallocation logic to tell get_blocks() that we + * want uninitialzed extents. + */ +#define EXT4_CREATE_UNINITIALIZED_EXT 2 /* * ioctl commands @@ -242,6 +254,7 @@ #endif #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_PREALLOCATE _IOW('f', 9, struct ext4_falloc_input) /* * ioctl commands in 32 bit emulation Index: linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h =================================================================== --- linux-2.6.19.prealloc.orig/include/linux/ext4_fs_extents.h 2006-12-15 16:44:35.000000000 +0530 +++ linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h 2006-12-15 17:50:08.000000000 +0530 @@ -125,6 +125,19 @@ #define EXT4_EXT_CACHE_EXTENT 2 /* + * Macro-instructions to handle (mark/unmark/check/create) unitialized + * extents. Applications can issue an IOCTL for preallocation, which results + * in assigning unitialized extents to the file. + */ +#define ext4_ext_mark_uninitialized(ext) ((ext)->ee_len |= \ + cpu_to_le16(0x8000)) +#define ext4_ext_is_uninitialized(ext) ((le16_to_cpu((ext)->ee_len))& \ + 0x8000) +#define ext4_ext_get_actual_len(ext) ((le16_to_cpu((ext)->ee_len))& \ + 0x7FFF) + + +/* * to be called by ext4_ext_walk_space() * negative retcode - error * positive retcode - signal for ext4_ext_walk_space(), see below ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/2] Persistent preallocation in ext4 2006-12-15 12:35 ` [RFC][Patch 1/2] " Amit K. Arora @ 2006-12-19 11:05 ` Amit K. Arora [not found] ` <20061219211206.GO5937@schatzie.adilger.int> 2006-12-27 23:30 ` Mingming Cao 1 sibling, 1 reply; 26+ messages in thread From: Amit K. Arora @ 2006-12-19 11:05 UTC (permalink / raw) To: linux-ext4; +Cc: suparna, cmm, suzuki, alex On Fri, Dec 15, 2006 at 06:05:28PM +0530, Amit K. Arora wrote: > --- linux-2.6.19.prealloc.orig/fs/ext4/ioctl.c 2006-12-15 16:44:35.000000000 +0530 > +++ linux-2.6.19.prealloc/fs/ext4/ioctl.c 2006-12-15 17:47:00.000000000 +0530 : : > + handle=ext4_journal_start(inode, > + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks); The current way how buffer credits are passed to ext4_journal_start() above, is not correct. The max. number of blocks that we might modify here should be calculated using ext4_ext_calc_credits_for_insert(). Thus the above line should be replaced with : mutex_lock(&EXT4_I(inode)->truncate_mutex); credits = ext4_ext_calc_credits_for_insert(inode, NULL); mutex_unlock(&EXT4_I(inode)->truncate_mutex); handle=ext4_journal_start(inode, credits + EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 1); Following is the revised patch with the above change. Signed-off-by: Amit Arora (aarora@in.ibm.com) --- fs/ext4/extents.c | 116 ++++++++++++++++++++++++++-------------- fs/ext4/ioctl.c | 63 +++++++++++++++++++++ include/linux/ext4_fs.h | 13 ++++ include/linux/ext4_fs_extents.h | 13 ++++ 4 files changed, 167 insertions(+), 38 deletions(-) Index: linux-2.6.19.prealloc/fs/ext4/extents.c =================================================================== --- linux-2.6.19.prealloc.orig/fs/ext4/extents.c 2006-12-19 16:09:00.000000000 +0530 +++ linux-2.6.19.prealloc/fs/ext4/extents.c 2006-12-19 16:23:37.000000000 +0530 @@ -282,7 +282,7 @@ } else if (path->p_ext) { ext_debug(" %d:%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - le16_to_cpu(path->p_ext->ee_len), + ext4_ext_get_actual_len(path->p_ext), ext_pblock(path->p_ext)); } else ext_debug(" []"); @@ -305,7 +305,7 @@ for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), - le16_to_cpu(ex->ee_len), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext_pblock(ex)); } ext_debug("\n"); } @@ -425,7 +425,7 @@ ext_debug(" -> %d:%llu:%d ", le32_to_cpu(path->p_ext->ee_block), ext_pblock(path->p_ext), - le16_to_cpu(path->p_ext->ee_len)); + ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { @@ -686,7 +686,7 @@ ext_debug("move %d:%llu:%d in new leaf %llu\n", le32_to_cpu(path[depth].p_ext->ee_block), ext_pblock(path[depth].p_ext), - le16_to_cpu(path[depth].p_ext->ee_len), + ext4_ext_get_actual_len(path[depth].p_ext), newblock); /*memmove(ex++, path[depth].p_ext++, sizeof(struct ext4_extent)); @@ -1097,7 +1097,19 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { - if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) != + unsigned short ext1_ee_len, ext2_ee_len; + + /* + * Make sure that either both extents are uninitialized, or + * both are _not_. + */ + if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + return 0; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; @@ -1106,14 +1118,14 @@ * as an RO_COMPAT feature, refuse to merge to extents if * this can result in the top bit of ee_len being set. */ - if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN) + if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN) return 0; #ifdef AGRESSIVE_TEST if (le16_to_cpu(ex1->ee_len) >= 4) return 0; #endif - if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2)) + if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) return 1; return 0; } @@ -1132,9 +1144,9 @@ struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; - int depth, len, err, next; + int depth, len, err, next, uninitialized = 0; - BUG_ON(newext->ee_len == 0); + BUG_ON(ext4_ext_get_actual_len(newext) == 0); depth = ext_depth(inode); ex = path[depth].p_ext; BUG_ON(path[depth].p_hdr == NULL); @@ -1142,13 +1154,22 @@ /* try to insert block into found extent and return */ if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append %d block to %d:%d (from %llu)\n", - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - le16_to_cpu(ex->ee_len), ext_pblock(ex)); + ext4_ext_get_actual_len(ex), ext_pblock(ex)); if ((err = ext4_ext_get_access(handle, inode, path + depth))) return err; - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len) - + le16_to_cpu(newext->ee_len)); + + /* ext4_can_extents_be_merged should have checked that either + * both extents are uninitialized, or both aren't. Thus we + * need to check any of them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if(uninitialized) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1203,7 +1224,7 @@ ext_debug("first extent in the leaf: %d:%llu:%d\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len)); + ext4_ext_get_actual_len(newext)); path[depth].p_ext = EXT_FIRST_EXTENT(eh); } else if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { @@ -1216,7 +1237,7 @@ "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 2, nearex + 1, len); } @@ -1229,7 +1250,7 @@ "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), - le16_to_cpu(newext->ee_len), + ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 1, nearex, len); path[depth].p_ext = nearex; @@ -1248,8 +1269,13 @@ if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1)) break; /* merge with next extent! */ - nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len) - + le16_to_cpu(nearex[1].ee_len)); + if (ext4_ext_is_uninitialized(nearex)) + uninitialized = 1; + nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex) + + ext4_ext_get_actual_len(nearex + 1)); + if(uninitialized) + ext4_ext_mark_uninitialized(nearex); + if (nearex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - nearex - 1) * sizeof(struct ext4_extent); @@ -1319,8 +1345,8 @@ end = le32_to_cpu(ex->ee_block); if (block + num < end) end = block + num; - } else if (block >= - le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) { + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { /* need to allocate space after found extent */ start = block; end = block + num; @@ -1332,7 +1358,8 @@ * by found extent */ start = block; - end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len); + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); if (block + num < end) end = block + num; exists = 1; @@ -1348,7 +1375,7 @@ cbex.ec_type = EXT4_EXT_CACHE_GAP; } else { cbex.ec_block = le32_to_cpu(ex->ee_block); - cbex.ec_len = le16_to_cpu(ex->ee_len); + cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_start = ext_pblock(ex); cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } @@ -1556,12 +1583,12 @@ unsigned long from, unsigned long to) { struct buffer_head *bh; + unsigned short ee_len = ext4_ext_get_actual_len(ex); int i; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned short ee_len = le16_to_cpu(ex->ee_len); spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; @@ -1575,12 +1602,12 @@ } #endif if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ unsigned long num; ext4_fsblk_t start; - num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from; - start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num; + num = le32_to_cpu(ex->ee_block) + ee_len - from; + start = ext_pblock(ex) + ee_len - num; ext_debug("free last %lu blocks starting %llu\n", num, start); for (i = 0; i < num; i++) { bh = sb_find_get_block(inode->i_sb, start + i); @@ -1588,12 +1615,12 @@ } ext4_free_blocks(handle, inode, start, num); } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + from, to, le32_to_cpu(ex->ee_block), ee_len); } else { printk("strange request: removal(2) %lu-%lu from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + from, to, le32_to_cpu(ex->ee_block), ee_len); } return 0; } @@ -1607,7 +1634,7 @@ struct ext4_extent_header *eh; unsigned a, b, block, num; unsigned long ex_ee_block; - unsigned short ex_ee_len; + unsigned short ex_ee_len, uninitialized = 0; struct ext4_extent *ex; ext_debug("truncate since %lu in leaf\n", start); @@ -1622,7 +1649,9 @@ ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = le16_to_cpu(ex->ee_len); + if(ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex_ee_len = ext4_ext_get_actual_len(ex); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -1690,6 +1719,8 @@ ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); + if(uninitialized) + ext4_ext_mark_uninitialized(ex); err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -1699,7 +1730,7 @@ ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = le16_to_cpu(ex->ee_len); + ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) @@ -1974,7 +2005,7 @@ if ((ex = path[depth].p_ext)) { unsigned long ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext_pblock(ex); - unsigned short ee_len = le16_to_cpu(ex->ee_len); + unsigned short ee_len; /* * Allow future support for preallocated extents to be added @@ -1982,8 +2013,11 @@ * Uninitialized extents are treated as holes, except that * we avoid (fail) allocating new blocks during a write. */ - if (ee_len > EXT_MAX_LEN) + if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN + && create != EXT4_CREATE_UNINITIALIZED_EXT) goto out2; + + ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ if (iblock >= ee_block && iblock < ee_block + ee_len) { newblock = iblock - ee_block + ee_start; @@ -1991,8 +2025,10 @@ allocated = ee_len - (iblock - ee_block); ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, ee_block, ee_len, newblock); - ext4_ext_put_in_cache(inode, ee_block, ee_len, - ee_start, EXT4_EXT_CACHE_EXTENT); + /* Do not put uninitialized extent in the cache */ + if(!ext4_ext_is_uninitialized(ex)) + ext4_ext_put_in_cache(inode, ee_block, ee_len, + ee_start, EXT4_EXT_CACHE_EXTENT); goto out; } } @@ -2027,6 +2063,8 @@ newex.ee_block = cpu_to_le32(iblock); ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(allocated); + if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + ext4_ext_mark_uninitialized(&newex); err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) goto out2; @@ -2038,8 +2076,10 @@ newblock = ext_pblock(&newex); __set_bit(BH_New, &bh_result->b_state); - ext4_ext_put_in_cache(inode, iblock, allocated, newblock, - EXT4_EXT_CACHE_EXTENT); + /* Cache only when it is _not_ an uninitialized extent */ + if(create!=EXT4_CREATE_UNINITIALIZED_EXT) + ext4_ext_put_in_cache(inode, iblock, allocated, newblock, + EXT4_EXT_CACHE_EXTENT); out: if (allocated > max_blocks) allocated = max_blocks; Index: linux-2.6.19.prealloc/fs/ext4/ioctl.c =================================================================== --- linux-2.6.19.prealloc.orig/fs/ext4/ioctl.c 2006-12-19 16:09:00.000000000 +0530 +++ linux-2.6.19.prealloc/fs/ext4/ioctl.c 2006-12-19 16:23:08.000000000 +0530 @@ -11,6 +11,7 @@ #include <linux/jbd2.h> #include <linux/capability.h> #include <linux/ext4_fs.h> +#include <linux/ext4_fs_extents.h> #include <linux/ext4_jbd2.h> #include <linux/time.h> #include <linux/compat.h> @@ -248,6 +249,68 @@ return err; } + case EXT4_IOC_PREALLOCATE: { + struct ext4_falloc_input input; + handle_t *handle; + ext4_fsblk_t block, max_blocks; + int ret, ret2, nblocks = 0, retries = 0; + struct buffer_head map_bh; + unsigned int credits, blkbits = inode->i_blkbits; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (copy_from_user(&input, + (struct ext4_falloc_input __user *) arg, sizeof(input))) + return -EFAULT; + + if (input.len == 0) + return -EINVAL; + + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return -ENOTTY; + + block = input.offset >> blkbits; + max_blocks = (EXT4_BLOCK_ALIGN(input.len + input.offset, + blkbits) >> blkbits) - block; + mutex_lock(&EXT4_I(inode)->truncate_mutex); + credits = ext4_ext_calc_credits_for_insert(inode, NULL); + mutex_unlock(&EXT4_I(inode)->truncate_mutex); + handle=ext4_journal_start(inode, credits + + EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); +retry: + ret = 0; + while(ret>=0 && ret<max_blocks) + { + block = block + ret; + max_blocks = max_blocks - ret; + ret = ext4_ext_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_CREATE_UNINITIALIZED_EXT, 0); + if(ret > 0 && test_bit(BH_New, &map_bh.b_state)) + nblocks = nblocks + ret; + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, + &retries)) + goto retry; + + if(nblocks) { + mutex_lock(&inode->i_mutex); + inode->i_size = inode->i_size + (nblocks >> blkbits); + EXT4_I(inode)->i_disksize = inode->i_size; + mutex_unlock(&inode->i_mutex); + } + + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if(ret > 0) + ret = ret2; + + return ret > 0 ? nblocks : ret; + } + default: return -ENOTTY; } Index: linux-2.6.19.prealloc/include/linux/ext4_fs.h =================================================================== --- linux-2.6.19.prealloc.orig/include/linux/ext4_fs.h 2006-12-19 16:09:00.000000000 +0530 +++ linux-2.6.19.prealloc/include/linux/ext4_fs.h 2006-12-19 16:09:12.000000000 +0530 @@ -102,6 +102,8 @@ EXT4_GOOD_OLD_FIRST_INO : \ (s)->s_first_ino) #endif +#define EXT4_BLOCK_ALIGN(size, blkbits) (((size)+(1 << blkbits)-1) & \ + (~((1 << blkbits)-1))) /* * Macro-instructions used to manage fragments @@ -225,6 +227,16 @@ __u32 free_blocks_count; }; +/* The struct ext4_falloc_input in kernel, for EXT4_IOC_PREALLOCATE */ +struct ext4_falloc_input { + __u64 offset; + __u64 len; +}; + +/* Following is used by preallocation logic to tell get_blocks() that we + * want uninitialzed extents. + */ +#define EXT4_CREATE_UNINITIALIZED_EXT 2 /* * ioctl commands @@ -242,6 +254,7 @@ #endif #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_PREALLOCATE _IOW('f', 9, struct ext4_falloc_input) /* * ioctl commands in 32 bit emulation Index: linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h =================================================================== --- linux-2.6.19.prealloc.orig/include/linux/ext4_fs_extents.h 2006-12-19 16:09:00.000000000 +0530 +++ linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h 2006-12-19 16:23:37.000000000 +0530 @@ -125,6 +125,19 @@ #define EXT4_EXT_CACHE_EXTENT 2 /* + * Macro-instructions to handle (mark/unmark/check/create) unitialized + * extents. Applications can issue an IOCTL for preallocation, which results + * in assigning unitialized extents to the file. + */ +#define ext4_ext_mark_uninitialized(ext) ((ext)->ee_len |= \ + cpu_to_le16(0x8000)) +#define ext4_ext_is_uninitialized(ext) ((le16_to_cpu((ext)->ee_len))& \ + 0x8000) +#define ext4_ext_get_actual_len(ext) ((le16_to_cpu((ext)->ee_len))& \ + 0x7FFF) + + +/* * to be called by ext4_ext_walk_space() * negative retcode - error * positive retcode - signal for ext4_ext_walk_space(), see below ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20061219211206.GO5937@schatzie.adilger.int>]
* Re: [RFC][Patch 1/2] Persistent preallocation in ext4 [not found] ` <20061219211206.GO5937@schatzie.adilger.int> @ 2006-12-20 6:28 ` Amit K. Arora 0 siblings, 0 replies; 26+ messages in thread From: Amit K. Arora @ 2006-12-20 6:28 UTC (permalink / raw) To: Andreas Dilger; +Cc: linux-ext4 On Tue, Dec 19, 2006 at 02:12:06PM -0700, Andreas Dilger wrote: > Minor edits (not worth a resubmit by itself): Thanks, Andreas ! I will take care of these comments in the next submission. Regards, Amit Arora > > On Dec 19, 2006 16:35 +0530, Amit K. Arora wrote: > > + /* ext4_can_extents_be_merged should have checked that either > > + * both extents are uninitialized, or both aren't. Thus we > > + * need to check any of them here. > > s/any/only one/ > > > > > + case EXT4_IOC_PREALLOCATE: { > > + if (IS_RDONLY(inode)) > > + return -EROFS; > > + > > + if (copy_from_user(&input, > > + (struct ext4_falloc_input __user *) arg, sizeof(input))) > > + return -EFAULT; > > + > > + if (input.len == 0) > > + return -EINVAL; > > + > > + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) > > + return -ENOTTY; > > May as well put this check before copy_from_user(), since it doesn't need > the user data and is much faster to check first. > > > +retry: > > + ret = 0; > > + while(ret>=0 && ret<max_blocks) > > + { > > Opening brace always on same line, like "while() {" > > > + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, > > + &retries)) > > &retries should be aligned with start of (inode->i_sb, on previous line. > > > + if(nblocks) { > > Space between "if (" everywhere. > > Cheers, Andreas > -- > Andreas Dilger > Principal Software Engineer > Cluster File Systems, Inc. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/2] Persistent preallocation in ext4 2006-12-15 12:35 ` [RFC][Patch 1/2] " Amit K. Arora 2006-12-19 11:05 ` Amit K. Arora @ 2006-12-27 23:30 ` Mingming Cao 2007-01-02 11:04 ` Amit K. Arora 1 sibling, 1 reply; 26+ messages in thread From: Mingming Cao @ 2006-12-27 23:30 UTC (permalink / raw) To: Amit K. Arora; +Cc: linux-ext4, suparna, suzuki, alex On Fri, 2006-12-15 at 18:05 +0530, Amit K. Arora wrote: > This is the first patch in the set of two. > > It implements the ioctl which will be used for persistent preallocation. It is a respun of the previous patch which was posted earlier, and includes following changes: > * Takes care of review comments by Mingming > * The declaration of extent related macros are now moved to ext4_fs_extent.h (from ext4_fs.h) > * Updated the logic to calculate block and max_blocks in ext4/ioctl.c, which is used to call get_blocks. > > It does _not_ take care of implementing persistent preallocation for non-extent based files. It is because of the following reasons: > * It is being considered as a rare case > * Users can/should convert their file(s) to extent format to use this feature > * Moreover, posix_fallocate() can be used for this purpose, if the user does not want to convert the file(s) to the extent based format. > > > Signed-off-by: Amit Arora (aarora@in.ibm.com) > Hi Amit, looks good to me, a few comments :) ..... > Index: linux-2.6.19.prealloc/fs/ext4/ioctl.c > =================================================================== > --- linux-2.6.19.prealloc.orig/fs/ext4/ioctl.c 2006-12-15 16:44:35.000000000 +0530 > +++ linux-2.6.19.prealloc/fs/ext4/ioctl.c 2006-12-15 17:47:00.000000000 +0530 > @@ -248,6 +248,65 @@ > return err; > } > > + case EXT4_IOC_PREALLOCATE: { > + struct ext4_falloc_input input; > + handle_t *handle; > + ext4_fsblk_t block, max_blocks; > + int ret, ret2, nblocks = 0, retries = 0; > + struct buffer_head map_bh; > + unsigned int blkbits = inode->i_blkbits; > + > + if (IS_RDONLY(inode)) > + return -EROFS; > + > + if (copy_from_user(&input, > + (struct ext4_falloc_input __user *) arg, sizeof(input))) > + return -EFAULT; > + > + if (input.len == 0) > + return -EINVAL; > + > + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) > + return -ENOTTY; > + > + block = input.offset >> blkbits; > + max_blocks = (EXT4_BLOCK_ALIGN(input.len + input.offset, > + blkbits) >> blkbits) - block; > + handle=ext4_journal_start(inode, > + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks); > + if (IS_ERR(handle)) > + return PTR_ERR(handle); > +retry: > + ret = 0; > + while(ret>=0 && ret<max_blocks) > + { > + block = block + ret; > + max_blocks = max_blocks - ret; > + ret = ext4_ext_get_blocks(handle, inode, block, > + max_blocks, &map_bh, > + EXT4_CREATE_UNINITIALIZED_EXT, 0); > + if(ret > 0 && test_bit(BH_New, &map_bh.b_state)) > + nblocks = nblocks + ret; > + } ext4_ext_get_blocks() returns 0 when it is mapping (non allocating) a hole. In our case, we are doing allocating, so here it is not possible to returns a 0 from ext4_ext_get_blocks(). I think we should quit the loop and BUGON if ret == 0 here. > + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, > + &retries)) > + goto retry; > + > + if(nblocks) { > + mutex_lock(&inode->i_mutex); > + inode->i_size = inode->i_size + (nblocks >> blkbits); > + EXT4_I(inode)->i_disksize = inode->i_size; > + mutex_unlock(&inode->i_mutex); > + } Hmm... We should not need to worry about the inode->i_size if we are preallocating blocks for holes. And, Looking at other places calling ext4_*_get_blocks() in the kernel, it seems not all of them protected by i_mutex lock. I think it probably okay to not holding i_mutex during calling ext4_ext4_get_blocks(). > + > + ext4_mark_inode_dirty(handle, inode); > + ret2 = ext4_journal_stop(handle); > + if(ret > 0) > + ret = ret2; > + > + return ret > 0 ? nblocks : ret; > + } > + Since the API takes the number of bytes to preallocate, at return time, shall we convert the blocks to bytes to the user? Here it returns the number of allocated blocks to the user. Do we need to worry about the case when dealing with a range with partial hole and partial blocks already allocated? In that case nblocks(the new preallocated blocks) will less than the maxblocks (the number of blocks asked by application). I am wondering what does other filesystem like xfs do? Maybe we should do the same thing. Thanks, Mingming ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/2] Persistent preallocation in ext4 2006-12-27 23:30 ` Mingming Cao @ 2007-01-02 11:04 ` Amit K. Arora 2007-01-02 22:47 ` Mingming Cao 2007-01-09 9:05 ` Amit K. Arora 0 siblings, 2 replies; 26+ messages in thread From: Amit K. Arora @ 2007-01-02 11:04 UTC (permalink / raw) To: Mingming Cao; +Cc: linux-ext4, suparna, suzuki, alex Hi Mingming, On Wed, Dec 27, 2006 at 03:30:44PM -0800, Mingming Cao wrote: > looks good to me, a few comments :) Thanks for your comments! > ..... > > + ret = ext4_ext_get_blocks(handle, inode, block, > > + max_blocks, &map_bh, > > + EXT4_CREATE_UNINITIALIZED_EXT, 0); > > + if(ret > 0 && test_bit(BH_New, &map_bh.b_state)) > > + nblocks = nblocks + ret; > > + } > > > ext4_ext_get_blocks() returns 0 when it is mapping (non allocating) a > hole. In our case, we are doing allocating, so here it is not possible > to returns a 0 from ext4_ext_get_blocks(). I think we should quit the > loop and BUGON if ret == 0 here. Okay. I will add "BUG_ON(!ret);" just after get_blocks, above. > > > + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, > > + &retries)) > > + goto retry; > > + > > + if(nblocks) { > > + mutex_lock(&inode->i_mutex); > > + inode->i_size = inode->i_size + (nblocks >> blkbits); > > + EXT4_I(inode)->i_disksize = inode->i_size; > > + mutex_unlock(&inode->i_mutex); > > + } > > Hmm... We should not need to worry about the inode->i_size if we are > preallocating blocks for holes. You are right. Will take care of this. > And, Looking at other places calling ext4_*_get_blocks() in the kernel, > it seems not all of them protected by i_mutex lock. I think it probably > okay to not holding i_mutex during calling ext4_ext4_get_blocks(). We are not holding i_mutex lock during ext4_ext_get_blocks() call. Above, this lock is being held inorder to avoid race while updating the filesize in inode (reference your comment in a previous mail "I think we should update i_size and i_disksize after preallocation. Oh, to protect parallel updating i_size, we have to take i_mutex down."). Perhaps, truncate_mutex lock should be used here, and not i_mutex. > > > + > > + ext4_mark_inode_dirty(handle, inode); > > + ret2 = ext4_journal_stop(handle); > > + if(ret > 0) > > + ret = ret2; > > + > > + return ret > 0 ? nblocks : ret; > > + } > > + > > Since the API takes the number of bytes to preallocate, at return time, > shall we convert the blocks to bytes to the user? > > Here it returns the number of allocated blocks to the user. Do we need > to worry about the case when dealing with a range with partial hole and > partial blocks already allocated? In that case nblocks(the new > preallocated blocks) will less than the maxblocks (the number of blocks > asked by application). I am wondering what does other filesystem like > xfs do? Maybe we should do the same thing. I think xfs just returns 0 on success, and errno on an error. Do we want to keep the same behavior here ? Or, should we return the number of bytes preallocated ? Thanks! -- Regards, Amit Arora ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/2] Persistent preallocation in ext4 2007-01-02 11:04 ` Amit K. Arora @ 2007-01-02 22:47 ` Mingming Cao 2007-01-09 9:05 ` Amit K. Arora 1 sibling, 0 replies; 26+ messages in thread From: Mingming Cao @ 2007-01-02 22:47 UTC (permalink / raw) To: Amit K. Arora; +Cc: linux-ext4, suparna, suzuki, alex On Tue, 2007-01-02 at 16:34 +0530, Amit K. Arora wrote: > Hi Mingming, > > And, Looking at other places calling ext4_*_get_blocks() in the kernel, > > it seems not all of them protected by i_mutex lock. I think it probably > > okay to not holding i_mutex during calling ext4_ext4_get_blocks(). > > We are not holding i_mutex lock during ext4_ext_get_blocks() call. > Above, this lock is being held inorder to avoid race while updating the > filesize in inode (reference your comment in a previous mail "I think we > should update i_size and i_disksize after preallocation. Oh, > to protect parallel updating i_size, we have to take i_mutex down."). > Perhaps, truncate_mutex lock should be used here, and not i_mutex. > truncate_mutex is hold at the entry of ext4_**_get_blocks() to protect parallel block allocation. Here I was worried about concurrent modifying i_size and i_disksize which were protected by the i_mutex lock. Sorry for any confusion. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 1/2] Persistent preallocation in ext4 2007-01-02 11:04 ` Amit K. Arora 2007-01-02 22:47 ` Mingming Cao @ 2007-01-09 9:05 ` Amit K. Arora 1 sibling, 0 replies; 26+ messages in thread From: Amit K. Arora @ 2007-01-09 9:05 UTC (permalink / raw) To: Mingming Cao; +Cc: linux-ext4, suparna, suzuki, alex On Tue, Jan 02, 2007 at 04:34:09PM +0530, Amit K. Arora wrote: > On Wed, Dec 27, 2006 at 03:30:44PM -0800, Mingming Cao wrote: > > Since the API takes the number of bytes to preallocate, at return time, > > shall we convert the blocks to bytes to the user? > > > > Here it returns the number of allocated blocks to the user. Do we need > > to worry about the case when dealing with a range with partial hole and > > partial blocks already allocated? In that case nblocks(the new > > preallocated blocks) will less than the maxblocks (the number of blocks > > asked by application). I am wondering what does other filesystem like > > xfs do? Maybe we should do the same thing. > > I think xfs just returns 0 on success, and errno on an error. Do we > want to keep the same behavior here ? Or, should we return the number of > bytes preallocated ? We still need to decide on what the ioctl should return. Should it return zero on success and errno on error, like how posix_fallocate and xfs behave ? If yes, then should we undo partial preallocation (if any) in case of an error (say ENOSPC) ? If no, then should we return the number of bytes preallocated ? In this case we have to think about the situation Mingming mentioned above (i.e. when the preallocation request partially spans through a hole and partially through few already allocated blocks). -- Regards, Amit Arora ^ permalink raw reply [flat|nested] 26+ messages in thread
* [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-05 13:43 [RFC][Patch 1/1] Persistent preallocation in ext4 Amit K. Arora 2006-12-06 5:58 ` Amit K. Arora @ 2006-12-15 12:39 ` Amit K. Arora 2006-12-15 23:02 ` Andreas Dilger 2006-12-19 11:42 ` Amit K. Arora 1 sibling, 2 replies; 26+ messages in thread From: Amit K. Arora @ 2006-12-15 12:39 UTC (permalink / raw) To: linux-ext4; +Cc: suparna, cmm, suzuki, alex This is the second patch in the set. This patch makes writing to the unitialized extent possible. A write operation on an unitialized extent *may* (depending on the relative block location in the extent and number of blocks being written) result in spliting the extent. There are three possibilities: 1. The extent does not split : This will happen when the entire extent is being written to. In this case the extent will be marked "initialized" and merged (if possible) with the neighbouring extents in the tree. 2. The extent splits in two portions : This will happen when someone is writing to any one end of the extent (i.e. not in the middle, and not to the entire extent). This will result in breaking the extent in two portions, an initialized extent (the set of blocks being written to) and an uninitialized extent (rest of the blocks in the parent extent). 3. The extent is split in three parts: This occurs when someone writes in the middle of the extent. It will result into three extents, two uninitialized (at the both ends) and one initialized (in middle). Since the extent merge logic was getting redundant, it has been put into a new function ext4_ext_try_to_merge(). This gets called from ext4_ext_insert_extent() and ext4_ext_get_blocks(), when required. Signed-off-by: Amit Arora (aarora@in.ibm.com) --- fs/ext4/extents.c | 200 ++++++++++++++++++++++++++++++++++------ include/linux/ext4_fs_extents.h | 1 2 files changed, 171 insertions(+), 30 deletions(-) Index: linux-2.6.19.prealloc/fs/ext4/extents.c =================================================================== --- linux-2.6.19.prealloc.orig/fs/ext4/extents.c 2006-12-15 17:53:48.000000000 +0530 +++ linux-2.6.19.prealloc/fs/ext4/extents.c 2006-12-15 17:54:32.000000000 +0530 @@ -1131,6 +1131,50 @@ } /* + * ext4_ext_try_to_merge: + * tries to merge the "ex" extent to the next extent in the tree. + * It always tries to merge towards right. If you want to merge towards + * left, pass "ex - 1" as argument instead of "ex". + * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns + * 1 if they got merged. + */ +int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) +{ + struct ext4_extent_header *eh; + unsigned int depth, len; + int merge_done=0, uninitialized = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + while (ex < EXT_LAST_EXTENT(eh)) { + if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) + break; + /* merge with next extent! */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(ex + 1)); + if(uninitialized) + ext4_ext_mark_uninitialized(ex); + + if (ex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - ex - 1) + * sizeof(struct ext4_extent); + memmove(ex + 1, ex + 2, len); + } + eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1); + merge_done = 1; + BUG_ON(eh->eh_entries == 0); + } + + return merge_done; +} + +/* * ext4_ext_insert_extent: * tries to merge requsted extent into the existing extent or * inserts requested extent as new one into the tree, @@ -1265,25 +1309,7 @@ merge: /* try to merge extents to the right */ - while (nearex < EXT_LAST_EXTENT(eh)) { - if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1)) - break; - /* merge with next extent! */ - if (ext4_ext_is_uninitialized(nearex)) - uninitialized = 1; - nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex) - + ext4_ext_get_actual_len(nearex + 1)); - if(uninitialized) - ext4_ext_mark_uninitialized(nearex); - - if (nearex + 1 < EXT_LAST_EXTENT(eh)) { - len = (EXT_LAST_EXTENT(eh) - nearex - 1) - * sizeof(struct ext4_extent); - memmove(nearex + 1, nearex + 2, len); - } - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1); - BUG_ON(eh->eh_entries == 0); - } + ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -1952,9 +1978,10 @@ int create, int extend_disksize) { struct ext4_ext_path *path = NULL; - struct ext4_extent newex, *ex; + struct ext4_extent_header *eh; + struct ext4_extent newex, *ex, *ex1 = NULL, *ex2 = NULL, *ex3 = NULL; ext4_fsblk_t goal, newblock; - int err = 0, depth; + int err = 0, depth, ret; unsigned long allocated = 0; __clear_bit(BH_New, &bh_result->b_state); @@ -2001,6 +2028,7 @@ * this is why assert can't be put in ext4_ext_find_extent() */ BUG_ON(path[depth].p_ext == NULL && depth != 0); + eh = path[depth].p_hdr; if ((ex = path[depth].p_ext)) { unsigned long ee_block = le32_to_cpu(ex->ee_block); @@ -2008,15 +2036,9 @@ unsigned short ee_len; /* - * Allow future support for preallocated extents to be added - * as an RO_COMPAT feature: * Uninitialized extents are treated as holes, except that - * we avoid (fail) allocating new blocks during a write. + * we split out initialized portions during a write. */ - if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN - && create != EXT4_CREATE_UNINITIALIZED_EXT) - goto out2; - ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ if (iblock >= ee_block && iblock < ee_block + ee_len) { @@ -2026,10 +2048,126 @@ ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, ee_block, ee_len, newblock); /* Do not put uninitialized extent in the cache */ - if(!ext4_ext_is_uninitialized(ex)) + if(!ext4_ext_is_uninitialized(ex)) { ext4_ext_put_in_cache(inode, ee_block, ee_len, ee_start, EXT4_EXT_CACHE_EXTENT); - goto out; + goto out; + } + if(create == EXT4_CREATE_UNINITIALIZED_EXT) + goto out; + if (!create) + goto out2; + + /* Initializing write within an uninitialized extent */ + /* Split in upto 3 extents . + * There are three possibilities: + * a> No split required: Entire extent should be + * initialized. + * b> Split into two extents: Only one end of the + * extent is being written to. + * c> Split into three extents: Somone is writing + * in middle of the extent. + */ + ex2 = ex; + + /* ex3: to ee_block + ee_len : uninitialised */ + if (allocated > max_blocks) { + unsigned int newdepth; + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock + + max_blocks); + ext4_ext_store_pblock(ex3, newblock + + max_blocks); + ex3->ee_len = cpu_to_le16(allocated + - max_blocks); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, + path, ex3); + if (err) + goto out2; + + /* The depth, and hence eh & ex might change + * as part of the insert above. + */ + newdepth = ext_depth(inode); + if(newdepth != depth) + { + depth=newdepth; + path = ext4_ext_find_extent(inode, + iblock, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ex2 = ex; + } + allocated = max_blocks; + } + + /* ex1: ee_block to iblock - 1 : uninitialized */ + if (iblock > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + + /* ex2: iblock to iblock + maxblocks-1 : initialised */ + ex2->ee_block = cpu_to_le32(iblock); + ex2->ee_start = newblock; + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + if (ex2 != ex) + goto insert; + + if ((err = ext4_ext_get_access(handle, inode, + path + depth))) + goto out2; + + /* New (initialized) extent starts from the first block + * in the current extent. i.e., ex2 == ex + * We have to see if it can be merged with the extent + * on the left. + */ + if(ex2 > EXT_FIRST_EXTENT(eh)) { + /* To merge left, pass "ex2 - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, + path, ex2 - 1); + if(ret) { + err = ext4_ext_correct_indexes(handle, + inode, path); + if (err) + goto out2; + depth = ext_depth(inode); + ex2--; + } + } + + /* Try to Merge towards right. This might be required + * only when the whole extent is being written to. + * i.e. ex2==ex and ex3==NULL. + */ + if(!ex3) { + ret = ext4_ext_try_to_merge(inode, path, ex2); + if(ret) { + err = ext4_ext_correct_indexes(handle, + inode, path); + if (err) + goto out2; + } + } + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, + path + depth); + if (err) + goto out2; + goto outnew; } } @@ -2065,6 +2203,7 @@ newex.ee_len = cpu_to_le16(allocated); if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ ext4_ext_mark_uninitialized(&newex); +insert: err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) goto out2; @@ -2074,6 +2213,7 @@ /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); +outnew: __set_bit(BH_New, &bh_result->b_state); /* Cache only when it is _not_ an uninitialized extent */ Index: linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h =================================================================== --- linux-2.6.19.prealloc.orig/include/linux/ext4_fs_extents.h 2006-12-15 17:50:08.000000000 +0530 +++ linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h 2006-12-15 17:54:17.000000000 +0530 @@ -203,6 +203,7 @@ extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); +extern int ext4_ext_try_to_merge(struct inode *, struct ext4_ext_path *, struct ext4_extent *); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *); extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *); ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-15 12:39 ` [RFC][Patch 2/2] " Amit K. Arora @ 2006-12-15 23:02 ` Andreas Dilger 2006-12-16 4:30 ` Amit K. Arora 2006-12-19 11:42 ` Amit K. Arora 1 sibling, 1 reply; 26+ messages in thread From: Andreas Dilger @ 2006-12-15 23:02 UTC (permalink / raw) To: Amit K. Arora; +Cc: linux-ext4, suparna, cmm, suzuki, alex On Dec 15, 2006 18:09 +0530, Amit K. Arora wrote: > This patch makes writing to the unitialized extent possible. A write operation on an unitialized extent *may* (depending on the relative block location in the extent and number of blocks being written) result in spliting the extent. There are three possibilities: > 1. The extent does not split : This will happen when the entire extent is being written to. In this case the extent will be marked "initialized" and merged (if possible) with the neighbouring extents in the tree. This should also be true if the write is at the beginning or the end of the uninitialized extent and the disk allocation matches the previous or next extent. The newly-written part is merged with the adjacent extent, and the uninitialized extent is shrunk appropriately. Doing this as a special case of #2 may result in extra tree rebalancing as the extra extent is added and removed repeatedly (consider the case of a large hole being overwritten in smaller chunks that is just at the limit of the number of extents in the parent block). > 2. The extent splits in two portions : This will happen when someone is writing to any one end of the extent (i.e. not in the middle, and not to the entire extent). This will result in breaking the extent in two portions, an initialized extent (the set of blocks being written to) and an uninitialized extent (rest of the blocks in the parent extent). > 3. The extent is split in three parts: This occurs when someone writes in the middle of the extent. It will result into three extents, two uninitialized (at the both ends) and one initialized (in middle). > > Since the extent merge logic was getting redundant, it has been put into a new function ext4_ext_try_to_merge(). This gets called from ext4_ext_insert_extent() and ext4_ext_get_blocks(), when required. Cheers, Andreas -- Andreas Dilger Principal Software Engineer Cluster File Systems, Inc. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-15 23:02 ` Andreas Dilger @ 2006-12-16 4:30 ` Amit K. Arora 0 siblings, 0 replies; 26+ messages in thread From: Amit K. Arora @ 2006-12-16 4:30 UTC (permalink / raw) To: Andreas Dilger; +Cc: linux-ext4, suparna, cmm, suzuki, alex On Fri, Dec 15, 2006 at 04:02:25PM -0700, Andreas Dilger wrote: > On Dec 15, 2006 18:09 +0530, Amit K. Arora wrote: > > This patch makes writing to the unitialized extent possible. A write operation on an unitialized extent *may* (depending on the relative block location in the extent and number of blocks being written) result in spliting the extent. There are three possibilities: > > 1. The extent does not split : This will happen when the entire extent is being written to. In this case the extent will be marked "initialized" and merged (if possible) with the neighbouring extents in the tree. > > This should also be true if the write is at the beginning or the end of the > uninitialized extent and the disk allocation matches the previous or next > extent. The newly-written part is merged with the adjacent extent, and the > uninitialized extent is shrunk appropriately. You are right. And the current patch takes care of that. If the write is at the begining of the uninitialized extent, the first extent (from the split) will be initialized ("ex2" in this case), and we do call try_to_merge() to merge this with the previous extent, if possible. This scenario can be seen as ex1 == NULL && ex2 == ex && ex3 != NULL (Please note that "ex" is the uninitialized extent, and "ex2" is _always_ the "initialized" extent being created, whether it is on left, right or middle of the "parent" uninitialized extent) If the initialized extent is the second one in the split (i.e. write is happening on the later part of the uninitialized extent), it will result in shirinking the existing uninitialized extent and "inserting" the new initialized extent. insert_extent() will be called in this case, which also tries to merge the extent with the neighbouring extents (both, towards left and right side). The following condition will hold true in this case: ex1 != NULL && ex2 != ex && ex3 == NULL > > Doing this as a special case of #2 may result in extra tree rebalancing as > the extra extent is added and removed repeatedly (consider the case of a > large hole being overwritten in smaller chunks that is just at the limit > of the number of extents in the parent block). Yes, as I mentioned, the case #2 already handles this. I guess, I should have been explicit about it in the description... > > > 2. The extent splits in two portions : This will happen when someone is writing to any one end of the extent (i.e. not in the middle, and not to the entire extent). This will result in breaking the extent in two portions, an initialized extent (the set of blocks being written to) and an uninitialized extent (rest of the blocks in the parent extent). > > 3. The extent is split in three parts: This occurs when someone writes in the middle of the extent. It will result into three extents, two uninitialized (at the both ends) and one initialized (in middle). > > > > Since the extent merge logic was getting redundant, it has been put into a new function ext4_ext_try_to_merge(). This gets called from ext4_ext_insert_extent() and ext4_ext_get_blocks(), when required. > > Cheers, Andreas > -- > Andreas Dilger > Principal Software Engineer > Cluster File Systems, Inc. Regards, ---- Amit Arora (aarora@in.ibm.com) Linux Technology Center IBM India ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-15 12:39 ` [RFC][Patch 2/2] " Amit K. Arora 2006-12-15 23:02 ` Andreas Dilger @ 2006-12-19 11:42 ` Amit K. Arora 2006-12-19 11:54 ` Amit K. Arora 2006-12-19 21:14 ` Andreas Dilger 1 sibling, 2 replies; 26+ messages in thread From: Amit K. Arora @ 2006-12-19 11:42 UTC (permalink / raw) To: linux-ext4; +Cc: suparna, cmm, suzuki, alex I wrote a simple tool to test these patches. The tool takes four arguments: * command: It may have either of the two values - "prealloc" or "write" * filename: This is the filename with relative path * offset: The offset within the file from where the preallocation, or the write should start. * length: Total number of bytes to be allocated/written from offset. Following cases were tested : 1. * preallocation from 0 to 32MB * write to various parts of the preallocated space in sets * observed that the extents get split and also get merged 2. * preallocate with holes at various places in the file * write to blocks starting from a hole and ending into preallocated blocks and vice-versa * try to write to entire set of blocks (i.e. from 0 to the last preallocated block) which has holes in between. I also tried some random preallocation and write operations. They seem to work fine. There is a patch also ready for e2fsprogs utils to recognize uninitialized extents, which I used to verify the results of the above testcases. I will post that patch in the next mail. Here is the code for the simple tool : #include<stdio.h> #include<stdlib.h> #include<fcntl.h> #include<errno.h> #define EXT4_IOC_FALLOCATE 0x40106609 struct ext4_falloc_input { unsigned long long offset; unsigned long long len; }; int do_prealloc(char* fname, struct ext4_falloc_input input) { int ret, fd = open(fname, O_CREAT|O_RDWR, 0666); if(fd<0) { printf("Error opening file %s\n", fname); return 1; } printf("%s : Trying to preallocate blocks (offset=%llu, len=%llu)\n", fname, input.offset, input.len); ret = ioctl(fd, EXT4_IOC_FALLOCATE, &input); if(ret <0) { printf("IOCTL: received error %d, ret=%d\n", errno, ret); close(fd); exit(1); } printf("IOCTL succedded ! ret=%d\n", ret); close(fd); } int do_write(char* fname, struct ext4_falloc_input input) { int ret, fd; char *buf; buf = (char *)malloc(input.len); fd = open(fname, O_CREAT|O_RDWR, 0666); if(fd<0) { printf("Error opening file %s\n", fname); return 1; } printf("%s : Trying to write to file (offset=%llu, len=%llu)\n", fname, input.offset, input.len); ret = lseek(fd, input.offset, SEEK_SET); if(ret != input.offset) { printf("lseek() failed error=%d, ret=%d\n", errno, ret); close(fd); return(1); } ret = write(fd, buf, input.len); if(ret != input.len) { printf("write() failed error=%d, ret=%d\n", errno, ret); close(fd); return(1); } printf("Write succedded ! Written %llu bytes ret=%d\n", input.len, ret); close(fd); } int main(int argc, char **argv) { struct ext4_falloc_input input; int ret = 1, fd; char *fname; if(argc<5) { printf("%s <CMD: prealloc/write> <filename-with-path> <offset> <length>\n", argv[0]); exit(1); } fname = argv[2]; input.offset=(unsigned long long)atol(argv[3]);; input.len=(unsigned long long)atol(argv[4]); if(input.offset<0 || input.len<= 0) { printf("%s: Invalid arguments.\n", argv[0]); exit(1); } if(!strcmp(argv[1], "prealloc")) ret = do_prealloc(fname, input); else if(!strcmp(argv[1], "write")) ret = do_write(fname, input); else printf("%s: Invalid arguments.\n", argv[0]); return ret; } ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-19 11:42 ` Amit K. Arora @ 2006-12-19 11:54 ` Amit K. Arora 2006-12-19 21:14 ` Andreas Dilger 1 sibling, 0 replies; 26+ messages in thread From: Amit K. Arora @ 2006-12-19 11:54 UTC (permalink / raw) To: linux-ext4; +Cc: suparna, cmm, suzuki, alex On Tue, Dec 19, 2006 at 05:12:51PM +0530, Amit K. Arora wrote: > I also tried some random preallocation and write operations. They seem > to work fine. There is a patch also ready for e2fsprogs utils to > recognize uninitialized extents, which I used to verify the results of > the above testcases. I will post that patch in the next mail. > This is a patch on top of e2fsprogs-1.39 + ext4 updates, which can be used to see extent details (like - actual size, whether initialized or uninitialized extent etc.) on an extent based file in ext4. debugfs tool can be used for this, after compiling e2fsprogs with this patch. This patch also enables EXT_DEBUG flag, so that the extent details get displayed. --- lib/ext2fs/bmap.c | 3 +- lib/ext2fs/ext4_extents.h | 12 +++++++++- lib/ext2fs/extents.c | 55 ++++++++++++++++++++++++++++------------------ 3 files changed, 47 insertions(+), 23 deletions(-) Index: e2fsprogs-1.39/lib/ext2fs/bmap.c =================================================================== --- e2fsprogs-1.39.orig/lib/ext2fs/bmap.c 2006-12-19 11:53:48.000000000 +0530 +++ e2fsprogs-1.39/lib/ext2fs/bmap.c 2006-12-19 11:53:52.000000000 +0530 @@ -45,7 +45,8 @@ ex = EXT_FIRST_EXTENT(eh); for (i = 0; i < eh->eh_entries; i++, ex++) { if ((ex->ee_block <= block) && - (block < ex->ee_block + ex->ee_len)) { + (block < ex->ee_block + + ext4_ext_get_actual_len(ex))) { *phys_blk = EXT4_EE_START(ex) + (block - ex->ee_block); return 0; Index: e2fsprogs-1.39/lib/ext2fs/ext4_extents.h =================================================================== --- e2fsprogs-1.39.orig/lib/ext2fs/ext4_extents.h 2006-12-19 11:53:48.000000000 +0530 +++ e2fsprogs-1.39/lib/ext2fs/ext4_extents.h 2006-12-19 15:55:32.000000000 +0530 @@ -37,7 +37,7 @@ * if EXT_DEBUG is defined you can use 'extdebug' mount option * to get lots of info what's going on */ -//#define EXT_DEBUG +#define EXT_DEBUG #ifdef EXT_DEBUG #define ext_debug(tree,fmt,a...) \ do { \ @@ -170,6 +170,16 @@ #define EXT_ASSERT(__x__) if (!(__x__)) BUG(); +/* + * Macro-instructions used to handle (mark/unmark/check/create) unitialized + * extents. Applications can issue an IOCTL for preallocation, which results + * in assigning unitialized extents to the file + */ +#define EXT4_CREATE_UNINITIALIZED_EXT 2 +#define ext4_ext_mark_uninitialized(ext) ((ext)->ee_len |= 0x8000) +#define ext4_ext_is_uninitialized(ext) ((ext)->ee_len & 0x8000) +#define ext4_ext_get_actual_len(ext) ((ext)->ee_len & 0x7FFF) + /* * this structure is used to gather extents from the tree via ioctl Index: e2fsprogs-1.39/lib/ext2fs/extents.c =================================================================== --- e2fsprogs-1.39.orig/lib/ext2fs/extents.c 2006-12-19 11:53:48.000000000 +0530 +++ e2fsprogs-1.39/lib/ext2fs/extents.c 2006-12-19 11:55:03.000000000 +0530 @@ -36,9 +36,11 @@ void show_extent(struct ext4_extent *ex) { - printf("extent: block=%u-%u len=%u start=%u start_hi=%u\n", - ex->ee_block, ex->ee_block + ex->ee_len - 1, - ex->ee_len, ex->ee_start, ex->ee_start_hi); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + printf("extent[%c]: block=%u-%u len=%u start=%u start_hi=%u\n", + ext4_ext_is_uninitialized(ex) ? 'u' : 'i', + ex->ee_block, ex->ee_block + ee_len - 1, ee_len, + ex->ee_start, ex->ee_start_hi); } #else #define show_header(eh) do { } while (0) @@ -75,7 +77,7 @@ if (EXT4_EE_START(ex) > EXT2_BLOCKS_COUNT(fs->super)) return EXT2_ET_EXTENT_LEAF_BAD; - if (ex->ee_len == 0) + if (ext4_ext_get_actual_len(ex) == 0) return EXT2_ET_EXTENT_LEAF_BAD; if (ex_prev) { @@ -84,13 +86,14 @@ return EXT2_ET_EXTENT_LEAF_BAD; /* extents must be in logical offset order */ - if (ex->ee_block < ex_prev->ee_block + ex_prev->ee_len) + if (ex->ee_block < ex_prev->ee_block + + ext4_ext_get_actual_len(ex_prev)) return EXT2_ET_EXTENT_LEAF_BAD; /* extents must not overlap physical blocks */ - if ((EXT4_EE_START(ex) < - EXT4_EE_START(ex_prev) + ex_prev->ee_len) && - (EXT4_EE_START(ex) + ex->ee_len > EXT4_EE_START(ex_prev))) + if ((EXT4_EE_START(ex) < EXT4_EE_START(ex_prev) + + ext4_ext_get_actual_len(ex_prev)) && (EXT4_EE_START(ex) + + ext4_ext_get_actual_len(ex) > EXT4_EE_START(ex_prev))) return EXT2_ET_EXTENT_LEAF_BAD; } @@ -98,7 +101,8 @@ if (ex->ee_block < ix->ei_block) return EXT2_ET_EXTENT_LEAF_BAD; - if (ix_len && ex->ee_block + ex->ee_len > ix->ei_block + ix_len) + if (ix_len && ex->ee_block + ext4_ext_get_actual_len(ex) > + ix->ei_block + ix_len) return EXT2_ET_EXTENT_LEAF_BAD; } @@ -144,6 +148,7 @@ { int entry = ex - EXT_FIRST_EXTENT(eh); struct ext4_extent *ex_new = ex + 1; + unsigned uninitialized=0; if (entry < 0 || entry > eh->eh_entries) return EXT2_ET_EXTENT_LEAF_BAD; @@ -151,18 +156,25 @@ if (eh->eh_entries >= eh->eh_max) return EXT2_ET_EXTENT_NO_SPACE; - if (count > ex->ee_len) + if (count > ext4_ext_get_actual_len(ex)) return EXT2_ET_EXTENT_LEAF_BAD; - if (count > ex->ee_len) + if (count > ext4_ext_get_actual_len(ex)) return EXT2_ET_EXTENT_LEAF_BAD; + if(ext4_ext_is_uninitialized(ex)) + uninitialized=1; + memmove(ex_new, ex, (eh->eh_entries - entry) * sizeof(*ex)); ++eh->eh_entries; ex->ee_len = count; ex_new->ee_len -= count; ex_new->ee_block += count; + if(uninitialized) { + ext4_ext_mark_uninitialized(ex); + ext4_ext_mark_uninitialized(ex_new); + } EXT4_EE_START_SET(ex_new, EXT4_EE_START(ex_new) + count); return 0; @@ -195,7 +207,7 @@ ex = EXT_FIRST_EXTENT(eh); for (i = 0; i < eh->eh_entries; i++, ex++) { show_extent(ex); - for (j = 0; j < ex->ee_len; j++) { + for (j = 0; j < ext4_ext_get_actual_len(ex); j++) { block_address = EXT4_EE_START(ex) + j; flags = (*ctx->func)(ctx->fs, &block_address, (ex->ee_block + j), @@ -216,15 +228,15 @@ #endif if (ex_prev && - block_address == - EXT4_EE_START(ex_prev) + ex_prev->ee_len && - ex->ee_block + j == - ex_prev->ee_block + ex_prev->ee_len) { + block_address == EXT4_EE_START(ex_prev) + + ext4_ext_get_actual_len(ex_prev) && + ex->ee_block + j == ex_prev->ee_block + + ext4_ext_get_actual_len(ex_prev)) { /* can merge block with prev extent */ ex_prev->ee_len++; ex->ee_len--; - if (ex->ee_len == 0) { + if (ext4_ext_get_actual_len(ex) == 0) { /* no blocks left in this one */ ext2fs_extent_remove(eh, ex); i--; ex--; @@ -238,7 +250,7 @@ } ret |= BLOCK_CHANGED; - } else if (ex->ee_len == 1) { + } else if (ext4_ext_get_actual_len(ex) == 1) { /* single-block extent is easy - * change extent directly */ EXT4_EE_START_SET(ex, block_address); @@ -250,7 +262,8 @@ ret |= BLOCK_ABORT | BLOCK_ERROR; return ret; - } else if (j > 0 && (ex + 1)->ee_len > 1 && + } else if (j > 0 && + ext4_ext_get_actual_len(ex + 1) > 1 && ext2fs_extent_split(eh, ex + 1, 1)) { /* split after new block failed */ /* No multi-level split yet */ @@ -258,7 +271,7 @@ return ret; } else if (j == 0) { - if (ex->ee_len != 1) { + if (ext4_ext_get_actual_len(ex) != 1) { /* this is an internal error */ ret |= BLOCK_ABORT |BLOCK_ERROR; return ret; @@ -269,7 +282,7 @@ } else { ex++; i++; - if (ex->ee_len != 1) { + if (ext4_ext_get_actual_len(ex) != 1) { /* this is an internal error */ ret |= BLOCK_ABORT |BLOCK_ERROR; return ret; ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-19 11:42 ` Amit K. Arora 2006-12-19 11:54 ` Amit K. Arora @ 2006-12-19 21:14 ` Andreas Dilger 2006-12-19 21:23 ` Eric Sandeen 2006-12-22 15:16 ` Amit K. Arora 1 sibling, 2 replies; 26+ messages in thread From: Andreas Dilger @ 2006-12-19 21:14 UTC (permalink / raw) To: Amit K. Arora; +Cc: linux-ext4, suparna, cmm, suzuki, alex On Dec 19, 2006 17:12 +0530, Amit K. Arora wrote: > I wrote a simple tool to test these patches. The tool takes four > arguments: > > * command: It may have either of the two values - "prealloc" or "write" > * filename: This is the filename with relative path > * offset: The offset within the file from where the preallocation, or > the write should start. > * length: Total number of bytes to be allocated/written from offset. > > Following cases were tested : > 1. * preallocation from 0 to 32MB > * write to various parts of the preallocated space in sets > * observed that the extents get split and also get merged > > 2. * preallocate with holes at various places in the file > * write to blocks starting from a hole and ending into preallocated > blocks and vice-versa > * try to write to entire set of blocks (i.e. from 0 to the last > preallocated block) which has holes in between. An ideal test would be to modify fsx to (randomly) do preallocations instead of truncates that increase the size. Cheers, Andreas -- Andreas Dilger Principal Software Engineer Cluster File Systems, Inc. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-19 21:14 ` Andreas Dilger @ 2006-12-19 21:23 ` Eric Sandeen 2006-12-20 8:19 ` Amit K. Arora 2006-12-22 15:16 ` Amit K. Arora 1 sibling, 1 reply; 26+ messages in thread From: Eric Sandeen @ 2006-12-19 21:23 UTC (permalink / raw) To: Andreas Dilger; +Cc: Amit K. Arora, linux-ext4, suparna, cmm, suzuki, alex Andreas Dilger wrote: > On Dec 19, 2006 17:12 +0530, Amit K. Arora wrote: >> I wrote a simple tool to test these patches. The tool takes four >> arguments: >> >> * command: It may have either of the two values - "prealloc" or "write" >> * filename: This is the filename with relative path >> * offset: The offset within the file from where the preallocation, or >> the write should start. >> * length: Total number of bytes to be allocated/written from offset. >> >> Following cases were tested : >> 1. * preallocation from 0 to 32MB >> * write to various parts of the preallocated space in sets >> * observed that the extents get split and also get merged >> >> 2. * preallocate with holes at various places in the file >> * write to blocks starting from a hole and ending into preallocated >> blocks and vice-versa >> * try to write to entire set of blocks (i.e. from 0 to the last >> preallocated block) which has holes in between. > > An ideal test would be to modify fsx to (randomly) do preallocations > instead of truncates that increase the size. the fsx in the xfs qa suite already does this (albeit with the special xfs calls, of course) But it might be worth looking at as a starting point, and also maybe to keep the options and behavior similar to one* version of fsx that is out there. :) http://oss.sgi.com/cgi-bin/cvsweb.cgi/xfs-cmds/xfstests/ltp/fsx.c?rev=1.7 -Eric *http://kernelslacker.livejournal.com/?skip=43 ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-19 21:23 ` Eric Sandeen @ 2006-12-20 8:19 ` Amit K. Arora 0 siblings, 0 replies; 26+ messages in thread From: Amit K. Arora @ 2006-12-20 8:19 UTC (permalink / raw) To: Eric Sandeen; +Cc: linux-ext4, suparna, cmm, suzuki, alex On Tue, Dec 19, 2006 at 03:23:41PM -0600, Eric Sandeen wrote: > Andreas Dilger wrote: > > On Dec 19, 2006 17:12 +0530, Amit K. Arora wrote: > >> I wrote a simple tool to test these patches. The tool takes four > >> arguments: > >> > >> * command: It may have either of the two values - "prealloc" or "write" > >> * filename: This is the filename with relative path > >> * offset: The offset within the file from where the preallocation, or > >> the write should start. > >> * length: Total number of bytes to be allocated/written from offset. > >> > >> Following cases were tested : > >> 1. * preallocation from 0 to 32MB > >> * write to various parts of the preallocated space in sets > >> * observed that the extents get split and also get merged > >> > >> 2. * preallocate with holes at various places in the file > >> * write to blocks starting from a hole and ending into preallocated > >> blocks and vice-versa > >> * try to write to entire set of blocks (i.e. from 0 to the last > >> preallocated block) which has holes in between. > > > > An ideal test would be to modify fsx to (randomly) do preallocations > > instead of truncates that increase the size. > > the fsx in the xfs qa suite already does this (albeit with the special > xfs calls, of course) Hmm... the fsx in xfs qa suite does preallocation, but _only_ at the start of the testcase - i.e. only once. I guess what Andreas suggested is to use random preallocations instead of random truncates. That will essentially mean to call "doprealloc(size)" (which will preallocate size number of bytes from start of the file), instead of dotruncate(size) in the "test(void)" function; when fsx is passed with a relevant option (say "-x"). Am I right ? BTW, I plan to modify the fsx-linux.c in LTP for this. Will update on how it goes... > > But it might be worth looking at as a starting point, and also maybe to > keep the options and behavior similar to one* version of fsx that is out > there. :) Yes, it will be nice to keep the behavior and the option same here. > > http://oss.sgi.com/cgi-bin/cvsweb.cgi/xfs-cmds/xfstests/ltp/fsx.c?rev=1.7 > > -Eric > > *http://kernelslacker.livejournal.com/?skip=43 ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-19 21:14 ` Andreas Dilger 2006-12-19 21:23 ` Eric Sandeen @ 2006-12-22 15:16 ` Amit K. Arora 2006-12-22 15:31 ` Alex Tomas 1 sibling, 1 reply; 26+ messages in thread From: Amit K. Arora @ 2006-12-22 15:16 UTC (permalink / raw) To: Andreas Dilger; +Cc: linux-ext4, suparna, cmm, suzuki, alex On Tue, Dec 19, 2006 at 02:14:09PM -0700, Andreas Dilger wrote: > On Dec 19, 2006 17:12 +0530, Amit K. Arora wrote: > > I wrote a simple tool to test these patches. The tool takes four > > arguments: > > > > * command: It may have either of the two values - "prealloc" or "write" > > * filename: This is the filename with relative path > > * offset: The offset within the file from where the preallocation, or > > the write should start. > > * length: Total number of bytes to be allocated/written from offset. > > > > Following cases were tested : > > 1. * preallocation from 0 to 32MB > > * write to various parts of the preallocated space in sets > > * observed that the extents get split and also get merged > > > > 2. * preallocate with holes at various places in the file > > * write to blocks starting from a hole and ending into preallocated > > blocks and vice-versa > > * try to write to entire set of blocks (i.e. from 0 to the last > > preallocated block) which has holes in between. Hi Andreas, > An ideal test would be to modify fsx to (randomly) do preallocations > instead of truncates that increase the size. Thanks for the suggestion. The modified fsx (used "fsx-linux" in LTP as the base) did uncover couple of bugs. One has a straight forward fix of using ext4_ext_get_actual_len() in ext4_ext_put_gap_in_cache(). Without this change, it is resulting in a panic when a ftruncate() of size zero is done on an extent enabled file with preallocated blocks. The other problem is slightly more complicated and occurs in some specific conditions (filesize >= 512KB and number of operations >= 100000). When it happens, it results in duplicate entries for a few logical block numbers in the tree. It looks as if some kind of "leak" is happening while splitting/merging extents. Following is a typical example (this is an output from debugfs in e2fsprogs-1.39 utils, which has the patch supporting preallocation applied to it): debugfs 1.39 (29-May-2006) debugfs: stat testfile header: magic=f30a entries=1 max=4 depth=1 generation=0 index: block=0 leaf=1201 leaf_hi=0 unused=0 header: magic=f30a entries=16 max=84 depth=0 generation=0 extent[u]: block=0-25 len=26 start=645 start_hi=0 extent[i]: block=26-67 len=42 start=671 start_hi=0 extent[u]: block=68-502 len=435 start=713 start_hi=0 extent[i]: block=76-107 len=32 start=589 start_hi=0 extent[i]: block=116-131 len=16 start=629 start_hi=0 extent[i]: block=132-134 len=3 start=1257 start_hi=0 extent[i]: block=135-177 len=43 start=2154 start_hi=0 extent[i]: block=178-194 len=17 start=4206 start_hi=0 extent[i]: block=195-247 len=53 start=1148 start_hi=0 extent[i]: block=248-248 len=1 start=1202 start_hi=0 extent[i]: block=249-249 len=1 start=1249 start_hi=0 extent[i]: block=250-289 len=40 start=1203 start_hi=0 extent[i]: block=290-361 len=72 start=3161 start_hi=0 extent[i]: block=362-366 len=5 start=5222 start_hi=0 extent[i]: block=367-397 len=31 start=3238 start_hi=0 extent[i]: block=398-511 len=114 start=1351 start_hi=0 Inode: 11 Type: regular Mode: 0644 Flags: 0x80000 Generation: 3161384699 User: 0 Group: 0 Size: 524288 File ACL: 0 Directory ACL: 0 Links: 1 Blockcount: 1864 Fragment: Address: 0 Number: 0 Size: 0 ctime: 0x458bc897 -- Fri Dec 22 06:59:19 2006 atime: 0x458bc897 -- Fri Dec 22 06:59:19 2006 mtime: 0x458bc897 -- Fri Dec 22 06:59:19 2006 BLOCKS: (IND):1201, (0-502):645-1147, (76-107):589-620, (116-131):629-644, (132-134):1257-1259, (135-177):2154-2196, (178-194):4206-4222, (195-247):1148-1200, (248):1202, (249):1249, (250-289):1203-1242, (290-361):3161-3232, (362-366):5222-5226, (367-397):3238-3268, (398-511):1351-1464 TOTAL: 932 Above we can see that block numbers from 68 to 502 are each covered by more than one extent (besides couple of holes, which also might be part of the same problem). Note: A "u" in extent[u] donates that this extent is uninitialized, and thus was created as part of preallocation and noone has written to it. An "i" signifies that the extent is initialized. I am trying to solve this issue currently. Any suggestions are more than welcome.. :) Regards, Amit Arora > > Cheers, Andreas > -- > Andreas Dilger > Principal Software Engineer > Cluster File Systems, Inc. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][Patch 2/2] Persistent preallocation in ext4 2006-12-22 15:16 ` Amit K. Arora @ 2006-12-22 15:31 ` Alex Tomas 0 siblings, 0 replies; 26+ messages in thread From: Alex Tomas @ 2006-12-22 15:31 UTC (permalink / raw) To: Amit K. Arora; +Cc: Andreas Dilger, linux-ext4, suparna, cmm, suzuki, alex >>>>> Amit K Arora (AKA) writes: AKA> Above we can see that block numbers from 68 to 502 are each covered by AKA> more than one extent (besides couple of holes, which also might be part AKA> of the same problem). AKA> Note: A "u" in extent[u] donates that this extent is AKA> uninitialized, and thus was created as part of preallocation and noone AKA> has written to it. An "i" signifies that the extent is initialized. AKA> I am trying to solve this issue currently. Any suggestions are more than AKA> welcome.. :) I'd write a simple function that check given leaf for consistency (in this case you need to check that every subsequent extent doesn't overlap previous one) and add this check in few points (before and after change). this worked very well, especially if all extents fit single block. thanks, Alex ^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread, other threads:[~2007-01-09 9:05 UTC | newest]
Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-12-05 13:43 [RFC][Patch 1/1] Persistent preallocation in ext4 Amit K. Arora
2006-12-06 5:58 ` Amit K. Arora
2006-12-12 1:28 ` Mingming Cao
2006-12-12 6:23 ` Amit K. Arora
2006-12-13 0:20 ` Mingming Cao
2006-12-13 10:01 ` Amit K. Arora
2006-12-13 13:36 ` Dave Kleikamp
2006-12-13 15:38 ` Suparna Bhattacharya
2006-12-13 15:54 ` Mingming Cao
2006-12-15 12:35 ` [RFC][Patch 1/2] " Amit K. Arora
2006-12-19 11:05 ` Amit K. Arora
[not found] ` <20061219211206.GO5937@schatzie.adilger.int>
2006-12-20 6:28 ` Amit K. Arora
2006-12-27 23:30 ` Mingming Cao
2007-01-02 11:04 ` Amit K. Arora
2007-01-02 22:47 ` Mingming Cao
2007-01-09 9:05 ` Amit K. Arora
2006-12-15 12:39 ` [RFC][Patch 2/2] " Amit K. Arora
2006-12-15 23:02 ` Andreas Dilger
2006-12-16 4:30 ` Amit K. Arora
2006-12-19 11:42 ` Amit K. Arora
2006-12-19 11:54 ` Amit K. Arora
2006-12-19 21:14 ` Andreas Dilger
2006-12-19 21:23 ` Eric Sandeen
2006-12-20 8:19 ` Amit K. Arora
2006-12-22 15:16 ` Amit K. Arora
2006-12-22 15:31 ` Alex Tomas
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).