* [RFC v2 1/9] ext4: Adding itree feature and inode flags
2013-05-13 15:42 [RFC v2 0/9] ext4: An Auxiliary Tree for the Directory Index Radek Pazdera
@ 2013-05-13 15:42 ` Radek Pazdera
2013-05-13 15:42 ` [RFC v2 2/9] ext4: Allow sorting dx_map by inode as well Radek Pazdera
` (7 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Radek Pazdera @ 2013-05-13 15:42 UTC (permalink / raw)
To: linux-ext4; +Cc: lczerner, kasparek, Radek Pazdera
This commit adds two new flags for the itree. A new read-only file
system feature flag:
EXT4_FEATURE_RO_COMPAT_ITREE 0x1000
And an inode flag to indicate that a directory has itree:
EXT4_ITREE_FL = 0x10000000
EXT4_INODE_ITREE = 29
Also a macro for checking these flags on a directory was added.
Signed-off-by: Radek Pazdera <rpazdera@redhat.com>
---
fs/ext4/ext4.h | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5aae3d1..9512702 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -388,6 +388,7 @@ struct flex_groups {
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
+#define EXT4_ITREE_FL 0x20000000 /* Directory has itree */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
@@ -445,6 +446,7 @@ enum {
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
+ EXT4_INODE_ITREE = 29, /* Directory has itree */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
};
@@ -489,6 +491,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
+ CHECK_FLAG_VALUE(ITREE);
CHECK_FLAG_VALUE(RESERVED);
}
@@ -1481,6 +1484,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
* GDT_CSUM bits are mutually exclusive.
*/
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
+#define EXT4_FEATURE_RO_COMPAT_ITREE 0x1000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1530,7 +1534,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
- EXT4_FEATURE_RO_COMPAT_QUOTA)
+ EXT4_FEATURE_RO_COMPAT_QUOTA|\
+ EXT4_FEATURE_RO_COMPAT_ITREE)
/*
* Default values for user and/or group using reserved blocks
@@ -1688,6 +1693,11 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+#define dx_itree(dir) (is_dx(dir) && \
+ EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, \
+ EXT4_FEATURE_RO_COMPAT_ITREE) && \
+ ext4_test_inode_flag((dir), EXT4_INODE_ITREE))
+
/* Legal values for the dx_root hash_version field: */
#define DX_HASH_LEGACY 0
--
1.7.11.7
^ permalink raw reply related [flat|nested] 10+ messages in thread* [RFC v2 2/9] ext4: Allow sorting dx_map by inode as well
2013-05-13 15:42 [RFC v2 0/9] ext4: An Auxiliary Tree for the Directory Index Radek Pazdera
2013-05-13 15:42 ` [RFC v2 1/9] ext4: Adding itree feature and inode flags Radek Pazdera
@ 2013-05-13 15:42 ` Radek Pazdera
2013-05-13 15:42 ` [RFC v2 3/9] ext4: Adding a link to itree to the dx_root struct Radek Pazdera
` (6 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Radek Pazdera @ 2013-05-13 15:42 UTC (permalink / raw)
To: linux-ext4; +Cc: lczerner, kasparek, Radek Pazdera
This commit extends the code used by the dir index to sort its leaf
blocks by hash before splits. The implementation of itree needs to do
a similar sort when the tree is initialized. However, the order is
different.
The dx_sort_map() function now takes a pointer to a compare function
which determines the resulting ordering of the entries. At the moment,
there are two options avaiable -- hash_order() and inode_order().
Also a bit from the do_split() function was moved to a spearate helper
function called dx_map_split_point(). This will be used by the itree
code as well.
Signed-off-by: Radek Pazdera <rpazdera@redhat.com>
---
fs/ext4/namei.c | 77 ++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 54 insertions(+), 23 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6653fc3..6f73f81 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -226,6 +226,7 @@ struct dx_frame
struct dx_map_entry
{
+ u32 inode;
u32 hash;
u16 offs;
u16 size;
@@ -257,7 +258,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
struct dx_hash_info *hinfo, struct dx_map_entry map[]);
-static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+static void dx_sort_map(int (*cmp)(struct dx_map_entry*, struct dx_map_entry*),
+ struct dx_map_entry *map, unsigned count);
+static inline int hash_order(struct dx_map_entry *e1, struct dx_map_entry *e2);
+static inline int inode_order(struct dx_map_entry *e1, struct dx_map_entry *e2);
+static unsigned dx_map_split_point(struct dx_map_entry *map,
+ unsigned count, unsigned long blocksize);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
struct dx_map_entry *offsets, int count, unsigned blocksize);
static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
@@ -1072,8 +1078,9 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
while ((char *) de < base + blocksize) {
if (de->name_len && de->inode) {
- ext4fs_dirhash(de->name, de->name_len, &h);
map_tail--;
+ map_tail->inode = le32_to_cpu(de->inode);
+ ext4fs_dirhash(de->name, de->name_len, &h);
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
map_tail->size = le16_to_cpu(de->rec_len);
@@ -1086,8 +1093,21 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
return count;
}
-/* Sort map by hash value */
-static void dx_sort_map (struct dx_map_entry *map, unsigned count)
+static inline int hash_order(struct dx_map_entry *e1, struct dx_map_entry *e2)
+{
+ return e1->hash < e2->hash;
+}
+
+static inline int inode_order(struct dx_map_entry *e1, struct dx_map_entry *e2)
+{
+ if (e1->inode == e2->inode)
+ return e1->hash < e2->hash;
+ return e1->inode < e2->inode;
+}
+
+/* Sort map. The order is determined by the comparison function (first arg) */
+static void dx_sort_map(int (*cmp)(struct dx_map_entry*, struct dx_map_entry*),
+ struct dx_map_entry *map, unsigned count)
{
struct dx_map_entry *p, *q, *top = map + count - 1;
int more;
@@ -1097,7 +1117,7 @@ static void dx_sort_map (struct dx_map_entry *map, unsigned count)
if (count - 9 < 2) /* 9, 10 -> 11 */
count = 11;
for (p = top, q = p - count; q >= map; p--, q--)
- if (p->hash < q->hash)
+ if (cmp(p, q))
swap(*p, *q);
}
/* Garden variety bubble sort */
@@ -1105,14 +1125,34 @@ static void dx_sort_map (struct dx_map_entry *map, unsigned count)
more = 0;
q = top;
while (q-- > map) {
- if (q[1].hash >= q[0].hash)
- continue;
- swap(*(q+1), *q);
- more = 1;
+ if (cmp(q + 1, q)) {
+ swap(*(q + 1), *q);
+ more = 1;
+ }
}
} while(more);
}
+/* Split the existing block in the middle, size-wise */
+static unsigned dx_map_split_point(struct dx_map_entry *map,
+ unsigned count, unsigned long blocksize)
+{
+ unsigned size, move;
+ int i;
+
+ size = 0;
+ move = 0;
+ for (i = count - 1; i >= 0; i--) {
+ /* is more than half of this entry in 2nd half of the block? */
+ if (size + map[i].size/2 > blocksize/2)
+ break;
+ size += map[i].size;
+ move++;
+ }
+
+ return count - move;
+}
+
static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
{
struct dx_entry *entries = frame->entries;
@@ -1532,11 +1572,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
u32 hash2;
struct dx_map_entry *map;
char *data1 = (*bh)->b_data, *data2;
- unsigned split, move, size;
+ unsigned split;
struct ext4_dir_entry_2 *de = NULL, *de2;
struct ext4_dir_entry_tail *t;
int csum_size = 0;
- int err = 0, i;
+ int err = 0;
if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -1567,19 +1607,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
count = dx_make_map((struct ext4_dir_entry_2 *) data1,
blocksize, hinfo, map);
map -= count;
- dx_sort_map(map, count);
- /* Split the existing block in the middle, size-wise */
- size = 0;
- move = 0;
- for (i = count-1; i >= 0; i--) {
- /* is more than half of this entry in 2nd half of the block? */
- if (size + map[i].size/2 > blocksize/2)
- break;
- size += map[i].size;
- move++;
- }
+ dx_sort_map(hash_order, map, count);
+
/* map index at which we will split */
- split = count - move;
+ split = dx_map_split_point(map, count, blocksize);
hash2 = map[split].hash;
continued = hash2 == map[split - 1].hash;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
--
1.7.11.7
^ permalink raw reply related [flat|nested] 10+ messages in thread* [RFC v2 3/9] ext4: Adding a link to itree to the dx_root struct
2013-05-13 15:42 [RFC v2 0/9] ext4: An Auxiliary Tree for the Directory Index Radek Pazdera
2013-05-13 15:42 ` [RFC v2 1/9] ext4: Adding itree feature and inode flags Radek Pazdera
2013-05-13 15:42 ` [RFC v2 2/9] ext4: Allow sorting dx_map by inode as well Radek Pazdera
@ 2013-05-13 15:42 ` Radek Pazdera
2013-05-13 15:42 ` [RFC v2 4/9] ext4: Adding itree structures Radek Pazdera
` (5 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Radek Pazdera @ 2013-05-13 15:42 UTC (permalink / raw)
To: linux-ext4; +Cc: lczerner, kasparek, Radek Pazdera
The dx_tail struct that can be stored at the end of each root block was
extended with an additional link to the itree root block.
This commit renames the dx_tail to dx_csum_entry and adds dx_itree_entry
that holds the 64bit block pointer to itree root. The two structures are
independent of each other, so in case any of the features is disabled,
the correspoinding structure doesn't have to be in the tail. However,
a case where the corresponding flag is off and the structure still
resides in the tail is possible when the flag is turned off by tune2fs.
Signed-off-by: Radek Pazdera <rpazdera@redhat.com>
---
fs/ext4/namei.c | 204 ++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 162 insertions(+), 42 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6f73f81..20cf635 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -235,9 +235,30 @@ struct dx_map_entry
/*
* This goes at the end of each htree block.
*/
+struct dx_csum_entry {
+ u32 de_reserved;
+ __le32 de_checksum; /* crc32c(uuid+inum+dirblock) */
+};
+
+/*
+ * This goes at the end of a htree root block, if there is an itree
+ * available for that directory.
+ */
+struct dx_itree_entry {
+ __le64 de_itree_root;
+};
+
+/*
+ * This is a memory-only structure for easier handling the tail of
+ * dx_node. One or even both members can be set to NULL, which means
+ * that the node doesn't have the particular entry.
+ */
struct dx_tail {
- u32 dt_reserved;
- __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
+ void *start;
+ int len;
+
+ struct dx_csum_entry *csum;
+ struct dx_itree_entry *itree;
};
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
@@ -250,6 +271,10 @@ static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
+static int dx_get_itree_root(struct inode *inode, struct ext4_dir_entry *dirent,
+ ext4_fsblk_t *itree_root);
+static int dx_set_itree_root(struct inode *inode, struct ext4_dir_entry *dirent,
+ ext4_fsblk_t itree_root);
static struct dx_frame *dx_probe(const struct qstr *d_name,
struct inode *dir,
struct dx_hash_info *hinfo,
@@ -417,81 +442,136 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
return (struct dx_countlimit *)(((void *)dirent) + count_offset);
}
-static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
- int count_offset, int count, struct dx_tail *t)
+static int dx_get_tail(struct inode *inode, struct ext4_dir_entry *dirent,
+ struct dx_tail *tail)
+{
+ struct dx_countlimit *c;
+ int tail_space, limit, count_offset;
+ void *tail_ptr;
+
+ c = get_dx_countlimit(inode, dirent, &count_offset);
+ if (!c) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return -EIO;
+ }
+ limit = le16_to_cpu(c->limit);
+
+ memset(tail, 0, sizeof(struct dx_tail));
+ tail_ptr = tail->start = (void *)(((struct dx_entry *)c) + limit);
+ tail_space = EXT4_BLOCK_SIZE(inode->i_sb) -
+ (count_offset + (limit * sizeof(struct dx_entry)));
+
+ /* TODO This still requires some attention.
+ *
+ * We must handle cases where the tail structure is there,
+ * but the feature flag is off. This can happen when the
+ * feature is turned off using tune2fs.
+ *
+ * This means that, either the checksum part of the tail will
+ * be always added along with the itree one, even when the csum
+ * flag is off, or we need a way of differentiating between the
+ * two structures in case there is only one present.
+ *
+ * Also, in case both features are on and itree is turned off,
+ * we still need to use the structure while computing the checksum.
+ *
+ * Adding the pointer here will also require a patch for e2fsck.
+ */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ tail_space >= sizeof(struct dx_csum_entry)) {
+ tail->len += sizeof(struct dx_csum_entry);
+ tail->csum = (struct dx_csum_entry *)tail_ptr;
+ tail_ptr += sizeof(struct dx_csum_entry);
+ tail_space -= sizeof(struct dx_csum_entry);
+ }
+
+ if (dx_itree(inode) && tail_space >= sizeof(struct dx_itree_entry)) {
+ tail->len += sizeof(struct dx_itree_entry);
+ tail->itree = (struct dx_itree_entry *)tail_ptr;
+ }
+
+ return 0;
+}
+
+static int ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+ struct dx_tail *tail, __le32 *csum)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- __u32 csum;
- __le32 save_csum;
- int size;
+ struct dx_countlimit *c;
+ int size, count, count_offset;
+ __u32 new_csum;
+ __le32 old_csum;
+
+ c = get_dx_countlimit(inode, dirent, &count_offset);
+ if (!c)
+ return -EIO;
+ count = le16_to_cpu(c->count);
size = count_offset + (count * sizeof(struct dx_entry));
- save_csum = t->dt_checksum;
- t->dt_checksum = 0;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
- csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
- t->dt_checksum = save_csum;
+ old_csum = tail->csum->de_checksum;
+ tail->csum->de_checksum = 0;
+ new_csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ new_csum = ext4_chksum(sbi, new_csum, (__u8 *)tail->start, tail->len);
+ tail->csum->de_checksum = old_csum;
- return cpu_to_le32(csum);
+ *csum = cpu_to_le32(new_csum);
+ return 0;
}
static int ext4_dx_csum_verify(struct inode *inode,
struct ext4_dir_entry *dirent)
{
- struct dx_countlimit *c;
- struct dx_tail *t;
- int count_offset, limit, count;
+ struct dx_tail tail;
+ int err;
+ __le32 csum;
if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
return 1;
- c = get_dx_countlimit(inode, dirent, &count_offset);
- if (!c) {
- EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ err = dx_get_tail(inode, dirent, &tail);
+ if (err)
+ return err;
+
+ if (!tail.csum) {
+ warn_no_space_for_csum(inode);
return 1;
}
- limit = le16_to_cpu(c->limit);
- count = le16_to_cpu(c->count);
- if (count_offset + (limit * sizeof(struct dx_entry)) >
- EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
- warn_no_space_for_csum(inode);
+
+ err = ext4_dx_csum(inode, dirent, &tail, &csum);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
return 1;
}
- t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
- if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
- count, t))
+ if (tail.csum->de_checksum != csum)
return 0;
return 1;
}
static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
{
- struct dx_countlimit *c;
- struct dx_tail *t;
- int count_offset, limit, count;
+ struct dx_tail tail;
+ int err;
if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
return;
- c = get_dx_countlimit(inode, dirent, &count_offset);
- if (!c) {
- EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ err = dx_get_tail(inode, dirent, &tail);
+ if (err)
return;
- }
- limit = le16_to_cpu(c->limit);
- count = le16_to_cpu(c->count);
- if (count_offset + (limit * sizeof(struct dx_entry)) >
- EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+
+ if (!tail.csum) {
warn_no_space_for_csum(inode);
return;
}
- t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
- t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
+ err = ext4_dx_csum(inode, dirent, &tail, &(tail.csum->de_checksum));
+ if (err)
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
}
static inline int ext4_handle_dirty_dx_node(handle_t *handle,
@@ -564,7 +644,9 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
- entry_space -= sizeof(struct dx_tail);
+ entry_space -= sizeof(struct dx_csum_entry);
+ if (dx_itree(dir))
+ entry_space -= sizeof(struct dx_itree_entry);
return entry_space / sizeof(struct dx_entry);
}
@@ -574,10 +656,48 @@ static inline unsigned dx_node_limit(struct inode *dir)
if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
- entry_space -= sizeof(struct dx_tail);
+ entry_space -= sizeof(struct dx_csum_entry);
return entry_space / sizeof(struct dx_entry);
}
+static int dx_get_itree_root(struct inode *inode, struct ext4_dir_entry *dirent,
+ ext4_fsblk_t *itree_root)
+{
+ int err;
+ struct dx_tail tail;
+
+ err = dx_get_tail(inode, dirent, &tail);
+ if (err)
+ return err;
+
+ if (!tail.itree) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return -EIO;
+ }
+
+ *itree_root = le64_to_cpu(tail.itree->de_itree_root);
+ return 0;
+}
+
+static int dx_set_itree_root(struct inode *inode, struct ext4_dir_entry *dirent,
+ ext4_fsblk_t itree_root)
+{
+ int err;
+ struct dx_tail tail;
+
+ err = dx_get_tail(inode, dirent, &tail);
+ if (err)
+ return err;
+
+ if (!tail.itree) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return -EIO;
+ }
+
+ tail.itree->de_itree_root = cpu_to_le64(itree_root);
+ return 0;
+}
+
/*
* Debug
*/
--
1.7.11.7
^ permalink raw reply related [flat|nested] 10+ messages in thread* [RFC v2 4/9] ext4: Adding itree structures
2013-05-13 15:42 [RFC v2 0/9] ext4: An Auxiliary Tree for the Directory Index Radek Pazdera
` (2 preceding siblings ...)
2013-05-13 15:42 ` [RFC v2 3/9] ext4: Adding a link to itree to the dx_root struct Radek Pazdera
@ 2013-05-13 15:42 ` Radek Pazdera
2013-05-13 15:42 ` [RFC v2 5/9] ext4: Adding itree implementation I - Core Radek Pazdera
` (4 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Radek Pazdera @ 2013-05-13 15:42 UTC (permalink / raw)
To: linux-ext4; +Cc: lczerner, kasparek, Radek Pazdera
This commit adds 5 new structures that will be used in the itree code.
On-disk:
struct itree_entry Single entry in an itree_node.
struct itree_node A non-leaf node in the itree.
struct itree_leaf_head Comes first in every itree leaf block.
struct itree_leaf_entry A dirent in itree leaf.
Memory only:
struct itree_frame Represents a path through a tree to one of
its leaves.
struct itree_key For representing the key for itree, which
is an inode number and file name hash
Signed-off-by: Radek Pazdera <rpazdera@redhat.com>
---
fs/ext4/namei.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 20cf635..c9b6c91 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -261,6 +261,56 @@ struct dx_tail {
struct dx_itree_entry *itree;
};
+/*
+ * itree structures
+ */
+
+/* A single entry in the itree_node */
+struct itree_entry {
+ __le32 inode;
+ __le32 hash;
+ __le64 block;
+ __u8 fullness;
+ __u8 flags;
+};
+
+#define ITREE_NODE_FL_CONT 1
+
+/* A non-leaf node of the itree */
+struct itree_node {
+ __le16 count;
+ __le16 limit;
+ __le32 checksum;
+ __u8 indirect_levels;
+ struct itree_entry entries[0];
+};
+
+/* Comes first in every itree leaf block */
+struct itree_leaf_head {
+ __le32 checksum;
+ __le32 used_length;
+};
+
+/* A dirent with hash in an itree leaf block */
+#define ITREE_LE_HEAD_LEN 4
+struct itree_leaf_entry {
+ __le32 hash;
+ struct ext4_dir_entry_2 dirent;
+};
+
+/* For easier manipulation of the compound key */
+struct itree_key {
+ u32 inode;
+ u32 hash;
+};
+
+/* Represents a position within an itree_node block */
+struct itree_frame {
+ struct buffer_head *bh;
+ struct itree_entry *entries;
+ struct itree_entry *at;
+};
+
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
--
1.7.11.7
^ permalink raw reply related [flat|nested] 10+ messages in thread* [RFC v2 5/9] ext4: Adding itree implementation I - Core
2013-05-13 15:42 [RFC v2 0/9] ext4: An Auxiliary Tree for the Directory Index Radek Pazdera
` (3 preceding siblings ...)
2013-05-13 15:42 ` [RFC v2 4/9] ext4: Adding itree structures Radek Pazdera
@ 2013-05-13 15:42 ` Radek Pazdera
2013-05-13 15:42 ` [RFC v2 6/9] ext4: Adding itree implementation II - Inserting Radek Pazdera
` (3 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Radek Pazdera @ 2013-05-13 15:42 UTC (permalink / raw)
To: linux-ext4; +Cc: lczerner, kasparek, Radek Pazdera
This commit contains the basic code related to the new inode tree
including the common/helper functions, tree initialisation, and
a few functions for debugging.
Signed-off-by: Radek Pazdera <rpazdera@redhat.com>
---
fs/ext4/namei.c | 729 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 729 insertions(+)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index c9b6c91..1f9ea5b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -311,6 +311,49 @@ struct itree_frame {
struct itree_entry *at;
};
+/*
+ * The depth of the tree is limited for convenience, so we can allocate
+ * itree_frames statically on the stack.
+ *
+ * With three levels, the tree can take around 12 millions of entries
+ * in the worst case scenario of 255 characters per file name and each
+ * node of the tree only half-full. In a much more likely case of 20B
+ * per name and 75% average fullness, the tree capacity is roughly 0.5
+ * billion of entries.
+ *
+ * In case those limits are exceeded, the capacity of the tree can be
+ * increased by incrementing this constant.
+ */
+#define ITREE_MAX_DEPTH 3
+
+static int itree_init(handle_t *handle, struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct ext4_dir_entry_2 *dirents,
+ struct dentry *entry, struct inode *inode,
+ ext4_fsblk_t *root_block);
+
+static int itree_next_frame(struct inode *dir, u32 ino,
+ struct itree_frame *frames,
+ struct itree_frame *frame_in,
+ int allways);
+
+#define itree_leaf_for_each(le, de, head, blocksize) \
+ for (le = ((void *)head + sizeof(struct itree_leaf_head)), \
+ de = &(le->dirent); \
+ le < (struct itree_leaf_entry *)((void *)head + blocksize); \
+ le = itree_next_leaf_entry(le, blocksize), de = &(le->dirent))
+
+#define ITREE_DEBUG__
+#ifdef ITREE_DEBUG
+#define itree_debug(command) command
+static void itree_show_node(struct buffer_head *bh);
+static void itree_show_leaf(struct inode *dir, struct buffer_head *bh,
+ ext4_fsblk_t *block);
+static void itree_show(struct inode *dir, ext4_fsblk_t block);
+#else
+#define itree_debug(command)
+#endif
+
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
@@ -3391,3 +3434,689 @@ const struct inode_operations ext4_special_inode_operations = {
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
};
+
+/*
+ * TODO Add BUFFER_TRACEs where they should be
+ */
+
+static unsigned get_itree_node_limit(int blocksize)
+{
+ unsigned space = blocksize - sizeof(struct itree_node);
+ return space / sizeof(struct itree_entry);
+}
+
+static void ext4_free_itree_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t block)
+{
+ int flags = EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ ext4_free_blocks(handle, inode, NULL, block, 1, flags);
+}
+
+static struct itree_leaf_head *itree_leaf_get_head(struct buffer_head *leaf)
+{
+ return (struct itree_leaf_head *)leaf->b_data;
+}
+
+static struct itree_leaf_entry *itree_leaf_get_entries(struct buffer_head *leaf)
+{
+ void *entries = (void *)leaf->b_data + sizeof(struct itree_leaf_head);
+ return (struct itree_leaf_entry *)entries;
+}
+
+static void itree_entry_get_key(struct itree_entry *entry,
+ struct itree_key *key)
+{
+ key->inode = le32_to_cpu(entry->inode);
+ key->hash = le32_to_cpu(entry->hash);
+}
+
+static void itree_leaf_entry_get_key(struct itree_leaf_entry *entry,
+ struct itree_key *key)
+{
+ key->inode = le32_to_cpu(entry->dirent.inode);
+ key->hash = le32_to_cpu(entry->hash);
+}
+
+static int itree_key_compare(struct itree_key *one, struct itree_key *two)
+{
+ if (one->inode < two->inode)
+ return -1;
+ if (one->inode > two->inode)
+ return 1;
+
+ if (one->hash < two->hash)
+ return -1;
+ if (one->hash > two->hash)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Returned buffer is locked. The caller is expected to mark it
+ * uptodate and unlock it after it is initialized.
+ */
+static struct buffer_head *ext4_new_itree_block(handle_t *handle,
+ struct inode *inode,
+ ext4_fsblk_t *block, int *err)
+{
+ struct buffer_head *bh;
+ ext4_fsblk_t new_blk;
+
+ new_blk = ext4_new_meta_blocks(handle, inode,
+ ext4_inode_to_goal_block(inode),
+ EXT4_MB_HINT_METADATA, NULL, err);
+ if (!new_blk)
+ return NULL;
+
+ bh = sb_getblk(inode->i_sb, new_blk);
+ if (!bh) {
+ *err = -ENOMEM;
+ goto fail;
+ }
+
+ lock_buffer(bh);
+
+ *err = ext4_journal_get_create_access(handle, bh);
+ if (*err) {
+ unlock_buffer(bh);
+ brelse(bh);
+ goto fail;
+ }
+
+ *block = new_blk;
+ return bh;
+
+fail:
+ ext4_free_itree_block(handle, inode, *block);
+ return NULL;
+}
+
+static u8 itree_get_fullness(int value, int value_max)
+{
+ int fullness = ((value * 255) / value_max) + 1;
+ return fullness > 255 ? 255 : fullness;
+}
+
+static u8 itree_get_leaf_fullness(struct itree_leaf_head *head, int blocksize)
+{
+ int value = le16_to_cpu(head->used_length);
+ int max = blocksize - sizeof(struct itree_leaf_head);
+
+ return itree_get_fullness(value, max);
+}
+
+static u8 itree_get_node_fullness(struct itree_node *node)
+{
+ int value = le16_to_cpu(node->count);
+ int max = le16_to_cpu(node->limit);
+
+ return itree_get_fullness(value, max);
+}
+
+static int itree_update_fullness(handle_t *handle, struct inode *dir,
+ struct itree_frame *frame, u8 fullness)
+{
+ struct buffer_head *bh = frame->bh;
+ struct itree_entry *entry = frame->at;
+ int err;
+
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ return err;
+
+ entry->fullness = fullness;
+
+ err = ext4_handle_dirty_metadata(handle, dir, bh);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static struct itree_frame *itree_probe(struct itree_key *key, struct inode *dir,
+ ext4_fsblk_t itree_root_blk,
+ struct itree_frame *frame_in, int *err)
+{
+ struct itree_frame *frame = frame_in;
+ struct buffer_head *bh;
+ struct itree_node *node;
+ struct itree_entry *entries, *at, *p, *q, *m;
+ struct itree_key entry_key;
+ unsigned count, indirect;
+
+ bh = sb_bread(dir->i_sb, itree_root_blk);
+ if (!bh) {
+ *err = -EIO;
+ return NULL;
+ }
+
+ /* TODO Verify checksum */
+
+ node = (struct itree_node *)bh->b_data;
+ if (node->indirect_levels >= ITREE_MAX_DEPTH) {
+ ext4_warning(dir->i_sb, "Too many levels in itree.");
+ return NULL;
+ }
+
+ while (1) {
+ frame->bh = NULL;
+ node = (struct itree_node *)bh->b_data;
+ entries = node->entries;
+ count = le16_to_cpu(node->count);
+ indirect = node->indirect_levels;
+
+ if (key->inode < le32_to_cpu(entries[0].inode)) {
+ at = entries;
+ } else {
+ p = entries;
+ q = entries + count - 1;
+ while (p <= q) {
+ m = p + (q - p)/2;
+ itree_entry_get_key(m, &entry_key);
+ if (itree_key_compare(&entry_key, key) > 0)
+ q = m - 1;
+ else
+ p = m + 1;
+ }
+ at = p - 1;
+ }
+
+ while ((at->flags & ITREE_NODE_FL_CONT) && at > entries &&
+ le32_to_cpu(at->inode) == key->inode)
+ at--;
+
+ frame->bh = bh;
+ frame->entries = entries;
+ frame->at = at;
+
+ if (!indirect--)
+ return frame;
+
+ bh = sb_bread(dir->i_sb, le64_to_cpu(at->block));
+ if (!bh)
+ goto fail;
+
+ frame++;
+ }
+
+ return frame;
+
+fail:
+ while (frame >= frame_in) {
+ brelse(frame->bh);
+ frame--;
+ }
+ return NULL;
+}
+
+/*
+ * frames must be an array of at least two
+ */
+static void itree_release_frames(struct itree_frame *frames)
+{
+ int n;
+
+ for (n = 0; n < ITREE_MAX_DEPTH; n++)
+ if (frames[n].bh)
+ brelse(frames[n].bh);
+}
+
+static unsigned itree_leaf_entry_len(unsigned rec_len)
+{
+ return rec_len + ITREE_LE_HEAD_LEN;
+}
+
+static unsigned itree_leaf_entry_min_len(struct itree_leaf_entry *le)
+{
+ return itree_leaf_entry_len(EXT4_DIR_REC_LEN(le->dirent.name_len));
+}
+
+static int itree_leaf_entry_get_len(struct itree_leaf_entry *le,
+ long int blocksize)
+{
+ return sizeof(((struct itree_leaf_entry *)0)->hash) +
+ ext4_rec_len_from_disk(le->dirent.rec_len, blocksize);
+}
+
+static struct itree_leaf_entry
+*itree_next_leaf_entry(struct itree_leaf_entry *le, long int blocksize)
+{
+ BUG_ON(!itree_leaf_entry_get_len(le, blocksize));
+
+ return (struct itree_leaf_entry *)((void *)le +
+ itree_leaf_entry_get_len(le, blocksize));
+}
+
+struct itree_leaf_map {
+ struct itree_leaf_head *head;
+
+ struct itree_leaf_entry *first;
+ struct itree_leaf_entry *last;
+ struct itree_leaf_entry *at;
+ struct itree_leaf_entry *free;
+
+ struct itree_leaf_entry *before_split;
+ struct itree_leaf_entry *split_at;
+
+ int used_length1;
+ int used_length2;
+};
+
+static void scan_sorted_buf(struct itree_key *key, struct dentry *dentry,
+ struct buffer_head *bh, int blocksize,
+ struct itree_leaf_map *leaf_map)
+{
+ struct itree_leaf_head *lh;
+ struct itree_leaf_entry *le, *prev = 0;
+ struct ext4_dir_entry_2 *de;
+ struct itree_key le_key;
+ int rlen, req_rlen, min_rlen, size = 0;
+ int bufsize, len;
+
+ memset(leaf_map, 0, sizeof(struct itree_leaf_map));
+
+ lh = leaf_map->head = itree_leaf_get_head(bh);
+
+ req_rlen = itree_leaf_entry_len(EXT4_DIR_REC_LEN(dentry->d_name.len));
+ bufsize = blocksize - sizeof(struct itree_leaf_head);
+
+ le = itree_leaf_get_entries(bh);
+ leaf_map->first = leaf_map->at = le;
+
+ itree_leaf_for_each(le, de, lh, blocksize) {
+ itree_leaf_entry_get_key(le, &le_key);
+
+ if (le_key.inode && itree_key_compare(&le_key, key) <= 0)
+ leaf_map->at = le;
+
+ /* Identify a split point in case we'll need to split */
+ if (!leaf_map->split_at) {
+ rlen = itree_leaf_entry_get_len(le, blocksize);
+ if (size + rlen/2 >= bufsize/2) {
+ leaf_map->before_split = leaf_map->last;
+ leaf_map->split_at = le;
+ }
+ size += rlen;
+ }
+
+ if (le_key.inode) {
+ len = itree_leaf_entry_min_len(le);
+ if (!leaf_map->split_at)
+ leaf_map->used_length1 += len;
+ else
+ leaf_map->used_length2 += len;
+ }
+
+ /* XXX Maybe search for the smallest available free space? */
+ if (!leaf_map->free) {
+ min_rlen = 0;
+ if (le_key.inode) /* inode != 0 */
+ min_rlen = EXT4_DIR_REC_LEN(de->name_len);
+
+ rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
+ if ((rlen - min_rlen) >= req_rlen)
+ leaf_map->free = le;
+ }
+
+ prev = leaf_map->last;
+ leaf_map->last = le;
+ }
+
+ /*
+ * When adding at end of the block, we're probably adding a
+ * continuous sequence of inodes. Make the split at the end
+ * of the block.
+ */
+ if (leaf_map->at == leaf_map->last) {
+ leaf_map->before_split = prev;
+ leaf_map->split_at = leaf_map->last;
+
+ len = itree_leaf_entry_min_len(leaf_map->last);
+ if (leaf_map->used_length2)
+ leaf_map->used_length1 += leaf_map->used_length2 - len;
+ leaf_map->used_length2 = len;
+ }
+}
+
+static int itree_next_frame(struct inode *dir, u32 ino,
+ struct itree_frame *frames,
+ struct itree_frame *frame_in,
+ int always)
+{
+ int count, levels_crossed = 0;
+ struct itree_frame *frame = frame_in;
+ struct buffer_head *bh;
+ struct itree_node *node;
+ struct itree_entry *node_end, *at = NULL;
+
+ while (frame >= frames) {
+ bh = frame->bh;
+ node = (struct itree_node *)bh->b_data;
+ count = le16_to_cpu(node->count);
+ node_end = &(frame->entries[count]);
+
+ at = frame->at + 1;
+ if (at < node_end)
+ break;
+
+ if (frame == frames)
+ return 1;
+
+ levels_crossed++;
+ frame--;
+ }
+
+ if (!always && (at->inode > ino ||
+ !(at->flags & ITREE_NODE_FL_CONT)))
+ return 1;
+
+ frame->at = at;
+
+ while (levels_crossed--) {
+ bh = sb_bread(dir->i_sb, le64_to_cpu(frame->at->block));
+ if (!bh)
+ return -EIO;
+
+ /* TODO: Checksum */
+
+ frame++;
+ brelse(frame->bh);
+ frame->bh = bh;
+ node = (struct itree_node *)bh->b_data;
+ frame->entries = node->entries;
+ frame->at = node->entries;
+ }
+ return 0;
+}
+
+/*
+ * Arrange the dirents to the two new itree blocks in the order
+ * specified by the map. Returns 1 in case the split occured within
+ * a collision, otherwise 0.
+ */
+static int itree_arrange_dirents(struct ext4_dir_entry_2 *dirents,
+ struct dx_hash_info *hinfo,
+ void *leaf1, void *leaf2, int blocksize)
+{
+ struct dx_map_entry *map, *split_point;
+ struct ext4_dir_entry_2 *de;
+ struct itree_leaf_head *head1, *head2;
+ struct itree_leaf_entry *entry = NULL;
+ unsigned split, rec_len = 0;
+ void *from, *to, *buf_end;
+ void *data1, *data2;
+ int size1 = 0, size2 = 0, *size = &size1;
+ int count, retval;
+
+ head1 = (struct itree_leaf_head *)leaf1;
+ head2 = (struct itree_leaf_head *)leaf2;
+
+ data1 = leaf1 + sizeof(struct itree_leaf_head);
+ data2 = leaf2 + sizeof(struct itree_leaf_head);
+
+ map = (struct dx_map_entry *)(leaf2 + blocksize);
+ count = dx_make_map(dirents, blocksize, hinfo, map);
+ map -= count;
+ dx_sort_map(inode_order, map, count);
+
+ split = dx_map_split_point(map, count, blocksize);
+ split_point = map + split;
+
+ /* Did we split a collision? */
+ retval = (map[split - 1].inode == map[split].inode) &&
+ (map[split - 1].hash == map[split].hash);
+
+ from = (void *)dirents;
+ to = data1;
+ while (count--) {
+ entry = (struct itree_leaf_entry *)to;
+ entry->hash = cpu_to_le32(map->hash);
+
+ de = (struct ext4_dir_entry_2 *)(from + (map->offs<<2));
+ rec_len = EXT4_DIR_REC_LEN(de->name_len);
+ memcpy(&(entry->dirent), de, rec_len);
+ entry->dirent.rec_len = ext4_rec_len_to_disk(rec_len,
+ blocksize);
+
+ /*FIXME sizeof(leaf_entry_head!!)*/
+ *size += (rec_len + sizeof(__u32));
+
+ if (++map == split_point) {
+ buf_end = leaf1 + blocksize;
+ rec_len = buf_end - (void *)(&entry->dirent);
+ entry->dirent.rec_len = ext4_rec_len_to_disk(rec_len,
+ blocksize);
+ to = data2;
+ size = &size2;
+ } else {
+ to = itree_next_leaf_entry(entry, blocksize);
+ }
+ }
+
+ head1->used_length = cpu_to_le16(size1);
+ head2->used_length = cpu_to_le16(size2);
+
+ buf_end = leaf2 + blocksize;
+ rec_len = buf_end - (void *)(&(entry->dirent));
+ entry->dirent.rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+
+ return retval;
+}
+
+static int itree_init(handle_t *handle, struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct ext4_dir_entry_2 *dirents,
+ struct dentry *dentry, struct inode *inode,
+ ext4_fsblk_t *root_block)
+{
+ struct buffer_head *bh, *bh1, *bh2, *target_bh;
+ struct itree_leaf_head *head1, *head2;
+ ext4_fsblk_t leaf_blk1, leaf_blk2;
+ struct itree_node *root;
+ struct itree_key key2, key;
+ char *leaf1, *leaf2;
+ int err, blocksize, continued = 0;
+ struct itree_leaf_entry *le;
+ struct itree_leaf_map leaf_map;
+
+ blocksize = dir->i_sb->s_blocksize;
+
+ bh = ext4_new_itree_block(handle, dir, root_block, &err);
+ if (!bh)
+ goto out;
+
+ root = (struct itree_node *)(bh->b_data);
+ root->checksum = 0;
+ root->indirect_levels = 0;
+ root->count = 0;
+ root->limit = cpu_to_le16(get_itree_node_limit(blocksize));
+
+ /* Initialize the two dirent leaf blocks. */
+ bh1 = ext4_new_itree_block(handle, dir, &leaf_blk1, &err);
+ if (!bh1)
+ goto free_bh;
+ head1 = (struct itree_leaf_head *)bh1->b_data;
+ leaf1 = bh1->b_data;
+
+ bh2 = ext4_new_itree_block(handle, dir, &leaf_blk2, &err);
+ if (!bh2)
+ goto free_bh1;
+ head2 = (struct itree_leaf_head *)bh2->b_data;
+ leaf2 = bh2->b_data;
+
+ continued = itree_arrange_dirents(dirents, hinfo, leaf1, leaf2,
+ blocksize);
+
+ le = itree_leaf_get_entries(bh2);
+ itree_leaf_entry_get_key(le, &key2);
+
+ root->count = cpu_to_le16(2);
+ root->entries[0].inode = 0;
+ root->entries[0].hash = 0;
+ root->entries[0].block = cpu_to_le64(leaf_blk1);
+ root->entries[0].fullness = itree_get_leaf_fullness(head1, blocksize);
+ root->entries[0].flags = 0;
+
+ root->entries[1].inode = cpu_to_le32(key2.inode);
+ root->entries[1].hash = cpu_to_le32(key2.hash);
+ root->entries[1].block = cpu_to_le64(leaf_blk2);
+ root->entries[1].fullness = itree_get_leaf_fullness(head2, blocksize);
+ root->entries[1].flags = 0;
+
+ if (continued)
+ root->entries[1].flags |= ITREE_NODE_FL_CONT;
+
+ ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+ key.inode = inode->i_ino;
+ key.hash = hinfo->hash;
+
+ /* Insert the new entry to itree */
+ target_bh = itree_key_compare(&key, &key2) <= 0 ? bh1 : bh2;
+
+ scan_sorted_buf(&key, dentry, target_bh, blocksize, &leaf_map);
+ put_entry_to_sorted_buf(&key, dentry, inode, target_bh, blocksize,
+ &leaf_map);
+
+ /* TODO: Set checksums */
+
+ set_buffer_uptodate(bh2);
+ unlock_buffer(bh2);
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ if (err)
+ goto free_bh1;
+ brelse(bh2);
+
+ set_buffer_uptodate(bh1);
+ unlock_buffer(bh1);
+ err = ext4_handle_dirty_metadata(handle, dir, bh1);
+ if (err)
+ goto free_bh;
+ brelse(bh1);
+
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ err = ext4_handle_dirty_metadata(handle, dir, bh);
+ if (err)
+ goto out;
+ brelse(bh);
+
+ return 0;
+
+/*free_bh2:
+ unlock_buffer(bh2);
+ brelse(bh2);
+ ext4_free_itree_block(handle, dir, leaf_blk2);*/
+free_bh1:
+ unlock_buffer(bh1);
+ brelse(bh1);
+ ext4_free_itree_block(handle, dir, leaf_blk1);
+free_bh:
+ unlock_buffer(bh);
+ brelse(bh);
+ ext4_free_itree_block(handle, dir, *root_block);
+out:
+ return err;
+}
+
+#ifdef ITREE_DEBUG
+static void itree_show_node(struct buffer_head *bh)
+{
+ struct itree_node *node = (struct itree_node *)bh->b_data;
+ struct itree_entry *e;
+ int n = 0, i;
+
+ static const char * const indent[] = {" ", " ",
+ " "};
+ i = node->indirect_levels;
+ BUG_ON(i < 0 || i > 2);
+
+ printk(KERN_DEBUG
+ "%s[itree node] count=%u, limit=%u, indirect=%u, checksum=%u\n",
+ indent[i], le16_to_cpu(node->count), le16_to_cpu(node->limit),
+ node->indirect_levels, le32_to_cpu(node->checksum));
+
+ for (e = node->entries; e < (node->entries + node->count); e++)
+ printk(KERN_DEBUG "%s [%d] inode=%u, hash=%u, "
+ "block=%llu, flags=%x, fullness=%u\n", indent[i], n++,
+ le32_to_cpu(e->inode), le32_to_cpu(e->hash),
+ le64_to_cpu(e->block), e->flags, e->fullness);
+}
+
+static void itree_show_leaf(struct inode *dir, struct buffer_head *bh,
+ ext4_fsblk_t *block)
+{
+ int blocksize = dir->i_sb->s_blocksize;
+ struct ext4_dir_entry_2 *de;
+ struct itree_leaf_entry *le;
+ struct itree_leaf_head *lh;
+ int num_entries = 0;
+ int size = 0, req_size = 0;
+ u32 first_inode, last_inode = 0;
+
+ lh = (struct itree_leaf_head *)bh->b_data;
+ le = (struct itree_leaf_entry *)(lh + 1);
+ first_inode = le32_to_cpu(le->dirent.inode);
+
+ itree_leaf_for_each(le, de, lh, blocksize) {
+ size += itree_leaf_entry_get_len(le, blocksize);
+ req_size += itree_leaf_entry_min_len(le);
+
+ num_entries++;
+ last_inode = le32_to_cpu(de->inode);
+ }
+
+ if (block)
+ printk(KERN_DEBUG " [itree leaf@%llu] ", *block);
+ else
+ printk(KERN_DEBUG " [itree leaf] ");
+ printk(KERN_DEBUG "checksum(%u), first_ino(%u), last_ino(%u), "
+ "entries(%u), fullness(%d%%), used_length(%u)\n",
+ le32_to_cpu(lh->checksum), first_inode, last_inode, num_entries,
+ (req_size*100)/size, le16_to_cpu(lh->used_length));
+
+ if (0) { /* Print entries */
+ printk(KERN_DEBUG " entries: ");
+ itree_leaf_for_each(le, de, lh, blocksize) {
+ le = container_of(de, struct itree_leaf_entry, dirent);
+ printk(KERN_DEBUG "%u:%u(%u), ",
+ le32_to_cpu(de->inode), le32_to_cpu(le->hash),
+ ext4_rec_len_from_disk(de->rec_len, blocksize));
+ }
+ printk(KERN_DEBUG "\n");
+ }
+}
+
+static void itree_show(struct inode *dir, ext4_fsblk_t block)
+{
+ struct buffer_head *bh, *leaf;
+ struct itree_node *node;
+ struct itree_entry *e;
+ int level;
+ ext4_fsblk_t next;
+
+ bh = sb_bread(dir->i_sb, block);
+ if (!bh)
+ return;
+
+ itree_show_node(bh);
+
+ node = (struct itree_node *)bh->b_data;
+ level = le16_to_cpu(node->indirect_levels);
+
+ for (e = node->entries; e < (node->entries + node->count); e++) {
+ next = le64_to_cpu(e->block);
+ if (level) {
+ itree_show(dir, next);
+ } else {
+ leaf = sb_bread(dir->i_sb, next);
+ if (!leaf)
+ return;
+ itree_show_leaf(dir, leaf, &next);
+ brelse(leaf);
+ }
+
+ }
+ brelse(bh);
+}
+#endif
--
1.7.11.7
^ permalink raw reply related [flat|nested] 10+ messages in thread* [RFC v2 6/9] ext4: Adding itree implementation II - Inserting
2013-05-13 15:42 [RFC v2 0/9] ext4: An Auxiliary Tree for the Directory Index Radek Pazdera
` (4 preceding siblings ...)
2013-05-13 15:42 ` [RFC v2 5/9] ext4: Adding itree implementation I - Core Radek Pazdera
@ 2013-05-13 15:42 ` Radek Pazdera
2013-05-13 15:42 ` [RFC v2 7/9] ext4: Adding itree implementation III - Deleting Radek Pazdera
` (2 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Radek Pazdera @ 2013-05-13 15:42 UTC (permalink / raw)
To: linux-ext4; +Cc: lczerner, kasparek, Radek Pazdera
This commit contains functions that are related to inserting entries
to the inode tree including the functions that handle the node splits.
Signed-off-by: Radek Pazdera <rpazdera@redhat.com>
---
fs/ext4/namei.c | 617 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 617 insertions(+)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1f9ea5b..af350af 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -332,6 +332,10 @@ static int itree_init(handle_t *handle, struct inode *dir,
struct dentry *entry, struct inode *inode,
ext4_fsblk_t *root_block);
+static int itree_add_entry(handle_t *handle, struct inode *dir,
+ ext4_fsblk_t root_blk, struct dentry *entry,
+ struct inode *inode, u32 hash);
+
static int itree_next_frame(struct inode *dir, u32 ino,
struct itree_frame *frames,
struct itree_frame *frame_in,
@@ -3688,6 +3692,68 @@ static struct itree_leaf_entry
itree_leaf_entry_get_len(le, blocksize));
}
+static struct itree_leaf_entry *make_space_before(struct itree_leaf_entry *le,
+ int blocksize)
+{
+ int min_len = itree_leaf_entry_min_len(le);
+ int rec_len = EXT4_DIR_REC_LEN(le->dirent.name_len);
+ int len = itree_leaf_entry_get_len(le, blocksize);
+ int new_rlen;
+
+ le->dirent.rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+ memmove((void *)le + len - min_len, le, min_len);
+
+ new_rlen = len - min_len - ITREE_LE_HEAD_LEN;
+ le->dirent.rec_len = ext4_rec_len_to_disk(new_rlen, blocksize);
+
+ return le;
+}
+
+static struct itree_leaf_entry *make_space_after(struct itree_leaf_entry *le,
+ int blocksize)
+{
+ struct itree_leaf_entry *new;
+ int len = itree_leaf_entry_get_len(le, blocksize);
+ int min_len = itree_leaf_entry_min_len(le);
+ int rec_len = EXT4_DIR_REC_LEN(le->dirent.name_len);
+ int new_rlen;
+
+ new = (struct itree_leaf_entry *)((void *)le + min_len);
+ le->dirent.rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+
+ new_rlen = len - min_len - ITREE_LE_HEAD_LEN;
+ new->dirent.rec_len = ext4_rec_len_to_disk(new_rlen, blocksize);
+ return new;
+}
+
+static int itree_insert_dentry(struct itree_key *key, struct dentry *dentry,
+ struct inode *inode, struct itree_leaf_entry *le,
+ int blocksize)
+{
+ struct itree_leaf_entry *new;
+ struct itree_key old_key;
+
+ itree_leaf_entry_get_key(le, &old_key);
+
+ new = le;
+ if (le->dirent.inode) {
+ if (itree_key_compare(key, &old_key) < 0)
+ new = make_space_before(le, blocksize);
+ else
+ new = make_space_after(le, blocksize);
+ }
+
+ new->dirent.file_type = EXT4_FT_UNKNOWN;
+ new->dirent.inode = cpu_to_le32(inode->i_ino);
+ ext4_set_de_type(inode->i_sb, &(new->dirent), inode->i_mode);
+ new->dirent.name_len = dentry->d_name.len;
+ memcpy(new->dirent.name, dentry->d_name.name, dentry->d_name.len);
+
+ new->hash = cpu_to_le32(key->hash);
+
+ return itree_leaf_entry_min_len(new);
+}
+
struct itree_leaf_map {
struct itree_leaf_head *head;
@@ -3703,6 +3769,73 @@ struct itree_leaf_map {
int used_length2;
};
+static int put_entry_to_sorted_buf(struct itree_key *key,
+ struct dentry *dentry, struct inode *inode,
+ struct buffer_head *bh, int blocksize,
+ struct itree_leaf_map *leaf_map)
+{
+ void *at_end, *from, *to;
+ int rlen, req_rlen, at_rec_len, free_rec_len, free_space;
+ int at_len, free_len;
+ struct itree_leaf_entry *at, *free;
+ struct itree_leaf_head *head = itree_leaf_get_head(bh);
+
+ at = leaf_map->at;
+ free = leaf_map->free;
+
+ at_rec_len = ext4_rec_len_from_disk(at->dirent.rec_len, blocksize);
+ free_rec_len = ext4_rec_len_from_disk(free->dirent.rec_len, blocksize);
+
+ at_len = itree_leaf_entry_get_len(at, blocksize);
+ free_len = itree_leaf_entry_get_len(free, blocksize);
+
+ at_end = (void *)at + at_len;
+
+ to = (void *)free + free_len;
+ from = (void *)free;
+ if (free->dirent.inode)
+ from += itree_leaf_entry_min_len(free);
+
+ req_rlen = itree_leaf_entry_len(EXT4_DIR_REC_LEN(dentry->d_name.len));
+ head->used_length = cpu_to_le16(le16_to_cpu(head->used_length) +
+ req_rlen);
+
+ /*
+ * Don't copy anything if there is enough space at the
+ * right spot
+ */
+ free_space = at_len;
+ if (at->dirent.inode)
+ free_space -= itree_leaf_entry_min_len(at);
+ if (free_space >= req_rlen)
+ return itree_insert_dentry(key, dentry, inode, at, blocksize);
+
+ /* In case there is an unused entry directly following 'at' */
+ if (at_end == from)
+ return itree_insert_dentry(key, dentry, inode, at, blocksize);
+
+ /* Fix free rec_len */
+ rlen = free_rec_len;
+ if (le32_to_cpu(free->dirent.inode))
+ free->dirent.rec_len = ext4_rec_len_to_disk(rlen - (to - from),
+ blocksize);
+
+ /* Which part of memory we'll need to move */
+ if (at_end > to) {
+ memmove(from, to, at_end - to);
+ at = (struct itree_leaf_entry *)((void *)at - (to - from));
+ } else {
+ memmove(at_end + (to - from), at_end, from - at_end);
+ }
+
+ /* Fix at rec_len */
+ rlen = at_rec_len;
+ at->dirent.rec_len = ext4_rec_len_to_disk(rlen + (to - from),
+ blocksize);
+
+ return itree_insert_dentry(key, dentry, inode, at, blocksize);
+}
+
static void scan_sorted_buf(struct itree_key *key, struct dentry *dentry,
struct buffer_head *bh, int blocksize,
struct itree_leaf_map *leaf_map)
@@ -3779,6 +3912,490 @@ static void scan_sorted_buf(struct itree_key *key, struct dentry *dentry,
}
}
+/*
+ * Used during node splits to test whether a split occured
+ * in the middle of a key collision.
+ */
+static int itree_is_continuation(struct itree_entry *a, struct itree_entry *b)
+{
+ return (a->inode == b->inode && a->hash == b->hash) ||
+ b->flags & ITREE_NODE_FL_CONT;
+}
+
+/*
+ * Returns non-zero if the node can be split, zero otherwise
+ */
+static int itree_can_split(struct itree_frame *frame,
+ struct itree_frame *frames)
+{
+ int count, limit;
+ struct itree_node *node;
+
+ while (frame >= frames) {
+ node = (struct itree_node *)frame->bh->b_data;
+ count = le16_to_cpu(node->count);
+ limit = le16_to_cpu(node->limit);
+
+ if (count < limit)
+ return 1;
+ frame--;
+ }
+ node = (struct itree_node *)frames->bh->b_data;
+ return node->indirect_levels < (ITREE_MAX_DEPTH - 1);
+}
+
+static struct itree_entry *itree_node_make_space(struct itree_node *node,
+ struct itree_entry *old)
+{
+ struct itree_entry *new = old + 1;
+ int len, count;
+
+ count = le16_to_cpu(node->count);
+ len = (void *)(node->entries + count) - (void *)new;
+ memmove(new + 1, new, len);
+ node->count = cpu_to_le16(++count);
+ return new;
+}
+
+static void itree_store_entry(struct itree_entry *new, struct itree_key *key,
+ ext4_fsblk_t block, int continuation)
+{
+ new->inode = cpu_to_le32(key->inode);
+ new->hash = cpu_to_le32(key->hash);
+ new->block = cpu_to_le64(block);
+
+ new->flags = 0;
+ if (continuation)
+ new->flags |= ITREE_NODE_FL_CONT;
+}
+
+static struct buffer_head *itree_node_split(handle_t *handle, struct inode *dir,
+ struct itree_frame *frame,
+ struct itree_entry **old,
+ struct itree_entry **new,
+ int *err)
+{
+ struct buffer_head *new_bh, *bh = frame->bh;
+ struct itree_node *node = (struct itree_node *)bh->b_data, *new_node;
+ int blocksize = dir->i_sb->s_blocksize;
+ int split, count, at;
+ ext4_fsblk_t new_blk;
+
+ BUFFER_TRACE(bh, "get_write_access");
+ *err = ext4_journal_get_write_access(handle, bh);
+ if (*err)
+ return NULL;
+
+ new_bh = ext4_new_itree_block(handle, dir, &new_blk, err);
+ if (!new_bh)
+ return NULL;
+
+ new_node = (struct itree_node *)new_bh->b_data;
+ new_node->indirect_levels = node->indirect_levels;
+ new_node->limit = cpu_to_le16(get_itree_node_limit(blocksize));
+
+ count = le16_to_cpu(node->count);
+
+ /* Don't split, just append if adding to the end */
+ if (frame->at == (node->entries + count - 1)) {
+ new_node->count = cpu_to_le16(1);
+ *old = node->entries + count - 1;
+ *new = new_node->entries;
+ return new_bh;
+ }
+
+ /* Can't split a block with a single entry */
+ BUG_ON(count <= 1);
+
+ split = count / 2;
+ memcpy(new_node->entries, node->entries + split,
+ (count - split) * sizeof(struct itree_entry));
+ node->count = le16_to_cpu(split);
+ new_node->count = le16_to_cpu(count - split);
+
+ at = frame->at - frame->entries;
+ if (at >= split) {
+ *old = new_node->entries + at - split;
+ *new = itree_node_make_space(new_node, *old);
+ } else {
+ *old = frame->at;
+ *new = itree_node_make_space(node, *old);
+ }
+
+ return new_bh;
+}
+
+static int itree_add_level(handle_t *handle, struct inode *dir,
+ struct itree_frame *frame, struct itree_key *key,
+ ext4_fsblk_t block, int continuation, int len1,
+ int len2)
+{
+ struct buffer_head *bh1, *bh2;
+ struct itree_entry *old, *new;
+ ext4_fsblk_t new_blk1, new_blk2;
+ struct itree_node *root, *node1, *node2;
+ struct itree_key key1, key2;
+ int limit, count, size, err;
+
+ bh1 = ext4_new_itree_block(handle, dir, &new_blk1, &err);
+ if (!bh1)
+ return err;
+
+ bh2 = itree_node_split(handle, dir, frame, &old, &new, &err);
+ if (!bh2) {
+ unlock_buffer(bh1);
+ brelse(bh1);
+ ext4_free_itree_block(handle, dir, new_blk1);
+ return err;
+ }
+ new_blk2 = bh2->b_blocknr;
+
+ itree_store_entry(new, key, block, continuation);
+
+ root = (struct itree_node *)frame->bh->b_data;
+ count = le16_to_cpu(root->count);
+ limit = le16_to_cpu(root->limit);
+
+ old->fullness = itree_get_fullness(len1, limit);
+ new->fullness = itree_get_fullness(len2, limit);
+
+ size = sizeof(struct itree_node) + count * sizeof(struct itree_entry);
+ memcpy(bh1->b_data, root, size);
+
+ node1 = (struct itree_node *)bh1->b_data;
+ node2 = (struct itree_node *)bh2->b_data;
+
+ continuation = itree_is_continuation(node1->entries + count - 1,
+ node2->entries);
+
+ itree_entry_get_key(node1->entries, &key1);
+ itree_entry_get_key(node2->entries, &key2);
+
+ len1 = le16_to_cpu(node1->count);
+ len2 = le16_to_cpu(node2->count);
+
+ root->indirect_levels++;
+ root->count = cpu_to_le16(2);
+
+ itree_store_entry(root->entries, &key1, new_blk1, 0);
+ root->entries->fullness = itree_get_fullness(len1, limit);
+
+ itree_store_entry(root->entries + 1, &key2, new_blk2, continuation);
+ root->entries[1].fullness = itree_get_fullness(len2, limit);
+
+ set_buffer_uptodate(bh1);
+ unlock_buffer(bh1);
+ err = ext4_handle_dirty_metadata(handle, dir, bh1);
+ if (err)
+ return err; /* FIXME Free everything? */
+ brelse(bh1);
+
+ set_buffer_uptodate(bh2);
+ unlock_buffer(bh2);
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ if (err)
+ return err; /* FIXME Free everything? */
+ brelse(bh2);
+
+ err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
+ if (err)
+ return err; /* FIXME Free everything? */
+
+ return 0;
+}
+
+static int itree_node_insert_entry(handle_t *handle, struct inode *dir,
+ struct itree_key *key_in, ext4_fsblk_t block,
+ int continuation, struct itree_frame *frames,
+ struct itree_frame *frame,
+ int len1, int len2)
+{
+ struct buffer_head *bh1, *bh2;
+ struct itree_node *node, *node1, *node2;
+ struct itree_entry *old, *new;
+ struct itree_key key = *key_in;
+ int count, limit, err, max, levels, fullness;
+ int blocksize = dir->i_sb->s_blocksize;
+
+ while (frame >= frames) {
+ old = frame->at;
+ new = old + 1;
+ node = (struct itree_node *)frame->bh->b_data;
+ count = le16_to_cpu(node->count);
+ limit = le16_to_cpu(node->limit);
+ levels = node->indirect_levels;
+
+ if (levels)
+ max = le16_to_cpu(node->limit);
+ else
+ max = blocksize - sizeof(struct itree_leaf_head);
+
+ /* No need for split */
+ if (count < limit) {
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ return err;
+
+ new = itree_node_make_space(node, old);
+ itree_store_entry(new, &key, block, continuation);
+ old->fullness = itree_get_fullness(len1, max);
+ new->fullness = itree_get_fullness(len2, max);
+
+ err = ext4_handle_dirty_metadata(handle, dir,
+ frame->bh);
+ if (err)
+ return err;
+
+ if (frame - 1 >= frames) {
+ fullness = itree_get_node_fullness(node);
+ err = itree_update_fullness(handle, dir,
+ frame-1, fullness);
+ }
+
+ return err;
+ }
+
+ if (frame > frames) {
+ bh1 = frame->bh;
+ bh2 = itree_node_split(handle, dir, frame, &old, &new,
+ &err);
+ if (!bh2)
+ return err;
+
+ itree_store_entry(new, &key, block, continuation);
+ old->fullness = itree_get_fullness(len1, max);
+ new->fullness = itree_get_fullness(len2, max);
+
+ node1 = (struct itree_node *)bh1->b_data;
+ node2 = (struct itree_node *)bh2->b_data;
+
+ /* Values to add to the parent */
+ len1 = le16_to_cpu(node1->count);
+ len2 = le16_to_cpu(node2->count);
+
+ continuation = itree_is_continuation(node1->entries +
+ len1 - 1,
+ node2->entries);
+
+ itree_entry_get_key(node2->entries, &key);
+ block = bh2->b_blocknr;
+
+ set_buffer_uptodate(bh2);
+ unlock_buffer(bh2);
+
+ err = ext4_handle_dirty_metadata(handle, dir, bh1);
+ if (err) {
+ ext4_std_error(dir->i_sb, err);
+ return err;
+ }
+
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ if (err) {
+ ext4_std_error(dir->i_sb, err);
+ return err;
+ }
+ brelse(bh2);
+ } else if (frame == frames && levels < (ITREE_MAX_DEPTH - 1)) {
+ return itree_add_level(handle, dir, frame, &key, block,
+ continuation, len1, len2);
+ } else {
+ return -ENOSPC;
+ }
+
+ frame--;
+ }
+ return 0;
+}
+
+static struct buffer_head *
+itree_find_leaf_entry(struct inode *dir, struct itree_key *key,
+ struct dentry *dentry, struct itree_frame *frames,
+ struct itree_frame *frame,
+ struct itree_leaf_map *lm, int *err)
+{
+ int retval, blocksize = dir->i_sb->s_blocksize;
+ struct buffer_head *bh;
+
+ while (1) {
+ *err = -EIO;
+ bh = sb_bread(dir->i_sb, le64_to_cpu(frame->at->block));
+ if (!bh)
+ return NULL;
+
+ /* TODO: Verify leaf checksum */
+
+ scan_sorted_buf(key, dentry, bh, blocksize, lm);
+ if (lm->at != lm->last)
+ break;
+
+ retval = itree_next_frame(dir, key->inode, frames, frame, 0);
+ if (retval > 0)
+ break;
+
+ brelse(bh);
+ *err = retval;
+ if (retval < 0)
+ return NULL;
+ }
+ *err = 0;
+ return bh;
+}
+
+static int itree_add_entry(handle_t *handle, struct inode *dir,
+ ext4_fsblk_t root_blk, struct dentry *entry,
+ struct inode *inode, u32 hash)
+{
+ /* This will be called from ext4_dx_add_entry() */
+ struct itree_frame frames[ITREE_MAX_DEPTH];
+ struct itree_frame *frame;
+ struct buffer_head *bh, *bh2 = NULL, *target_bh;
+ ext4_fsblk_t new_block;
+ int err, continued = 0;
+ int new_len, len1, len2;
+
+ struct itree_leaf_entry *last, *first_new;
+ struct itree_leaf_head *head, *head2;
+ struct itree_key key, split_key;
+ void *split_point, *buf_end;
+ unsigned blocksize;
+ int new_rlen, last_off, fullness;
+ struct itree_leaf_map lm;
+ __le32 rlen_disk;
+
+ memset(frames, 0, ITREE_MAX_DEPTH*sizeof(struct itree_frame));
+
+ blocksize = dir->i_sb->s_blocksize;
+
+ key.inode = inode->i_ino;
+ key.hash = hash;
+
+ frame = itree_probe(&key, dir, root_blk, frames, &err);
+ if (!frame)
+ return err;
+
+ bh = itree_find_leaf_entry(dir, &key, entry, frames, frame, &lm, &err);
+ if (err)
+ goto out;
+
+ /* TODO Add count to the map and check for that instead */
+ if (lm.before_split && lm.split_at)
+ continued = (lm.before_split->dirent.inode ==
+ lm.split_at->dirent.inode) &&
+ (lm.before_split->hash == lm.split_at->hash);
+
+ buf_end = (void *)bh->b_data + blocksize;
+
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto out;
+
+ if (lm.free) {
+ new_rlen = put_entry_to_sorted_buf(&key, entry, inode, bh,
+ blocksize, &lm);
+
+ head = itree_leaf_get_head(bh);
+ fullness = itree_get_leaf_fullness(head, blocksize);
+ err = itree_update_fullness(handle, dir, frame, fullness);
+ if (err)
+ goto out;
+ } else {
+ err = -ENOSPC;
+ if (!itree_can_split(frame, frames))
+ goto out;
+
+ bh2 = ext4_new_itree_block(handle, dir, &new_block, &err);
+ if (!bh2)
+ goto out;
+
+ split_point = (void *)lm.split_at;
+ memcpy(bh2->b_data + sizeof(struct itree_leaf_head),
+ split_point, buf_end - split_point);
+
+ /* Fix the rec_len of the last entries */
+ new_rlen = buf_end - (void *)(&(lm.before_split->dirent));
+ rlen_disk = ext4_rec_len_to_disk(new_rlen, blocksize);
+ lm.before_split->dirent.rec_len = rlen_disk;
+
+ first_new = (struct itree_leaf_entry *)(bh2->b_data +
+ sizeof(struct itree_leaf_head));
+
+ last_off = (void *)lm.last - split_point;
+ last = (struct itree_leaf_entry *)((void *)first_new +
+ last_off);
+
+ buf_end = (void *)bh2->b_data + blocksize;
+ new_rlen = buf_end - (void *)(&(last->dirent));
+ last->dirent.rec_len = ext4_rec_len_to_disk(new_rlen,
+ blocksize);
+
+ /* TODO: Set checksums */
+
+ len1 = lm.used_length1;
+ len2 = lm.used_length2;
+
+ itree_leaf_entry_get_key(lm.split_at, &split_key);
+
+ head = itree_leaf_get_head(bh);
+ head2 = itree_leaf_get_head(bh2);
+
+ head->used_length = cpu_to_le16(len1);
+ head2->used_length = cpu_to_le16(len2);
+
+ target_bh = bh;
+ if ((void *)lm.at >= split_point)
+ target_bh = bh2;
+
+ scan_sorted_buf(&key, entry, target_bh, blocksize, &lm);
+ new_len = put_entry_to_sorted_buf(&key, entry, inode, target_bh,
+ blocksize, &lm);
+
+ if (target_bh == bh)
+ len1 += new_len;
+ else
+ len2 += new_len;
+
+ /* Add the keys to the itree_frame */
+ err = itree_node_insert_entry(handle, dir, &split_key,
+ new_block, continued, frames,
+ frame, len1, len2);
+ if (err) {
+ brelse(bh);
+ unlock_buffer(bh2);
+ brelse(bh2);
+ ext4_free_itree_block(handle, dir, new_block);
+ goto out;
+ }
+
+ set_buffer_uptodate(bh2);
+ unlock_buffer(bh2);
+ }
+
+ /* See add_dirent_to_buf() */
+ dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ ext4_update_dx_flag(dir);
+ dir->i_version++;
+ ext4_mark_inode_dirty(handle, dir);
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, dir, bh);
+ if (err)
+ ext4_std_error(dir->i_sb, err);
+ brelse(bh);
+
+ if (bh2) {
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ if (err)
+ ext4_std_error(dir->i_sb, err);
+ brelse(bh2);
+ }
+
+ err = 0;
+out:
+ itree_release_frames(frames);
+
+ return err;
+}
+
static int itree_next_frame(struct inode *dir, u32 ino,
struct itree_frame *frames,
struct itree_frame *frame_in,
--
1.7.11.7
^ permalink raw reply related [flat|nested] 10+ messages in thread* [RFC v2 7/9] ext4: Adding itree implementation III - Deleting
2013-05-13 15:42 [RFC v2 0/9] ext4: An Auxiliary Tree for the Directory Index Radek Pazdera
` (5 preceding siblings ...)
2013-05-13 15:42 ` [RFC v2 6/9] ext4: Adding itree implementation II - Inserting Radek Pazdera
@ 2013-05-13 15:42 ` Radek Pazdera
2013-05-13 15:42 ` [RFC v2 8/9] ext4: Make directory operations use itree Radek Pazdera
2013-05-13 15:42 ` [RFC v2 9/9] ext4: Make ext4_readdir() use itree if available Radek Pazdera
8 siblings, 0 replies; 10+ messages in thread
From: Radek Pazdera @ 2013-05-13 15:42 UTC (permalink / raw)
To: linux-ext4; +Cc: lczerner, kasparek, Radek Pazdera
This commit contains functions that are related to the deletion of
entries from the inode tree. This also includes the code related to
coalesce-on-delete.
Signed-off-by: Radek Pazdera <rpazdera@redhat.com>
---
fs/ext4/namei.c | 555 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 555 insertions(+)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index af350af..d5e09eb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -336,6 +336,9 @@ static int itree_add_entry(handle_t *handle, struct inode *dir,
ext4_fsblk_t root_blk, struct dentry *entry,
struct inode *inode, u32 hash);
+static int itree_delete_entry(handle_t *handle, struct inode *dir,
+ struct dentry *entry);
+
static int itree_next_frame(struct inode *dir, u32 ino,
struct itree_frame *frames,
struct itree_frame *frame_in,
@@ -4447,6 +4450,558 @@ static int itree_next_frame(struct inode *dir, u32 ino,
return 0;
}
+static int itree_search_leaf(struct buffer_head *bh, struct inode *dir,
+ u32 ino, u32 hash, const struct qstr *d_name,
+ struct itree_leaf_entry **res_entry,
+ struct itree_leaf_entry **prev_entry)
+{
+ struct itree_leaf_head *lh;
+ struct itree_leaf_entry *le;
+ struct ext4_dir_entry_2 *de;
+ char *data;
+ const char *name = d_name->name;
+ int namelen = d_name->len;
+ int blocksize = dir->i_sb->s_blocksize;
+ __le32 le_ino = cpu_to_le32(ino), le_hash = cpu_to_le32(hash);
+
+ lh = itree_leaf_get_head(bh);
+ le = itree_leaf_get_entries(bh);
+ data = (char *)le;
+
+ *res_entry = NULL;
+ *prev_entry = NULL;
+
+ itree_leaf_for_each(le, de, lh, blocksize) {
+ if (le->hash == le_hash &&
+ de->inode == le_ino &&
+ ext4_match(namelen, name, de)) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh, data,
+ bh->b_size, 0/*offset*/))
+ return -1;
+ *res_entry = le;
+ return 1;
+ }
+ *prev_entry = le;
+ }
+ return 0;
+}
+
+static void itree_erease_leaf_entry(struct inode *dir, struct buffer_head *leaf,
+ struct itree_leaf_entry *entry,
+ struct itree_leaf_entry *prev)
+{
+ struct itree_leaf_head *head = itree_leaf_get_head(leaf);
+ int blocksize = dir->i_sb->s_blocksize;
+ int prev_rec_len, entry_len, old_used_length, used_length;
+
+ if (prev) {
+ prev_rec_len = ext4_rec_len_from_disk(prev->dirent.rec_len,
+ blocksize);
+ entry_len = itree_leaf_entry_get_len(entry, blocksize);
+ prev->dirent.rec_len = ext4_rec_len_to_disk(prev_rec_len +
+ entry_len,
+ blocksize);
+ }
+
+ used_length = itree_leaf_entry_min_len(entry);
+ old_used_length = le16_to_cpu(head->used_length);
+ head->used_length = cpu_to_le16(old_used_length - used_length);
+
+ entry->dirent.inode = 0;
+ dir->i_version++;
+}
+
+/* Returns the packed length of all the entries in the block */
+static int itree_leaf_pack_entries(struct inode *dir, struct buffer_head *leaf,
+ int *last_offset)
+{
+ struct itree_leaf_head *lh;
+ struct itree_leaf_entry *le, *next, *last = NULL, *entries;
+ struct ext4_dir_entry_2 *de;
+ void *new_pos, *buff_end;
+ int blocksize = dir->i_sb->s_blocksize;
+ int new_reclen, old_reclen, entry_len, len = 0;
+
+ *last_offset = 0;
+
+ lh = itree_leaf_get_head(leaf);
+ entries = itree_leaf_get_entries(leaf);
+
+ buff_end = (void *)lh + blocksize;
+ new_pos = (void *)entries;
+ le = entries;
+
+ while ((void *)le < buff_end) {
+ de = &(le->dirent);
+ next = itree_next_leaf_entry(le, blocksize);
+
+ if (!de->inode) {
+ le = next;
+ continue;
+ }
+
+ old_reclen = ext4_rec_len_from_disk(de->rec_len, blocksize);
+ new_reclen = EXT4_DIR_REC_LEN(de->name_len);
+ if (new_reclen < old_reclen)
+ de->rec_len = ext4_rec_len_to_disk(new_reclen,
+ blocksize);
+
+ entry_len = itree_leaf_entry_len(new_reclen);
+ len += entry_len;
+ if ((void *)le != new_pos)
+ memmove(new_pos, le, entry_len);
+ last = (struct itree_leaf_entry *)new_pos;
+ new_pos += entry_len;
+ le = next;
+ }
+
+ lh->used_length = cpu_to_le16(len);
+
+ if (last) {
+ new_reclen = buff_end - (void *)(&(last->dirent));
+ last->dirent.rec_len = ext4_rec_len_to_disk(new_reclen,
+ blocksize);
+ *last_offset = (void *)last - (void *)entries;
+ }
+
+ return len;
+}
+
+/* Decide if we can coalesce and which neighbour will be used. Returns 1 if
+ * coalescing is possible and 0 otherwise. The entry parameters will be
+ * filled with pointers to entries that should be merged, while entry1 is
+ * always a pointer to the first block with smaller keys. */
+static int itree_can_coalesce(struct itree_frame *frame,
+ struct itree_entry **entry1,
+ struct itree_entry **entry2)
+{
+ struct itree_node *node = (struct itree_node *)frame->bh->b_data;
+ struct itree_entry *neighbour;
+ int count = le16_to_cpu(node->count);
+
+ /* Coalesce with the next entry? */
+ neighbour = frame->at + 1;
+ if ((neighbour < (frame->entries + count)) &&
+ ((neighbour->fullness + frame->at->fullness) <= 255)) { /* FIXME */
+ *entry1 = frame->at;
+ *entry2 = neighbour;
+ return 1;
+ }
+
+ /* Coalesce with the previous entry? */
+ neighbour = frame->at - 1;
+ if ((neighbour >= frame->entries) &&
+ ((neighbour->fullness + frame->at->fullness) <= 255)) {
+ *entry1 = neighbour;
+ *entry2 = frame->at;
+ return 1;
+ }
+
+ /* No sir. */
+ return 0;
+}
+
+/*
+ * Move entries from both leaves to the first one. The first block must
+ * contain entries with smaller keys than the second one.
+ *
+ * The function returns the number of bytes used in block1 after the merge.
+ */
+static int itree_merge_leaf_nodes(handle_t *handle, struct inode *dir,
+ struct buffer_head *block1,
+ struct buffer_head *block2)
+{
+ struct itree_leaf_head *head;
+ struct itree_leaf_entry *last;
+ int blocksize = dir->i_sb->s_blocksize;
+ int last_offset1, last_offset2;
+ int len1, len2, rec_len, used_length;
+ void *data1, *data2;
+ int err;
+
+ BUFFER_TRACE(block1, "get_write_access");
+ err = ext4_journal_get_write_access(handle, block1);
+ if (err)
+ return err;
+
+ len1 = itree_leaf_pack_entries(dir, block1, &last_offset1);
+ len2 = itree_leaf_pack_entries(dir, block2, &last_offset2);
+
+ data1 = (void *)itree_leaf_get_entries(block1);
+ data2 = (void *)itree_leaf_get_entries(block2);
+ memcpy(data1 + len1, data2, len2);
+
+ last = (struct itree_leaf_entry *)(data1 + last_offset1);
+ rec_len = EXT4_DIR_REC_LEN(last->dirent.name_len);
+ last->dirent.rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+
+ last = (struct itree_leaf_entry *)(data1 + len1 + last_offset2);
+ rec_len = ((void *)block1->b_data + blocksize) -
+ (void *)(&(last->dirent));
+ last->dirent.rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+
+ head = itree_leaf_get_head(block2);
+ used_length = le16_to_cpu(head->used_length);
+
+ head = itree_leaf_get_head(block1);
+ used_length += le16_to_cpu(head->used_length);
+ head->used_length = cpu_to_le16(used_length);
+
+ /* TODO CHECKSUM */
+
+ err = ext4_handle_dirty_metadata(handle, dir, block1);
+ if (err)
+ return err;
+
+ return used_length;
+}
+
+/*
+ * This function removes frame->at + 1 entry from the index and
+ * coalesces the index if necessarry.
+ */
+static int itree_remove_from_index(handle_t *handle, struct inode *dir,
+ struct itree_frame *frame,
+ struct itree_frame *frames)
+{
+ struct itree_node *node, *node1, *node2;
+ struct buffer_head *node_bh, *neighbour_bh, *block1, *block2, *bh;
+ struct itree_entry *end, *entry, *entry1, *entry2;
+ ext4_fsblk_t blk;
+ int blocksize = dir->i_sb->s_blocksize;
+ int count, err = 0, count1, count2, fullness;
+
+ while (frame >= frames) {
+ node_bh = frame->bh;
+ node = (struct itree_node *)(node_bh->b_data);
+ count = le16_to_cpu(node->count);
+ entry = frame->at + 1;
+
+ /* First we update the frame */
+ BUFFER_TRACE(node_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, node_bh);
+ if (err)
+ return err;
+
+ end = frame->entries + count;
+ memmove(entry, entry + 1, (void *)end - (void *)(entry + 1));
+ node->count = cpu_to_le16(--count);
+
+ /*
+ * Remove tree level in case the root has only a single child
+ */
+ if (frame == frames && node->indirect_levels && count == 1) {
+ bh = sb_bread(dir->i_sb, le64_to_cpu(frame->at->block));
+ if (!bh)
+ return -EIO;
+ memcpy(node_bh->b_data, bh->b_data, blocksize);
+ ext4_free_blocks(handle, dir, bh, 0, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, dir, node_bh);
+ if (err)
+ return err;
+
+ if (frame == frames)
+ return 0;
+
+ /* Don't coalesce, remove right away */
+ if (!count) {
+ ext4_free_blocks(handle, dir, frame->bh, 0, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+
+ frame->bh = NULL;
+ frame->at = NULL;
+ frame->entries = NULL;
+
+ frame--;
+ frame->at--;
+ continue;
+ }
+
+ /* Can we coalesce? */
+ frame--;
+ if (!itree_can_coalesce(frame, &entry1, &entry2)) {
+ fullness = itree_get_node_fullness(node);
+ err = itree_update_fullness(handle, dir, frame,
+ fullness);
+ return err;
+ }
+
+ /* Get the neighbour block */
+ if (frame->at == entry1) {
+ blk = le64_to_cpu(entry2->block);
+ neighbour_bh = sb_bread(dir->i_sb, blk);
+ if (!neighbour_bh)
+ return -EIO;
+
+ block1 = node_bh;
+ block2 = neighbour_bh;
+
+ entry = frame->at + 1;
+ } else {
+ blk = le64_to_cpu(entry1->block);
+ neighbour_bh = sb_bread(dir->i_sb, blk);
+ if (!neighbour_bh)
+ return -EIO;
+
+ block1 = neighbour_bh;
+ block2 = node_bh;
+
+ entry = frame->at--;
+ }
+
+ BUFFER_TRACE(block1, "get_write_access");
+ err = ext4_journal_get_write_access(handle, block1);
+ if (err) {
+ brelse(block2);
+ return err;
+ }
+
+ node1 = (struct itree_node *)block1->b_data;
+ node2 = (struct itree_node *)block2->b_data;
+ count1 = le16_to_cpu(node1->count);
+ count2 = le16_to_cpu(node2->count);
+
+ memcpy(node1->entries + count1, node2->entries,
+ count2 * sizeof(struct itree_entry));
+ count = count1 + count2;
+ node1->count = cpu_to_le16(count);
+
+ if ((frame+1)->bh == block2) {
+ (frame+1)->bh = block1;
+ (frame+1)->entries = node1->entries;
+ (frame+1)->at = node1->entries + count1 +
+ ((frame+1)->at - (frame+1)->entries);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, dir, block1);
+ if (err) {
+ brelse(block2);
+ return err;
+ }
+
+ fullness = itree_get_node_fullness(node1);
+ err = itree_update_fullness(handle, dir, frame, fullness);
+
+ brelse(block2);
+ ext4_free_itree_block(handle, dir, le64_to_cpu(entry->block));
+ }
+
+ return err;
+}
+
+static int is_last_leaf(struct itree_frame *frame, struct itree_frame *frames)
+{
+ struct itree_node *node;
+ int count = 0;
+
+ while (frame >= frames) {
+ node = (struct itree_node *)(frame->bh->b_data);
+ count = le16_to_cpu(node->count);
+ if (count > 1)
+ return 0;
+
+ frame--;
+ }
+
+ return 1;
+}
+
+static int itree_do_delete_entry(handle_t *handle, struct inode *dir,
+ struct itree_leaf_entry *entry,
+ struct itree_leaf_entry *prev_entry,
+ struct buffer_head *leaf,
+ struct itree_frame *frame,
+ struct itree_frame *frames)
+{
+ struct itree_entry *entry1 = NULL, *entry2 = NULL;
+ struct itree_leaf_head *head;
+ struct buffer_head *neighbour, *block1, *block2;
+ int used_length = 0, err = 0, fullness;
+ int blocksize = dir->i_sb->s_blocksize;
+
+ BUFFER_TRACE(leaf, "get_write_access");
+ err = ext4_journal_get_write_access(handle, leaf);
+ if (err) {
+ brelse(leaf);
+ return err;
+ }
+
+ itree_erease_leaf_entry(dir, leaf, entry, prev_entry);
+
+ err = ext4_handle_dirty_metadata(handle, dir, leaf);
+ if (err) {
+ brelse(leaf);
+ return err;
+ }
+
+ head = (struct itree_leaf_head *)leaf->b_data;
+ fullness = itree_get_leaf_fullness(head, blocksize);
+ err = itree_update_fullness(handle, dir, frame, fullness);
+
+ /* No coalescing, remove the block right away */
+ used_length = le16_to_cpu(head->used_length);
+ if (!used_length && !is_last_leaf(frame, frames)) {
+ brelse(leaf);
+ ext4_free_blocks(handle, dir, leaf, 0, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+
+ frame->at--;
+ err = itree_remove_from_index(handle, dir, frame, frames);
+ return err;
+ }
+
+ if (!itree_can_coalesce(frame, &entry1, &entry2))
+ return 0;
+
+ /* Get the neighbour leaf block */
+ if (frame->at == entry1) {
+ neighbour = sb_bread(dir->i_sb, le64_to_cpu(entry2->block));
+ if (!neighbour) {
+ brelse(leaf);
+ return -EIO;
+ }
+
+ block1 = leaf;
+ block2 = neighbour;
+ } else {
+ neighbour = sb_bread(dir->i_sb, le64_to_cpu(entry1->block));
+ if (!neighbour) {
+ brelse(leaf);
+ return -EIO;
+ }
+
+ block1 = neighbour;
+ block2 = leaf;
+
+ frame->at--;
+ }
+
+ head = itree_leaf_get_head(block2);
+ if (head->used_length) {
+ err = itree_merge_leaf_nodes(handle, dir, block1, block2);
+ if (err < 0) {
+ brelse(block1);
+ brelse(block2);
+ return err;
+ }
+
+ head = (struct itree_leaf_head *)block1->b_data;
+ fullness = itree_get_leaf_fullness(head, blocksize);
+ err = itree_update_fullness(handle, dir, frame, fullness);
+ }
+
+ brelse(block1);
+ brelse(block2);
+ ext4_free_itree_block(handle, dir, le64_to_cpu(entry2->block));
+
+ err = itree_remove_from_index(handle, dir, frame, frames);
+
+ return err;
+}
+
+static int itree_delete_entry(handle_t *handle, struct inode *dir,
+ struct dentry *dentry)
+{
+ /*
+ * TODO When deleting the last entry, look at the previous
+ * entry and if it is different, check the collission flag
+ * of the next block and clear it.
+ */
+
+ /* This will be called from ext4_delete_entry() */
+ struct itree_frame frames[ITREE_MAX_DEPTH];
+ struct itree_frame *frame;
+ struct buffer_head *bh, *leaf;
+ ext4_fsblk_t root_blk, leaf_blk;
+ int err, retval;
+ struct itree_leaf_entry *entry, *prev_entry;
+ struct dx_hash_info hinfo;
+ struct dx_root *root;
+ struct itree_key key;
+
+ /* TODO Split the finding code to itree_find_entry? */
+
+ memset(frames, 0, ITREE_MAX_DEPTH*sizeof(struct itree_frame));
+
+ /* Get itree root block */
+ bh = ext4_bread(NULL, dir, 0, 0, &err);
+ if (!bh)
+ return err;
+
+ root = (struct dx_root *) bh->b_data;
+
+ err = dx_get_itree_root(dir, (struct ext4_dir_entry *)bh->b_data,
+ &root_blk);
+ brelse(bh);
+ if (err)
+ return err;
+
+ /* TODO Checksum */
+
+ hinfo.hash_version = root->info.hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, &hinfo);
+
+ key.inode = dentry->d_inode->i_ino;
+ key.hash = hinfo.hash;
+
+ frame = itree_probe(&key, dir, root_blk, frames, &err);
+ if (!frame)
+ return err;
+
+ while (1) {
+ /* Get leaf */
+ leaf_blk = le64_to_cpu(frame->at->block);
+ err = -EIO;
+ leaf = sb_getblk(dir->i_sb, leaf_blk);
+ if (!leaf)
+ goto out; /* FIXME change GOTO's to breaks? */
+
+ /* TODO Checksum */
+
+ retval = itree_search_leaf(leaf, dir, key.inode, key.hash,
+ &(dentry->d_name), &entry,
+ &prev_entry);
+ if (retval == 1) {
+ /*
+ * The 'leaf' buffer head is released within this
+ * function. It might also invalidate the frames
+ * in case some blocks in the tree are merged.
+ */
+ err = itree_do_delete_entry(handle, dir, entry,
+ prev_entry, leaf,
+ frame, frames);
+ goto out;
+ } else if (retval == -1) {
+ err = -EIO;
+ brelse(leaf);
+ /* TODO: print error bad itree */
+ goto out;
+ }
+
+ /* Not found in the block. Could be collision. */
+ brelse(leaf);
+ retval = itree_next_frame(dir, key.inode, frames, frame, 0);
+ if (retval < 0) {
+ err = retval;
+ goto out;
+ }
+
+ err = -ENOENT;
+ if (retval > 0)
+ goto out;
+ }
+
+out:
+ itree_release_frames(frames);
+ return err;
+}
+
/*
* Arrange the dirents to the two new itree blocks in the order
* specified by the map. Returns 1 in case the split occured within
--
1.7.11.7
^ permalink raw reply related [flat|nested] 10+ messages in thread* [RFC v2 8/9] ext4: Make directory operations use itree
2013-05-13 15:42 [RFC v2 0/9] ext4: An Auxiliary Tree for the Directory Index Radek Pazdera
` (6 preceding siblings ...)
2013-05-13 15:42 ` [RFC v2 7/9] ext4: Adding itree implementation III - Deleting Radek Pazdera
@ 2013-05-13 15:42 ` Radek Pazdera
2013-05-13 15:42 ` [RFC v2 9/9] ext4: Make ext4_readdir() use itree if available Radek Pazdera
8 siblings, 0 replies; 10+ messages in thread
From: Radek Pazdera @ 2013-05-13 15:42 UTC (permalink / raw)
To: linux-ext4; +Cc: lczerner, kasparek, Radek Pazdera
This commit adds some changes to the existing directory operations so
they use the itree as well, apart from the original index tree.
Signed-off-by: Radek Pazdera <rpazdera@redhat.com>
---
fs/ext4/namei.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 59 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d5e09eb..65312d3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2026,6 +2026,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
ext4_lblk_t block;
struct fake_dirent *fde;
int csum_size = 0;
+ ext4_fsblk_t itree_root_block;
if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -2058,9 +2059,11 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
brelse(bh);
return PTR_ERR(bh2);
}
+
ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
- data1 = bh2->b_data;
+ ext4_set_inode_flag(dir, EXT4_INODE_ITREE);
+ data1 = bh2->b_data;
memcpy (data1, de, len);
de = (struct ext4_dir_entry_2 *) data1;
top = data1 + len;
@@ -2087,11 +2090,26 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
dx_set_count(entries, 1);
dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
- /* Initialize as for dx_probe */
hinfo.hash_version = root->info.hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_ITREE)) {
+ retval = itree_init(handle, dir, &hinfo,
+ (struct ext4_dir_entry_2 *)data1,
+ dentry, inode, &itree_root_block);
+ if (retval)
+ return retval;
+ retval = dx_set_itree_root(dir,
+ (struct ext4_dir_entry *)bh->b_data,
+ itree_root_block);
+ if (retval)
+ return retval;
+ }
+
+ /* Initialize as for dx_probe */
ext4fs_dirhash(name, namelen, &hinfo);
frame = frames;
frame->entries = entries;
@@ -2102,7 +2120,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
ext4_handle_dirty_dx_node(handle, dir, frame->bh);
ext4_handle_dirty_dirent_node(handle, dir, bh);
- de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+ de = do_split(handle, dir, &bh, frame, &hinfo, &retval);
if (!de) {
/*
* Even if the block split failed, we have to properly write
@@ -2216,10 +2234,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct dx_frame frames[2], *frame;
struct dx_entry *entries, *at;
struct dx_hash_info hinfo;
- struct buffer_head *bh;
+ struct buffer_head *bh = NULL;
struct inode *dir = dentry->d_parent->d_inode;
struct super_block *sb = dir->i_sb;
+ struct ext4_dir_entry *dirent;
struct ext4_dir_entry_2 *de;
+ ext4_fsblk_t itree_root;
int err;
frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
@@ -2227,6 +2247,24 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
return err;
entries = frame->entries;
at = frame->at;
+
+ /*
+ * XXX This currently fails without any attempt to recover,
+ * but we could also turn off the auxiliary tree and only
+ * print a warning.
+ */
+ if (dx_itree(dir)) {
+ dirent = (struct ext4_dir_entry *)frames[0].bh->b_data;
+ err = dx_get_itree_root(dir, dirent, &itree_root);
+ if (err)
+ goto cleanup;
+
+ err = itree_add_entry(handle, dir, itree_root,
+ dentry, inode, hinfo.hash);
+ if (err)
+ goto cleanup;
+ }
+
bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
@@ -2947,6 +2985,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
+ if (dx_itree(dir)) {
+ retval = itree_delete_entry(handle, dir, dentry);
+ if (retval)
+ goto end_rmdir;
+ }
+
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
@@ -3016,6 +3060,11 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
+ if (dx_itree(dir)) {
+ retval = itree_delete_entry(handle, dir, dentry);
+ if (retval)
+ goto end_unlink;
+ }
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
@@ -3355,6 +3404,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dir->i_ino, old_dir->i_nlink, retval);
}
+ if (dx_itree(old_dir)) {
+ retval = itree_delete_entry(handle, old_dir, old_dentry);
+ if (retval)
+ goto end_rename;
+ }
+
if (new_inode) {
ext4_dec_count(handle, new_inode);
new_inode->i_ctime = ext4_current_time(new_inode);
--
1.7.11.7
^ permalink raw reply related [flat|nested] 10+ messages in thread* [RFC v2 9/9] ext4: Make ext4_readdir() use itree if available
2013-05-13 15:42 [RFC v2 0/9] ext4: An Auxiliary Tree for the Directory Index Radek Pazdera
` (7 preceding siblings ...)
2013-05-13 15:42 ` [RFC v2 8/9] ext4: Make directory operations use itree Radek Pazdera
@ 2013-05-13 15:42 ` Radek Pazdera
8 siblings, 0 replies; 10+ messages in thread
From: Radek Pazdera @ 2013-05-13 15:42 UTC (permalink / raw)
To: linux-ext4; +Cc: lczerner, kasparek, Radek Pazdera
This patch adds an alternative implementation of ext4_readdir() that
will read the directory in inode order in case there is a inode tree
available in the directory index.
Signed-off-by: Radek Pazdera <rpazdera@redhat.com>
---
fs/ext4/dir.c | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
fs/ext4/ext4.h | 9 +++
fs/ext4/namei.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 345 insertions(+), 6 deletions(-)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f8d56e4..4978430 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -29,8 +29,8 @@
#include "ext4.h"
#include "xattr.h"
-static int ext4_dx_readdir(struct file *filp,
- void *dirent, filldir_t filldir);
+static int ext4_dx_readdir(struct file *filp, void *dirent, filldir_t filldir);
+static int ext4_itree_readdir(struct file *filp, void *buf, filldir_t filldir);
/**
* Check if the given dir-inode refers to an htree-indexed directory
@@ -116,6 +116,12 @@ static int ext4_readdir(struct file *filp,
int ret = 0;
int dir_has_error = 0;
+ if (dx_itree(inode)) {
+ ret = ext4_itree_readdir(filp, dirent, filldir);
+ if (!ret)
+ goto out;
+ }
+
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
@@ -350,14 +356,16 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
}
/*
- * This structure holds the nodes of the red-black tree used to store
- * the directory entry in hash order.
+ * This structure holds the nodes of the red-black tree while reading
+ * the directory in hash order. It is also used to store collision
+ * chains while reading the directory in inode order.
*/
struct fname {
__u32 hash;
__u32 minor_hash;
struct rb_node rb_hash;
struct fname *next;
+ struct list_head cache_list;
__u32 inode;
__u8 name_len;
__u8 file_type;
@@ -613,10 +621,167 @@ finished:
return 0;
}
+/*
+ * While reading the directory from using the inode tree,
+ * the filp->f_pos offset is set to the current key, i.e.,
+ * the inode and the hash of the entry read. The LSB in the
+ * 32bit hash value is not used, so we use it to indicate
+ * the end of the directory file.
+ */
+#define ITREE_POS_HASH_OFF 0
+#define ITREE_POS_INODE_OFF 32
+#define ITREE_POS_EOF_BIT 1
+
+loff_t itree_get_pos(u32 ino, u32 hash)
+{
+ return ((loff_t)(ino) << ITREE_POS_INODE_OFF) |
+ ((loff_t)(hash) << ITREE_POS_HASH_OFF);
+}
+
+static u32 itree_pos_to_inode(loff_t pos)
+{
+ return (u32)(pos >> ITREE_POS_INODE_OFF);
+}
+
+static u32 itree_pos_to_hash(loff_t pos)
+{
+ return (u32)(pos >> ITREE_POS_HASH_OFF) & (~1);
+}
+
+struct dir_private_info *ext4_itree_create_dir_info(struct file *filp,
+ loff_t pos)
+{
+ struct dir_private_info *info;
+
+ info = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ if (!info)
+ return NULL;
+
+ INIT_LIST_HEAD(&(info->entry_cache));
+ info->curr_inode = itree_pos_to_inode(pos);
+ info->curr_hash = itree_pos_to_hash(pos);
+ return info;
+}
+
+void ext4_itree_free_entry_cache(struct dir_private_info *info)
+{
+ struct list_head *list = &(info->entry_cache);
+ struct fname *entry, *next;
+ list_for_each_entry_safe(entry, next, list, cache_list) {
+ list_del(&(entry->cache_list));
+ kfree(entry);
+ }
+}
+
+void ext4_itree_free_dir_info(struct dir_private_info *info)
+{
+ ext4_itree_free_entry_cache(info);
+ kfree(info);
+}
+
+/*
+ * Store an entry into the collision chain.
+ *
+ * This can occur in case the filldir buffer runs out during a
+ * collision between two entries in the tree. We must read them
+ * all at once and store them in memory, because the next round
+ * of filldir starting from this key would return some of the
+ * entries again.
+ */
+int itree_cache_entry(struct list_head *entry_cache, u32 hash,
+ struct ext4_dir_entry_2 *de)
+{
+ struct fname *entry;
+ int len = sizeof(struct fname) + de->name_len + 1;
+
+ entry = kzalloc(len, GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->hash = hash;
+ entry->inode = le32_to_cpu(de->inode);
+ entry->file_type = de->file_type;
+ entry->name_len = de->name_len;
+ memcpy(entry->name, de->name, entry->name_len);
+
+ list_add_tail(&(entry->cache_list), entry_cache);
+ return 0;
+}
+
+int ext4_itree_readdir(struct file *filp, void *buf, filldir_t filldir)
+{
+ struct inode *dir = filp->f_path.dentry->d_inode;
+ struct dir_private_info *info = filp->private_data;
+ struct fname *entry, *next;
+ int retval = 0;
+ u32 ino, hash, new_ino, new_hash;
+ loff_t pos;
+
+ if (!info) {
+ info = ext4_itree_create_dir_info(filp, filp->f_pos);
+ if (!info)
+ return -ENOMEM;
+ filp->private_data = info;
+ }
+
+ /* Someone changed the position, drop the collision chain */
+ if (filp->f_pos != info->last_pos) {
+ ext4_itree_free_entry_cache(info);
+ info->curr_inode = itree_pos_to_inode(filp->f_pos);
+ info->curr_hash = itree_pos_to_hash(filp->f_pos);
+ }
+
+ /* Return the entries from the collision chain first */
+ if (!list_empty(&(info->entry_cache))) {
+ list_for_each_entry_safe(entry, next, &(info->entry_cache),
+ cache_list) {
+ retval = filldir(buf, entry->name, entry->name_len,
+ filp->f_pos, entry->inode,
+ get_dtype(dir->i_sb,
+ entry->file_type));
+ pos = itree_get_pos(entry->inode, entry->hash);
+ if (filp->f_pos & ITREE_POS_EOF_BIT)
+ pos |= ITREE_POS_EOF_BIT;
+ filp->f_pos = pos;
+ if (retval) {
+ if (retval == -EINVAL)
+ return 0; /* buffer full */
+ return retval;
+ }
+ list_del(&(entry->cache_list));
+ kfree(entry);
+ }
+ }
+
+ /* The end of the directory file */
+ if (filp->f_pos & ITREE_POS_EOF_BIT)
+ return 0;
+
+ ino = itree_pos_to_inode(filp->f_pos);
+ hash = itree_pos_to_hash(filp->f_pos);
+
+ /* Read entries from the itree */
+ retval = ext4_read_itree(dir, ino, hash, buf, filldir,
+ &(info->entry_cache), &new_ino, &new_hash);
+
+ filp->f_pos = itree_get_pos(new_ino, new_hash);
+ if (retval > 0) {
+ filp->f_pos |= ITREE_POS_EOF_BIT;
+ retval = 0;
+ }
+ info->last_pos = filp->f_pos;
+
+ return retval;
+}
+
static int ext4_release_dir(struct inode *inode, struct file *filp)
{
- if (filp->private_data)
- ext4_htree_free_dir_info(filp->private_data);
+ if (filp->private_data) {
+ if (dx_itree(inode))
+ ext4_itree_free_dir_info(filp->private_data);
+ else
+ ext4_htree_free_dir_info(filp->private_data);
+ }
return 0;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9512702..d86399f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1775,7 +1775,9 @@ struct dir_private_info {
struct rb_root root;
struct rb_node *curr_node;
struct fname *extra_fname;
+ struct list_head entry_cache;
loff_t last_pos;
+ __u32 curr_inode; /* Used only for itree */
__u32 curr_hash;
__u32 curr_minor_hash;
__u32 next_hash;
@@ -1988,6 +1990,9 @@ void ext4_insert_dentry(struct inode *inode,
struct ext4_dir_entry_2 *de,
int buf_size,
const char *name, int namelen);
+extern loff_t itree_get_pos(u32 ino, u32 hash);
+extern int itree_cache_entry(struct list_head *entry_cache, u32 hash,
+ struct ext4_dir_entry_2 *de);
static inline void ext4_update_dx_flag(struct inode *inode)
{
if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
@@ -2156,6 +2161,10 @@ extern int ext4_generic_delete_entry(handle_t *handle,
int buf_size,
int csum_size);
+extern int ext4_read_itree(struct inode *dir, u32 ino, u32 hash, void *buf,
+ filldir_t filldir, struct list_head *entry_cache,
+ u32 *new_ino, u32 *new_hash);
+
/* resize.c */
extern int ext4_group_add(struct super_block *sb,
struct ext4_new_group_data *input);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 65312d3..bf31f4d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -5246,6 +5246,171 @@ out:
return err;
}
+/*
+ * Returns negative on error, zero when the end of the block was
+ * reached, and positive in case the filldir buffer is full.
+ */
+static int itree_leaf_to_buffer(struct inode *dir, u32 start_ino,
+ u32 start_hash, ext4_fsblk_t leaf_blk,
+ void *buf, filldir_t filldir,
+ struct list_head *entry_cache,
+ int *continuation, u32 *last_ino,
+ u32 *last_hash)
+{
+ struct buffer_head *bh;
+ struct itree_leaf_head *lh;
+ struct itree_leaf_entry *le;
+ struct ext4_dir_entry_2 *de;
+ int blocksize, err = 0, retval;
+ loff_t pos;
+ u32 ino, hash;
+ int buffer_full = 0;
+
+ bh = sb_bread(dir->i_sb, leaf_blk);
+ if (!bh)
+ return -EIO;
+
+ blocksize = dir->i_sb->s_blocksize;
+ lh = itree_leaf_get_head(bh);
+
+ itree_leaf_for_each(le, de, lh, blocksize) {
+ ino = le32_to_cpu(de->inode);
+ hash = le32_to_cpu(le->hash);
+
+ if (buffer_full) {
+ err = 1; /* buffer full */
+ if (ino == *last_ino && hash == *last_hash) {
+ retval = itree_cache_entry(entry_cache,
+ le32_to_cpu(le->hash),
+ de);
+ if (retval) /* FIXME report error */
+ break;
+ *continuation = 1;
+ } else {
+ *continuation = 0;
+ break;
+ }
+ } else if (ino && (ino > start_ino || (ino == start_ino &&
+ hash > start_hash))) {
+ pos = itree_get_pos(ino, hash);
+ retval = filldir(buf, de->name, de->name_len, pos, ino,
+ get_dtype(dir->i_sb, de->file_type));
+ if (retval) {
+ err = 1;
+ if (retval != -EINVAL) { /* error */
+ err = retval;
+ break;
+ }
+ /* buffer full */
+ buffer_full = 1;
+ continue;
+ }
+ *last_ino = ino;
+ *last_hash = hash;
+ }
+ }
+
+ brelse(bh);
+ return err;
+}
+
+int ext4_read_itree(struct inode *dir, u32 ino, u32 hash, void *buf,
+ filldir_t filldir, struct list_head *entry_cache,
+ u32 *new_ino, u32 *new_hash)
+{
+ struct itree_frame frames[ITREE_MAX_DEPTH];
+ struct itree_frame *frame;
+ struct buffer_head *bh;
+ struct itree_key key;
+ struct dx_root *dxr;
+ ext4_fsblk_t root_blk, leaf_blk;
+ int err = 0, retval;
+ int cont = 0;
+
+ /* TODO Split the finding code to itree_find_entry? */
+ /* TODO Merge the finding code with itree_delete_entry()*/
+
+ memset(frames, 0, ITREE_MAX_DEPTH*sizeof(struct itree_frame));
+
+ key.inode = ino;
+ key.hash = hash;
+
+ /* Get itree root block */
+ bh = ext4_bread(NULL, dir, 0, 0, &err);
+ if (!bh)
+ return err;
+
+ /* FIXME: Check if it is still a itree dir */
+
+ dxr = (struct dx_root *)bh->b_data;
+
+ /* . */
+ if (!ino && (hash < 2)) {
+ retval = filldir(buf, dxr->dot_name, dxr->dot.name_len,
+ itree_get_pos(0, hash),
+ le32_to_cpu(dxr->dot.inode),
+ get_dtype(dir->i_sb, dxr->dot.file_type));
+ *new_hash = hash + 2;
+ if (retval) {
+ if (retval == -EINVAL)
+ return 0;
+ return retval;
+ }
+ }
+
+ /* .. */
+ if (!ino && (hash < 4)) {
+ retval = filldir(buf, dxr->dotdot_name, dxr->dotdot.name_len,
+ itree_get_pos(0, hash),
+ le32_to_cpu(dxr->dotdot.inode),
+ get_dtype(dir->i_sb, dxr->dotdot.file_type));
+ *new_hash = hash + 4;
+ if (retval) {
+ if (retval == -EINVAL)
+ return 0;
+ return retval;
+ }
+ }
+
+ err = dx_get_itree_root(dir, (struct ext4_dir_entry *)bh->b_data,
+ &root_blk);
+ brelse(bh);
+ if (err)
+ return err;
+
+ frame = itree_probe(&key, dir, root_blk, frames, &err);
+ if (!frame)
+ return err;
+
+ while (1) {
+ leaf_blk = le64_to_cpu(frame->at->block);
+ retval = itree_leaf_to_buffer(dir, ino, hash, leaf_blk, buf,
+ filldir, entry_cache, &cont,
+ new_ino, new_hash);
+
+ /* error */
+ if (retval < 0) {
+ err = retval;
+ break;
+ }
+
+ /* buffer full */
+ if (retval > 0 && !cont) {
+ err = 0;
+ break;
+ }
+
+ /* retval == 0; get next block */
+ err = itree_next_frame(dir, ino, frames, frame, 1);
+ if (err)
+ goto out;
+ }
+
+out:
+ itree_release_frames(frames);
+ return err;
+}
+
#ifdef ITREE_DEBUG
static void itree_show_node(struct buffer_head *bh)
{
--
1.7.11.7
^ permalink raw reply related [flat|nested] 10+ messages in thread