* [PATCH v2 1/3] ext4: return lblk from ext4_find_entry
@ 2020-04-07 6:46 Harshad Shirwadkar
2020-04-07 6:46 ` [PATCH v2 2/3] ext4: shrink directories on dentry delete Harshad Shirwadkar
2020-04-07 6:46 ` [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty Harshad Shirwadkar
0 siblings, 2 replies; 13+ messages in thread
From: Harshad Shirwadkar @ 2020-04-07 6:46 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger, Harshad Shirwadkar
This patch makes ext4_find_entry and related routines to return
logical block address of the dirent block. This logical block address
is used in the directory shrinking code to perform reverse lookup and
verify that the lookup was successful.
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/namei.c | 54 +++++++++++++++++++++++++++++--------------------
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b05ea72f38fd..d567b9589875 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -295,7 +295,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
__u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
- struct ext4_dir_entry_2 **res_dir);
+ struct ext4_dir_entry_2 **res_dir, ext4_lblk_t *lblk);
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
@@ -1443,7 +1443,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
static struct buffer_head *__ext4_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir,
- int *inlined)
+ int *inlined, ext4_lblk_t *lblk)
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -1485,7 +1485,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
goto restart;
}
if (is_dx(dir)) {
- ret = ext4_dx_find_entry(dir, fname, res_dir);
+ ret = ext4_dx_find_entry(dir, fname, res_dir, lblk);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
@@ -1588,7 +1588,8 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
static struct buffer_head *ext4_find_entry(struct inode *dir,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
- int *inlined)
+ int *inlined,
+ ext4_lblk_t *lblk)
{
int err;
struct ext4_filename fname;
@@ -1600,7 +1601,7 @@ static struct buffer_head *ext4_find_entry(struct inode *dir,
if (err)
return ERR_PTR(err);
- bh = __ext4_find_entry(dir, &fname, res_dir, inlined);
+ bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lblk);
ext4_fname_free_filename(&fname);
return bh;
@@ -1620,7 +1621,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
if (err)
return ERR_PTR(err);
- bh = __ext4_find_entry(dir, &fname, res_dir, NULL);
+ bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL);
ext4_fname_free_filename(&fname);
return bh;
@@ -1628,7 +1629,8 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
- struct ext4_dir_entry_2 **res_dir)
+ struct ext4_dir_entry_2 **res_dir,
+ ext4_lblk_t *lblk)
{
struct super_block * sb = dir->i_sb;
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
@@ -1675,6 +1677,8 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
errout:
dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name));
success:
+ if (lblk)
+ *lblk = block;
dx_release(frames);
return bh;
}
@@ -1743,7 +1747,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
+ bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL, NULL);
if (IS_ERR(bh))
return ERR_CAST(bh);
if (!bh)
@@ -2495,7 +2499,7 @@ int ext4_generic_delete_entry(handle_t *handle,
static int ext4_delete_entry(handle_t *handle,
struct inode *dir,
struct ext4_dir_entry_2 *de_del,
- struct buffer_head *bh)
+ struct buffer_head *bh, ext4_lblk_t lblk)
{
int err, csum_size = 0;
@@ -3091,6 +3095,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
+ ext4_lblk_t lblk;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
return -EIO;
@@ -3105,7 +3110,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
return retval;
retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, &lblk);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (!bh)
@@ -3132,7 +3137,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- retval = ext4_delete_entry(handle, dir, de, bh);
+ retval = ext4_delete_entry(handle, dir, de, bh, lblk);
if (retval)
goto end_rmdir;
if (!EXT4_DIR_LINK_EMPTY(inode))
@@ -3178,6 +3183,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
+ ext4_lblk_t lblk;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
return -EIO;
@@ -3193,7 +3199,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
return retval;
retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, &lblk);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (!bh)
@@ -3216,7 +3222,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- retval = ext4_delete_entry(handle, dir, de, bh);
+ retval = ext4_delete_entry(handle, dir, de, bh, lblk);
if (retval)
goto end_unlink;
dir->i_ctime = dir->i_mtime = current_time(dir);
@@ -3564,19 +3570,20 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
+ ext4_lblk_t lblk;
- bh = ext4_find_entry(dir, d_name, &de, NULL);
+ bh = ext4_find_entry(dir, d_name, &de, NULL, &lblk);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (bh) {
- retval = ext4_delete_entry(handle, dir, de, bh);
+ retval = ext4_delete_entry(handle, dir, de, bh, lblk);
brelse(bh);
}
return retval;
}
static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
- int force_reread)
+ int force_reread, ext4_lblk_t lblk)
{
int retval;
/*
@@ -3593,7 +3600,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
retval = ext4_find_delete_entry(handle, ent->dir,
&ent->dentry->d_name);
} else {
- retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
+ retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh,
+ lblk);
if (retval == -ENOENT) {
retval = ext4_find_delete_entry(handle, ent->dir,
&ent->dentry->d_name);
@@ -3679,6 +3687,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *whiteout = NULL;
int credits;
u8 old_file_type;
+ ext4_lblk_t lblk;
if (new.inode && new.inode->i_nlink == 0) {
EXT4_ERROR_INODE(new.inode,
@@ -3706,7 +3715,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
return retval;
}
- old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL,
+ &lblk);
if (IS_ERR(old.bh))
return PTR_ERR(old.bh);
/*
@@ -3720,7 +3730,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
- &new.de, &new.inlined);
+ &new.de, &new.inlined, NULL);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
new.bh = NULL;
@@ -3817,7 +3827,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
/*
* ok, that's it
*/
- ext4_rename_delete(handle, &old, force_reread);
+ ext4_rename_delete(handle, &old, force_reread, lblk);
}
if (new.inode) {
@@ -3900,7 +3910,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
return retval;
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
- &old.de, &old.inlined);
+ &old.de, &old.inlined, NULL);
if (IS_ERR(old.bh))
return PTR_ERR(old.bh);
/*
@@ -3914,7 +3924,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
- &new.de, &new.inlined);
+ &new.de, &new.inlined, NULL);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
new.bh = NULL;
--
2.26.0.292.g33ef6b2f38-goog
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH v2 2/3] ext4: shrink directories on dentry delete
2020-04-07 6:46 [PATCH v2 1/3] ext4: return lblk from ext4_find_entry Harshad Shirwadkar
@ 2020-04-07 6:46 ` Harshad Shirwadkar
2020-04-07 16:51 ` Andreas Dilger
2020-04-07 6:46 ` [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty Harshad Shirwadkar
1 sibling, 1 reply; 13+ messages in thread
From: Harshad Shirwadkar @ 2020-04-07 6:46 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger, Harshad Shirwadkar
This patch adds shrinking support for htree based directories. The
high level algorithm is as follows:
* If after dentry removal the dirent block (let's call it B) becomes
empty, then remove its references in its dx parent.
* Swap its contents with that of the last block (L) in directory.
* Update L's parents to point to B instead.
* Remove L
* Repeat this for all the ancestors of B.
We add variants of dx_probe that allow us perform reverse lookups from
a logical block to its dx parents.
Ran kvm-xfstests smoke and verified that no new failures are
introduced. Ran shrinking for directories with following number of
files and then deleted files one by one:
* 1000 (size before deletion 36K, after deletion 4K)
* 10000 (size before deletion 196K, after deletion 4K)
* 100000 (size before deletion 2.1M, after deletion 4K)
* 200000 (size before deletion 4.2M, after deletion 4K)
In all cases directory shrunk significantly. We fallback to linear
directories if the directory becomes empty.
But note that most of the shrinking happens during last 1-2% deletions
in an average case. Therefore, the next step here is to merge dx nodes
when possible. That can be achieved by storing the fullness index in
htree nodes.
Performance Testing:
-------------------
Created 1 million files and unlinked all of them. Did this with and without
directory shrinking. Journalling was on. Used ftrace to measure time
spent in ext4_unlink:
* Without directory shrinking
Size Before: 22M /mnt/1
1000000 files created and deleted.
Average Total Elapsed Time (across 3 runs): 43.787s
Size After: 22M /mnt/1
useconds #count
------------------------
0-9 3690
10-19 2790131
20-29 179209
30-39 14270
40-49 7981
50-59 2212
60-69 1132
70-79 703
80-89 403
90-99 274
Num samples > 40us: 12615
* With directory shrinking
Size Before: 22M /mnt/1
1000000 files created and deleted.
Average Total Elapsed Time(across 3 runs): 44.230s
Size After: 4.0K /mnt/1
useconds #count
-----------------------
0-9 3316
10-19 2786451
20-29 172015
30-39 13259
40-49 17847
50-59 4843
60-69 924
70-79 569
80-89 390
90-99 389
Num samples > 40us: 24962
We see doubled number of samples of >40us in case of directory shrinking.
Because these constitute to < 1% of total samples, the overall effect of
direcotry shrinking on unlink / rmdir performance is negligible. There
was no notable difference in cpu usage.
This patch supersedes the other directory shrinking patch sent in Aug
2019 ("ext4: attempt to shrink directory on dentry removal").
Changes since V1:
* ext4_remove_dx_entry(), dx_probe_dx_node() fixes
* dx_probe_dirent_blk() continuation fix
* Added performance evaluation
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
fs/ext4/ext4.h | 3 +-
fs/ext4/ext4_jbd2.h | 7 +
fs/ext4/inline.c | 2 +-
fs/ext4/namei.c | 357 ++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 356 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 61b37a052052..320e34b141f6 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2746,7 +2746,8 @@ extern int ext4_generic_delete_entry(handle_t *handle,
struct buffer_head *bh,
void *entry_buf,
int buf_size,
- int csum_size);
+ int csum_size,
+ bool *empty);
extern bool ext4_empty_dir(struct inode *inode);
/* resize.c */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 7ea4f6fa173b..5b18276c7321 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -83,6 +83,13 @@
*/
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U
+/*
+ * Number of credits needed to perform directory shrinking. For each htree
+ * level, we need 6 blocks (block in question, its parent, last block, last
+ * block's parent, bitmap block, bg summary).
+ */
+#define EXT4_DIR_SHRINK_TRANS_BLOCKS 6U
+
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index fad82d08fca5..3650a316e431 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1706,7 +1706,7 @@ int ext4_delete_inline_entry(handle_t *handle,
goto out;
err = ext4_generic_delete_entry(handle, dir, de_del, bh,
- inline_start, inline_size, 0);
+ inline_start, inline_size, 0, NULL);
if (err)
goto out;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d567b9589875..39360c442dad 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -275,10 +275,12 @@ static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
-static struct dx_frame *dx_probe(struct ext4_filename *fname,
+static struct dx_frame *__dx_probe(struct ext4_filename *fname,
struct inode *dir,
struct dx_hash_info *hinfo,
- struct dx_frame *frame);
+ struct dx_frame *frame,
+ ext4_lblk_t lblk_end,
+ bool shrink);
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
unsigned blocksize, struct dx_hash_info *hinfo,
@@ -747,15 +749,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
/*
* Probe for a directory leaf block to search.
*
- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+ * __dx_probe can return ERR_BAD_DX_DIR, which means there was a format
* error in the directory index, and the caller should fall back to
- * searching the directory normally. The callers of dx_probe **MUST**
+ * searching the directory normally. The callers of __dx_probe **MUST**
* check for this error code, and make sure it never gets reflected
- * back to userspace.
+ * back to userspace. If lblk_end is set and if __dx_probe hits lblk_end
+ * logical block, it reads it and stops. If strict is true, __dx_probe
+ * performs stronger checks for "count" during the lookup. See other
+ * variants below.
*/
static struct dx_frame *
-dx_probe(struct ext4_filename *fname, struct inode *dir,
- struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+__dx_probe(struct ext4_filename *fname, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in,
+ ext4_lblk_t lblk_end, bool strict)
{
unsigned count, indirect;
struct dx_entry *at, *entries, *p, *q, *m;
@@ -820,7 +826,9 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
dxtrace(printk("Look up %x", hash));
while (1) {
count = dx_get_count(entries);
- if (!count || count > dx_get_limit(entries)) {
+ /* If we are called from the shrink path */
+ if (count > dx_get_limit(entries) ||
+ (strict && !count)) {
ext4_warning_inode(dir,
"dx entry: count %u beyond limit %u",
count, dx_get_limit(entries));
@@ -859,7 +867,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
dx_get_block(at)));
frame->entries = entries;
frame->at = at;
- if (!indirect--)
+ if (!indirect-- || dx_get_block(at) == lblk_end)
return frame;
frame++;
frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
@@ -889,6 +897,134 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
return ret_err;
}
+static inline struct dx_frame *
+dx_probe(struct ext4_filename *fname, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+{
+ return __dx_probe(fname, dir, hinfo, frame_in, 0, true);
+}
+
+/*
+ * dx_probe with relaxed checks. This function is used in the directory
+ * shrinking code since we can run into intermediate states where we have
+ * internal dx nodes with count = 0.
+ */
+static inline struct dx_frame *
+dx_probe_relaxed(struct ext4_filename *fname, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+{
+ return __dx_probe(fname, dir, hinfo, frame_in, 0, false);
+}
+
+/* dx_probe() variant that allows us to lookup frames for a dirent block. */
+struct dx_frame *dx_probe_dirent_blk(struct inode *dir, struct dx_frame *frames,
+ struct buffer_head *bh, ext4_lblk_t lblk)
+{
+ struct dx_hash_info hinfo;
+ struct dx_frame *frame_ptr;
+ struct ext4_dir_entry_2 *dead_de;
+
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+
+ /* Let's assume that the lblk is a leaf dirent block */
+ dead_de = (struct ext4_dir_entry_2 *)bh->b_data;
+ ext4fs_dirhash(dir, dead_de->name, dead_de->name_len, &hinfo);
+
+ frame_ptr = dx_probe_relaxed(NULL, dir, &hinfo, frames);
+ if (!IS_ERR(frame_ptr)) {
+ /*
+ * See if we need to move our frame_ptr.
+ */
+ while (dx_get_block(frame_ptr->at) != lblk) {
+ if (!ext4_htree_next_block(dir, hinfo.hash, frame_ptr,
+ frames, NULL))
+ break;
+ }
+ if (dx_get_block(frame_ptr->at) == lblk)
+ return frame_ptr;
+ dx_release(frames);
+ frame_ptr = ERR_PTR(-EINVAL);
+ }
+ return frame_ptr;
+}
+
+/* dx_probe() variant that allows us to lookup frames for a dx node. */
+struct dx_frame *dx_probe_dx_node(struct inode *dir, struct dx_frame *frames,
+ struct buffer_head *bh, ext4_lblk_t lblk)
+{
+ struct dx_hash_info hinfo;
+ struct dx_frame *frame_ptr;
+ struct dx_entry *entries;
+
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ entries = ((struct dx_node *)(bh->b_data))->entries;
+
+ /* Use the first hash found in the dx node */
+ hinfo.hash = le32_to_cpu(entries[1].hash);
+ frame_ptr = __dx_probe(NULL, dir, &hinfo, frames, lblk, false);
+ if (IS_ERR_OR_NULL(frame_ptr))
+ return frame_ptr;
+ if (dx_get_block(frame_ptr->at) != lblk) {
+ dx_release(frames);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return frame_ptr;
+}
+
+/*
+ * This function tries to remove the entry of a dirent block (which was just
+ * emptied by the caller) from the dx frame. It does so by reducing the count by
+ * 1 and left shifting all the entries after the deleted entry.
+ * ext4_remove_dx_entry() does NOT mark dx_frame dirty, that's left to the
+ * caller. The reason for doing this is that this function removes entry from
+ * a dx_node and can potentially leave dx_node with no entries. Since older
+ * kernels don't allow dx_parent nodes to have 0 count, the caller should
+ * only mark buffer dirty when we are sure that we won't leave dx_node
+ * with 0 count.
+ */
+int
+ext4_remove_dx_entry(handle_t *handle, struct inode *dir,
+ struct dx_frame *dx_frame)
+{
+ struct dx_entry *entries = dx_frame->entries;
+ int err, i = 0;
+ unsigned int count = dx_get_count(entries);
+
+ for (i = 0; i < count; i++)
+ if (entries[i].block == dx_frame->at->block)
+ break;
+ if (i >= count)
+ return -EINVAL;
+
+ err = ext4_journal_get_write_access(handle, dx_frame->bh);
+ if (err)
+ goto out_err;
+
+ if (i == 0) {
+ entries[0].block = entries[1].block;
+ i++;
+ }
+ if (i < count - 1)
+ memmove(&entries[i], &entries[i + 1], (count - i - 1) *
+ sizeof(struct dx_entry));
+ dx_set_count(entries, count - 1);
+
+ return 0;
+out_err:
+ ext4_std_error(dir->i_sb, err);
+ return -EINVAL;
+
+}
+
static void dx_release(struct dx_frame *frames)
{
struct dx_root_info *info;
@@ -2453,6 +2589,24 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
return err;
}
+/*
+ * Checks if dirent block is empty. If de is passed, we
+ * skip directory entries before de in the block.
+ */
+static inline bool is_empty_dirent_block(struct inode *dir,
+ struct buffer_head *bh,
+ struct ext4_dir_entry_2 *de)
+{
+ if (!de)
+ de = (struct ext4_dir_entry_2 *)bh->b_data;
+ while ((u8 *)de < (u8 *)bh->b_data + dir->i_sb->s_blocksize) {
+ if (de->inode)
+ return false;
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
+ }
+ return true;
+}
+
/*
* ext4_generic_delete_entry deletes a directory entry by merging it
* with the previous entry
@@ -2463,7 +2617,8 @@ int ext4_generic_delete_entry(handle_t *handle,
struct buffer_head *bh,
void *entry_buf,
int buf_size,
- int csum_size)
+ int csum_size,
+ bool *empty)
{
struct ext4_dir_entry_2 *de, *pde;
unsigned int blocksize = dir->i_sb->s_blocksize;
@@ -2472,6 +2627,8 @@ int ext4_generic_delete_entry(handle_t *handle,
i = 0;
pde = NULL;
de = (struct ext4_dir_entry_2 *)entry_buf;
+ if (empty)
+ *empty = false;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size, i))
@@ -2486,6 +2643,9 @@ int ext4_generic_delete_entry(handle_t *handle,
blocksize);
else
de->inode = 0;
+ if (empty)
+ *empty = is_empty_dirent_block(dir, bh,
+ pde ? pde : de);
inode_inc_iversion(dir);
return 0;
}
@@ -2496,12 +2656,183 @@ int ext4_generic_delete_entry(handle_t *handle,
return -ENOENT;
}
+static void make_unindexed(handle_t *handle, struct inode *dir,
+ struct buffer_head *bh)
+{
+ struct ext4_dir_entry_2 *de;
+ int parent_ino, csum_size = 0;
+
+ de = (struct ext4_dir_entry_2 *)bh->b_data;
+ parent_ino = le32_to_cpu(
+ ((struct dx_root *)bh->b_data)->dotdot.inode);
+ if (ext4_has_metadata_csum(dir->i_sb))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+ memset(bh->b_data, 0, dir->i_sb->s_blocksize);
+ ext4_init_dot_dotdot(dir, de, dir->i_sb->s_blocksize, csum_size,
+ parent_ino, 0);
+ if (csum_size)
+ ext4_initialize_dirent_tail(bh, dir->i_sb->s_blocksize);
+ ext4_handle_dirty_dirblock(handle, dir, bh);
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
+ ext4_mark_inode_dirty(handle, dir);
+}
+
+/*
+ * Copy contents from lblk_src to lblk_dst and remap internal dx nodes
+ * such that parent of lblk_src points to lblk_dst.
+ */
+static int ext4_dx_remap_block(handle_t *handle, struct inode *dir,
+ struct buffer_head *bh_dst,
+ ext4_lblk_t lblk_dst, ext4_lblk_t lblk_src)
+{
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame_ptr;
+ struct buffer_head *bh_src;
+ int ret;
+
+ bh_src = ext4_bread(handle, dir, lblk_src, 0);
+ if (IS_ERR_OR_NULL(bh_src))
+ return -EIO;
+
+ memset(frames, 0, sizeof(frames));
+
+ /*
+ * blk_src can either be a dirent block or dx node. Try to search
+ * using both and use whichever one that returns success.
+ */
+ frame_ptr = dx_probe_dirent_blk(dir, frames, bh_src, lblk_src);
+ if (IS_ERR_OR_NULL(frame_ptr)) {
+ frame_ptr = dx_probe_dx_node(dir, frames, bh_src, lblk_src);
+ if (IS_ERR_OR_NULL(frame_ptr)) {
+ brelse(bh_src);
+ return PTR_ERR(frame_ptr);
+ }
+ }
+ WARN_ON(dx_get_block(frame_ptr->at) != lblk_src);
+ memcpy(bh_dst->b_data, bh_src->b_data, bh_dst->b_size);
+ dx_set_block(frame_ptr->at, lblk_dst);
+
+ /* Set pointer in bh_last_parent to bh */
+ ret = ext4_journal_get_write_access(handle, bh_src);
+ if (ret)
+ goto out;
+
+ ret = ext4_journal_get_write_access(handle, frame_ptr->bh);
+ if (ret)
+ goto out;
+
+ ret = ext4_handle_dirty_metadata(handle, dir, bh_src);
+ if (ret) {
+ ext4_std_error(dir->i_sb, ret);
+ goto out;
+ }
+
+ ret = ext4_handle_dirty_dx_node(handle, dir, frame_ptr->bh);
+ if (ret) {
+ ext4_std_error(dir->i_sb, ret);
+ goto out;
+ }
+out:
+ brelse(bh_src);
+ dx_release(frames);
+ return ret;
+}
+
+/*
+ * Try to shrink directory as much as possible. This function
+ * truncates directory to the new size.
+ *
+ * The high level algorithm is as follows:
+ *
+ * - If after dentry removal the dirent block (let's call it B) becomes
+ * empty, then remove its references in its dx parent.
+ * - Swap its contents with that of the last block (L) in directory.
+ * - Update L's parents to point to B instead.
+ * - Remove L
+ * - Repeat this for all the ancestors of B.
+ */
+static int ext4_try_dir_shrink(handle_t *handle, struct inode *dir,
+ ext4_lblk_t lblk, struct buffer_head *bh)
+{
+ struct buffer_head *bh_last = NULL, *parent_bh = NULL;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame_ptr;
+ ext4_lblk_t shrink = 0, last_lblk, parent_lblk;
+ int ret = 0;
+ bool check_empty = true;
+
+ memset(frames, 0, sizeof(frames));
+ frame_ptr = dx_probe_dirent_blk(dir, frames, bh, lblk);
+ if (IS_ERR(frame_ptr))
+ return -EINVAL;
+
+ last_lblk = (dir->i_size >> dir->i_sb->s_blocksize_bits) - 1;
+
+ while (check_empty && frame_ptr >= frames) {
+ parent_lblk = frame_ptr > frames ?
+ dx_get_block((frame_ptr - 1)->at) : 0;
+ lblk = dx_get_block(frame_ptr->at);
+ if (ext4_journal_extend(
+ handle, EXT4_DIR_SHRINK_TRANS_BLOCKS, 0) < 0)
+ break;
+ if (ext4_remove_dx_entry(handle, dir, frame_ptr) < 0) {
+ dxtrace(pr_warn("remove dx entry failed\n"));
+ break;
+ }
+ check_empty = dx_get_count(frame_ptr->entries) == 0;
+ if (lblk != last_lblk) {
+ ret = ext4_dx_remap_block(handle, dir, bh, lblk,
+ last_lblk);
+ if (ret) {
+ dxtrace(pr_warn("remap failed\n"));
+ break;
+ }
+ }
+ ret = ext4_handle_dirty_dx_node(handle, dir, frame_ptr->bh);
+ if (ret)
+ break;
+ if (parent_lblk == last_lblk)
+ parent_lblk = lblk;
+ last_lblk--;
+ shrink++;
+
+ if (parent_lblk == 0)
+ break;
+ if (!IS_ERR_OR_NULL(parent_bh))
+ brelse(parent_bh);
+ parent_bh = ext4_bread(handle, dir, parent_lblk, 0);
+ if (IS_ERR_OR_NULL(parent_bh))
+ break;
+ dx_release(frames);
+ frame_ptr = dx_probe_dx_node(dir, frames, parent_bh,
+ parent_lblk);
+ if (IS_ERR(frame_ptr))
+ break;
+ bh = parent_bh;
+ }
+
+ /* Fallback to linear directories if root node becomes empty */
+ if (dx_get_count(frames[0].entries) == 0)
+ make_unindexed(handle, dir, frames[0].bh);
+ if (!IS_ERR(frame_ptr))
+ dx_release(frames);
+ if (!IS_ERR_OR_NULL(bh_last))
+ brelse(bh_last);
+ if (!IS_ERR_OR_NULL(parent_bh))
+ brelse(parent_bh);
+
+ if (ret || !shrink)
+ return ret;
+
+ dir->i_size -= shrink * dir->i_sb->s_blocksize;
+ return ext4_truncate(dir);
+}
+
static int ext4_delete_entry(handle_t *handle,
struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh, ext4_lblk_t lblk)
{
int err, csum_size = 0;
+ bool block_empty = false;
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
@@ -2521,7 +2852,8 @@ static int ext4_delete_entry(handle_t *handle,
err = ext4_generic_delete_entry(handle, dir, de_del,
bh, bh->b_data,
- dir->i_sb->s_blocksize, csum_size);
+ dir->i_sb->s_blocksize, csum_size,
+ &block_empty);
if (err)
goto out;
@@ -2530,6 +2862,9 @@ static int ext4_delete_entry(handle_t *handle,
if (unlikely(err))
goto out;
+ if (block_empty && is_dx(dir))
+ ext4_try_dir_shrink(handle, dir, lblk, bh);
+
return 0;
out:
if (err != -ENOENT)
--
2.26.0.292.g33ef6b2f38-goog
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty
2020-04-07 6:46 [PATCH v2 1/3] ext4: return lblk from ext4_find_entry Harshad Shirwadkar
2020-04-07 6:46 ` [PATCH v2 2/3] ext4: shrink directories on dentry delete Harshad Shirwadkar
@ 2020-04-07 6:46 ` Harshad Shirwadkar
2020-04-07 17:21 ` Andreas Dilger
[not found] ` <c7a41ba13a3551fca25d7498b9d4542a104fac74.camel@gmail.com>
1 sibling, 2 replies; 13+ messages in thread
From: Harshad Shirwadkar @ 2020-04-07 6:46 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger, Harshad Shirwadkar
The new function added in this patchset adds an efficient way to
check if a dirent block is empty. Based on that, reimplement
ext4_empty_dir().
This is a new patch added in V2.
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
fs/ext4/namei.c | 39 ++++++++++++++++-----------------------
1 file changed, 16 insertions(+), 23 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 39360c442dad..dae7d15fba5c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3179,6 +3179,7 @@ bool ext4_empty_dir(struct inode *inode)
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
+ ext4_lblk_t lblk;
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
@@ -3218,34 +3219,26 @@ bool ext4_empty_dir(struct inode *inode)
brelse(bh);
return true;
}
- offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
- while (offset < inode->i_size) {
- if (!(offset & (sb->s_blocksize - 1))) {
- unsigned int lblock;
- brelse(bh);
- lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- bh = ext4_read_dirblock(inode, lblock, EITHER);
- if (bh == NULL) {
- offset += sb->s_blocksize;
- continue;
- }
- if (IS_ERR(bh))
- return true;
- }
- de = (struct ext4_dir_entry_2 *) (bh->b_data +
- (offset & (sb->s_blocksize - 1)));
- if (ext4_check_dir_entry(inode, NULL, de, bh,
- bh->b_data, bh->b_size, offset)) {
- offset = (offset | (sb->s_blocksize - 1)) + 1;
+ de = ext4_next_entry(de, sb->s_blocksize);
+ if (!is_empty_dirent_block(inode, bh, de)) {
+ brelse(bh);
+ return false;
+ }
+ brelse(bh);
+
+ for (lblk = 1; lblk < inode->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+ lblk++) {
+ bh = ext4_read_dirblock(inode, lblk, EITHER);
+ if (bh == NULL)
continue;
- }
- if (le32_to_cpu(de->inode)) {
+ if (IS_ERR(bh))
+ return true;
+ if (!is_empty_dirent_block(inode, bh, NULL)) {
brelse(bh);
return false;
}
- offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+ brelse(bh);
}
- brelse(bh);
return true;
}
--
2.26.0.292.g33ef6b2f38-goog
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH v2 2/3] ext4: shrink directories on dentry delete
2020-04-07 6:46 ` [PATCH v2 2/3] ext4: shrink directories on dentry delete Harshad Shirwadkar
@ 2020-04-07 16:51 ` Andreas Dilger
0 siblings, 0 replies; 13+ messages in thread
From: Andreas Dilger @ 2020-04-07 16:51 UTC (permalink / raw)
To: Harshad Shirwadkar; +Cc: linux-ext4
[-- Attachment #1: Type: text/plain, Size: 6587 bytes --]
On Apr 7, 2020, at 12:46 AM, Harshad Shirwadkar <harshadshirwadkar@gmail.com> wrote:
>
> This patch adds shrinking support for htree based directories. The
> high level algorithm is as follows:
>
> * If after dentry removal the dirent block (let's call it B) becomes
> empty, then remove its references in its dx parent.
> * Swap its contents with that of the last block (L) in directory.
> * Update L's parents to point to B instead.
> * Remove L
> * Repeat this for all the ancestors of B.
>
> We add variants of dx_probe that allow us perform reverse lookups from
> a logical block to its dx parents.
>
> Ran kvm-xfstests smoke and verified that no new failures are
> introduced. Ran shrinking for directories with following number of
> files and then deleted files one by one:
> * 1000 (size before deletion 36K, after deletion 4K)
> * 10000 (size before deletion 196K, after deletion 4K)
> * 100000 (size before deletion 2.1M, after deletion 4K)
> * 200000 (size before deletion 4.2M, after deletion 4K)
>
> In all cases directory shrunk significantly. We fallback to linear
> directories if the directory becomes empty.
>
> But note that most of the shrinking happens during last 1-2% deletions
> in an average case. Therefore, the next step here is to merge dx nodes
> when possible. That can be achieved by storing the fullness index in
> htree nodes.
>
> Performance Testing:
> -------------------
>
> Created 1 million files and unlinked all of them. Did this with and without
> directory shrinking. Journalling was on. Used ftrace to measure time
> spent in ext4_unlink:
>
> * Without directory shrinking
>
> Size Before: 22M /mnt/1
> 1000000 files created and deleted.
> Average Total Elapsed Time (across 3 runs): 43.787s
> Size After: 22M /mnt/1
>
> useconds #count
> ------------------------
> 0-9 3690
> 10-19 2790131
> 20-29 179209
> 30-39 14270
> 40-49 7981
> 50-59 2212
> 60-69 1132
> 70-79 703
> 80-89 403
> 90-99 274
>
> Num samples > 40us: 12615
>
> * With directory shrinking
>
> Size Before: 22M /mnt/1
> 1000000 files created and deleted.
> Average Total Elapsed Time(across 3 runs): 44.230s
> Size After: 4.0K /mnt/1
>
> useconds #count
> -----------------------
> 0-9 3316
> 10-19 2786451
> 20-29 172015
> 30-39 13259
> 40-49 17847
> 50-59 4843
> 60-69 924
> 70-79 569
> 80-89 390
> 90-99 389
>
> Num samples > 40us: 24962
>
> We see doubled number of samples of >40us in case of directory shrinking.
> Because these constitute to < 1% of total samples, the overall effect of
> direcotry shrinking on unlink / rmdir performance is negligible. There
> was no notable difference in cpu usage.
>
> This patch supersedes the other directory shrinking patch sent in Aug
> 2019 ("ext4: attempt to shrink directory on dentry removal").
>
> Changes since V1:
> * ext4_remove_dx_entry(), dx_probe_dx_node() fixes
> * dx_probe_dirent_blk() continuation fix
> * Added performance evaluation
>
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Some *very* minor cleanups *iff* the patch is refreshed, but otherwise LGTM.
Thanks also for the good performance numbers, that makes it pretty clear
there is very little downside to enabling this all the time.
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
> ---
> fs/ext4/ext4.h | 3 +-
> fs/ext4/ext4_jbd2.h | 7 +
> fs/ext4/inline.c | 2 +-
> fs/ext4/namei.c | 357 ++++++++++++++++++++++++++++++++++++++++++--
> 4 files changed, 356 insertions(+), 13 deletions(-)
>
> @@ -820,7 +826,9 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
> dxtrace(printk("Look up %x", hash));
> while (1) {
> count = dx_get_count(entries);
> - if (!count || count > dx_get_limit(entries)) {
> + /* If we are called from the shrink path */
> + if (count > dx_get_limit(entries) ||
> + (strict && !count)) {
(minor) I was wondering if (strict && !count) should be kept first, since it
is a simpler check, but then realized this whole check should be marked
unlikely(), since it should never happen unless the filesystem is corrupted.
> + * This function tries to remove the entry of a dirent block (which was just
> + * emptied by the caller) from the dx frame. It does so by reducing the count by
(very minor) could move "by" to next line to balance line length... :-)
> + * 1 and left shifting all the entries after the deleted entry.
> + * ext4_remove_dx_entry() does NOT mark dx_frame dirty, that's left to the
> + * caller. The reason for doing this is that this function removes entry from
> + * a dx_node and can potentially leave dx_node with no entries. Since older
> + * kernels don't allow dx_parent nodes to have 0 count, the caller should
> + * only mark buffer dirty when we are sure that we won't leave dx_node
> + * with 0 count.
> + */
> +int
> +ext4_remove_dx_entry(handle_t *handle, struct inode *dir,
(style) normally the return type is on the same line as the function
declaration, unless (like dx_probe()) it is so long as to consume a
lot of space. That isn't the case here.
> +/*
> + * Checks if dirent block is empty. If de is passed, we
> + * skip directory entries before de in the block.
> + */
> +static inline bool is_empty_dirent_block(struct inode *dir,
> + struct buffer_head *bh,
> + struct ext4_dir_entry_2 *de)
> +{
> + if (!de)
> + de = (struct ext4_dir_entry_2 *)bh->b_data;
> + while ((u8 *)de < (u8 *)bh->b_data + dir->i_sb->s_blocksize) {
> + if (de->inode)
> + return false;
> + de = ext4_next_entry(de, dir->i_sb->s_blocksize);
> + }
> + return true;
> +}
> +
> @@ -2486,6 +2643,9 @@ int ext4_generic_delete_entry(handle_t *handle,
> blocksize);
> else
> de->inode = 0;
> + if (empty)
> + *empty = is_empty_dirent_block(dir, bh,
> + pde ? pde : de);
Nice. I was going to say that this shouldn't re-check the whole block,
but it doesn't do that. is_empty_dirent_block() only checks entries after
the removed entry, exits early if any non-empty dirent is found, and only
checks if we care whether the block is empty or not, and which is ideal.
Cheers, Andreas
[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty
2020-04-07 6:46 ` [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty Harshad Shirwadkar
@ 2020-04-07 17:21 ` Andreas Dilger
2020-04-08 9:31 ` Jan Kara
[not found] ` <c7a41ba13a3551fca25d7498b9d4542a104fac74.camel@gmail.com>
1 sibling, 1 reply; 13+ messages in thread
From: Andreas Dilger @ 2020-04-07 17:21 UTC (permalink / raw)
To: Harshad Shirwadkar, Jan Kara; +Cc: linux-ext4
[-- Attachment #1: Type: text/plain, Size: 2200 bytes --]
On Apr 7, 2020, at 12:46 AM, Harshad Shirwadkar <harshadshirwadkar@gmail.com> wrote:
>
> The new function added in this patchset adds an efficient way to
> check if a dirent block is empty. Based on that, reimplement
> ext4_empty_dir().
>
> This is a new patch added in V2.
>
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Again, a very minor style change iff patch is refreshed.
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
While looking at this code, I noticed that ext4_empty_dir() considers a
directory without a "." or ".." entry to be empty. I see this was changed
in 64d4ce8923 ("ext4: fix ext4_empty_dir() for directories with holes").
I can understand that we want to not die on corrupted leaf blocks, but it
isn't clear to me that it is a good idea to allow deleting an entire
directory tree if the first block has an error (missing "." or ".." as the
first and second entries) but is otherwise valid. There were definitely
bugs in the past that made "." or ".." not be the first and second entries.
That isn't a problem in this patch, but seems dangerous in general. It
doesn't seem like the directory shrinking would trigger this case, since it
is checking for individual empty blocks rather than using ext4_empty_dir().
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> @@ -3218,34 +3219,26 @@ bool ext4_empty_dir(struct inode *inode)
> brelse(bh);
> return true;
> }
> + de = ext4_next_entry(de, sb->s_blocksize);
> + if (!is_empty_dirent_block(inode, bh, de)) {
> + brelse(bh);
> + return false;
> + }
> + brelse(bh);
> +
> + for (lblk = 1; lblk < inode->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
> + lblk++) {
(style) should be aligned after '(' on previous line
> + bh = ext4_read_dirblock(inode, lblk, EITHER);
> + if (bh == NULL)
> continue;
> - }
> - if (le32_to_cpu(de->inode)) {
> + if (IS_ERR(bh))
> + return true;
> + if (!is_empty_dirent_block(inode, bh, NULL)) {
> brelse(bh);
> return false;
> }
> - offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
> + brelse(bh);
> }
> - brelse(bh);
> return true;
> }
Cheers, Andreas
[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty
2020-04-07 17:21 ` Andreas Dilger
@ 2020-04-08 9:31 ` Jan Kara
0 siblings, 0 replies; 13+ messages in thread
From: Jan Kara @ 2020-04-08 9:31 UTC (permalink / raw)
To: Andreas Dilger; +Cc: Harshad Shirwadkar, Jan Kara, linux-ext4
On Tue 07-04-20 11:21:01, Andreas Dilger wrote:
> While looking at this code, I noticed that ext4_empty_dir() considers a
> directory without a "." or ".." entry to be empty. I see this was changed
> in 64d4ce8923 ("ext4: fix ext4_empty_dir() for directories with holes").
> I can understand that we want to not die on corrupted leaf blocks, but it
> isn't clear to me that it is a good idea to allow deleting an entire
> directory tree if the first block has an error (missing "." or ".." as the
> first and second entries) but is otherwise valid. There were definitely
> bugs in the past that made "." or ".." not be the first and second entries.
That's a good question. I'd just say that ext4_empty_dir() generally
returns true when there's some problem with the directory. In commit
64d4ce8923 I just followed that convention. This behavior of ext4_empty_dir()
(and empty_dir() before in ext3) dates back at least to the beginning of
git history... I guess we could err on the safer side and disallow
directory deletion if there is any problem with it but I guess there was
some motivation for this behavior in the past? Maybe somebody remembers?
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty
[not found] ` <c7a41ba13a3551fca25d7498b9d4542a104fac74.camel@gmail.com>
@ 2025-09-27 8:11 ` Julian Sun
2025-09-28 3:46 ` Theodore Ts'o
0 siblings, 1 reply; 13+ messages in thread
From: Julian Sun @ 2025-09-27 8:11 UTC (permalink / raw)
To: Harshad Shirwadkar; +Cc: adilger, jack, tytso, Ext4 Developers List
On Sat, Sep 27, 2025 at 4:05 PM Julian Sun <sunjunchao2870@gmail.com> wrote:
>
> On Mon, 2020-04-06 at 23:46 -0700, Harshad Shirwadkar wrote:
> > The new function added in this patchset adds an efficient way to
> > check if a dirent block is empty. Based on that, reimplement
> > ext4_empty_dir().
> >
> > This is a new patch added in V2.
> >
> > Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> > Reviewed-by: Andreas Dilger <adilger@dilger.ca>
> > ---
> > fs/ext4/namei.c | 39 ++++++++++++++++-----------------------
> > 1 file changed, 16 insertions(+), 23 deletions(-)
> >
> > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> > index 39360c442dad..dae7d15fba5c 100644
> > --- a/fs/ext4/namei.c
> > +++ b/fs/ext4/namei.c
> > @@ -3179,6 +3179,7 @@ bool ext4_empty_dir(struct inode *inode)
> > struct buffer_head *bh;
> > struct ext4_dir_entry_2 *de;
> > struct super_block *sb;
> > + ext4_lblk_t lblk;
> >
> > if (ext4_has_inline_data(inode)) {
> > int has_inline_data = 1;
> > @@ -3218,34 +3219,26 @@ bool ext4_empty_dir(struct inode *inode)
> > brelse(bh);
> > return true;
> > }
> > - offset += ext4_rec_len_from_disk(de->rec_len, sb-
> > > s_blocksize);
> > - while (offset < inode->i_size) {
> > - if (!(offset & (sb->s_blocksize - 1))) {
> > - unsigned int lblock;
> > - brelse(bh);
> > - lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
> > - bh = ext4_read_dirblock(inode, lblock,
> > EITHER);
> > - if (bh == NULL) {
> > - offset += sb->s_blocksize;
> > - continue;
> > - }
> > - if (IS_ERR(bh))
> > - return true;
> > - }
> > - de = (struct ext4_dir_entry_2 *) (bh->b_data +
> > - (offset & (sb->s_blocksize -
> > 1)));
> > - if (ext4_check_dir_entry(inode, NULL, de, bh,
> > - bh->b_data, bh->b_size,
> > offset)) {
> > - offset = (offset | (sb->s_blocksize - 1)) +
> > 1;
> > + de = ext4_next_entry(de, sb->s_blocksize);
> > + if (!is_empty_dirent_block(inode, bh, de)) {
> > + brelse(bh);
> > + return false;
> > + }
> > + brelse(bh);
> > +
> > + for (lblk = 1; lblk < inode->i_size >>
> > EXT4_BLOCK_SIZE_BITS(sb);
> > + lblk++) {
> > + bh = ext4_read_dirblock(inode, lblk, EITHER);
> > + if (bh == NULL)
> > continue;
> > - }
> > - if (le32_to_cpu(de->inode)) {
> > + if (IS_ERR(bh))
> > + return true;
> > + if (!is_empty_dirent_block(inode, bh, NULL)) {
> > brelse(bh);
> > return false;
> > }
> > - offset += ext4_rec_len_from_disk(de->rec_len, sb-
> > > s_blocksize);
> > + brelse(bh);
> > }
> > - brelse(bh);
> > return true;
> > }
> >
>
> Hi,
>
> I’ve recently been looking into the ext4 directory shrinking problem
> and was considering trying to add this feature myself. To my surprise,
> I found that this patch set had already implemented it and even
> received Reviewed-by. I’m curious whether it was never merged, or if it
> was merged and later reverted?
>
> If possible, is there anything I could do to contribute to moving this
> patch set forward toward being merged?
cc ext4 mail list.
>
> Thanks.
>
>
--
Julian Sun <sunjunchao2870@gmail.com>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty
2025-09-27 8:11 ` Julian Sun
@ 2025-09-28 3:46 ` Theodore Ts'o
2025-09-28 6:51 ` Julian Sun
0 siblings, 1 reply; 13+ messages in thread
From: Theodore Ts'o @ 2025-09-28 3:46 UTC (permalink / raw)
To: Julian Sun; +Cc: Harshad Shirwadkar, adilger, jack, Ext4 Developers List
On Sat, Sep 27, 2025 at 04:11:54PM +0800, Julian Sun wrote:
> >
> > I’ve recently been looking into the ext4 directory shrinking problem
> > and was considering trying to add this feature myself. To my surprise,
> > I found that this patch set had already implemented it and even
> > received Reviewed-by. I’m curious whether it was never merged, or if it
> > was merged and later reverted?
> >
> > If possible, is there anything I could do to contribute to moving this
> > patch set forward toward being merged?
I *think* there was one or two test regressions that Harshad was
wrking on, but real problem was the original business imperative for
the project became no longer as compelling, and we moved to focus on
other priorities.
So if you'd like tocontribute to moving this forward, what we'd need
to do is to forward port the patch set to the latest kernel. I've
taken a quick look at the patches, and predates the addition of the
support of 3-level htrees (the incompat_largedir feature). There are
also some hardening against maliciously fuzzed file systems that will
prevent the patches from applying cleanly.
Then we'd need to run regression tests on a variety of different ext4
configurations to see if there are any regressions, and if so, they
would need to be fixed.
Also, please note that this first set of changes doesn't really make a
big difference for real-world use casses, since a directory block
won't get dropped when it is completely empty. For example, if we
assume an average directory entry size of 32, there can be up to 128
entries in a 4k block. If we assume that the average leaf block is
75% filled, there will be 96 directory entries. All 96 directory
entries have to be deleted before that block can be removed. If the
directory is 4MB, there will be roughly 100,000 directory entries and
1024 blocks. If we assume a random distribution and random deletion
(which is a fair assumption given that we're using a hash of the file
name). I will leave it as an exercise to the reader what percentage
of directory entries need to be deleted before the probability that at
least one 4k directory block is emptied is at least, say, 80%. But in
practice, you have to delete most of the files in the directory before
the directory starts shrinking.
So this this is why we really need to implement the next step (which
is not in this patch series), and that is to merging two adjacent leaf
blocks once they fall below to some threshold --- say, 25%. We will
also need to merge two adjacent index nodes if they are mostly empty.
Cheers,
- Ted
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty
2025-09-28 3:46 ` Theodore Ts'o
@ 2025-09-28 6:51 ` Julian Sun
2025-09-28 21:02 ` Theodore Ts'o
0 siblings, 1 reply; 13+ messages in thread
From: Julian Sun @ 2025-09-28 6:51 UTC (permalink / raw)
To: Theodore Ts'o; +Cc: Harshad Shirwadkar, adilger, jack, Ext4 Developers List
Hi,
On Sun, Sep 28, 2025 at 11:46 AM Theodore Ts'o <tytso@mit.edu> wrote:
>
> On Sat, Sep 27, 2025 at 04:11:54PM +0800, Julian Sun wrote:
> > >
> > > I’ve recently been looking into the ext4 directory shrinking problem
> > > and was considering trying to add this feature myself. To my surprise,
> > > I found that this patch set had already implemented it and even
> > > received Reviewed-by. I’m curious whether it was never merged, or if it
> > > was merged and later reverted?
> > >
> > > If possible, is there anything I could do to contribute to moving this
> > > patch set forward toward being merged?
>
> I *think* there was one or two test regressions that Harshad was
> wrking on, but real problem was the original business imperative for
> the project became no longer as compelling, and we moved to focus on
> other priorities.
>
> So if you'd like tocontribute to moving this forward, what we'd need
> to do is to forward port the patch set to the latest kernel. I've
> taken a quick look at the patches, and predates the addition of the
> support of 3-level htrees (the incompat_largedir feature).
Emm. I checked the code and found that support for 3-level htrees was
added in 2017 via commit e08ac99fa2a2 ("ext4: add largedir feature"),
but this patch was submitted in 2020. Did I make a mistake somewhere?
> There are
> also some hardening against maliciously fuzzed file systems that will
> prevent the patches from applying cleanly.
Is this included in the xfstests test suite?
> Then we'd need to run regression tests on a variety of different ext4
> configurations to see if there are any regressions, and if so, they
> would need to be fixed.
Is testing with xfstests sufficient? Or are there any other test
suites that can be used to test this patch set?
>
> Also, please note that this first set of changes doesn't really make a
> big difference for real-world use casses, since a directory block
> won't get dropped when it is completely empty. For example, if we
> assume an average directory entry size of 32, there can be up to 128
> entries in a 4k block. If we assume that the average leaf block is
> 75% filled, there will be 96 directory entries. All 96 directory
> entries have to be deleted before that block can be removed. If the
> directory is 4MB, there will be roughly 100,000 directory entries and
> 1024 blocks. If we assume a random distribution and random deletion
> (which is a fair assumption given that we're using a hash of the file
> name). I will leave it as an exercise to the reader what percentage
> of directory entries need to be deleted before the probability that at
> least one 4k directory block is emptied is at least, say, 80%. But in
> practice, you have to delete most of the files in the directory before
> the directory starts shrinking.
Yes, I think the biggest beneficiary is rm -rf-type workloads.
>
> So this this is why we really need to implement the next step (which
> is not in this patch series), and that is to merging two adjacent leaf
> blocks once they fall below to some threshold --- say, 25%. We will
> also need to merge two adjacent index nodes if they are mostly empty.
Sounds great. Thanks for your kind and detailed explanation, Ted.
>
> Cheers,
>
> - Ted
Thanks,
--
Julian Sun <sunjunchao2870@gmail.com>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty
2025-09-28 6:51 ` Julian Sun
@ 2025-09-28 21:02 ` Theodore Ts'o
2025-09-28 21:23 ` [PATCH 1/2] ext4: return lblk from ext4_find_entry Theodore Ts'o
2025-09-29 2:33 ` [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty Julian Sun
0 siblings, 2 replies; 13+ messages in thread
From: Theodore Ts'o @ 2025-09-28 21:02 UTC (permalink / raw)
To: Julian Sun; +Cc: Harshad Shirwadkar, adilger, jack, Ext4 Developers List
On Sun, Sep 28, 2025 at 02:51:09PM +0800, Julian Sun wrote:
> Emm. I checked the code and found that support for 3-level htrees was
> added in 2017 via commit e08ac99fa2a2 ("ext4: add largedir feature"),
> but this patch was submitted in 2020. Did I make a mistake somewhere?
I made that statement when I tried forward porting Harshad's patches
to 6.17-rc4, and one of the patch conflicts seemed to implyyy that
dx_probe predated the3-level htree change. But I could have been
mistaken about why the patch conflict eixsts.
> > There are
> > also some hardening against maliciously fuzzed file systems that will
> > prevent the patches from applying cleanly.
>
> Is this included in the xfstests test suite?
For better or for worse, no. The reason for this is that xfstests
tends to avoid using pre-generated file systems becase they aren't
portable to different file system types. And in order to test
deliberately corrupted, you generally need to include small corrupted
file systems into xfstests. You *could* try to create them as an
ext4-specific test using debugfs to introduce the corruption, but
that's quite painful, and in some cases the only way to corrupt the
file system in that very specific way would require a hex editor.
That being said, it's generally pretty obvious when bullet-proofing
code has been added and it causes a merge conflict. For example, I
skipped forward-porting the third patch, "ext4: reimplement
ext4_empty_dir() using is_dirent_block_empty" because there was some
bullet-proofing code added in ext4_empty_dir() which caused the patch
application to fail, and I was too third try to deal with it, and it
strictly speaking wasn't necessary for the patch shrinking
functionality. That being said, the bullet proofing which was added
to ext4_empty_dir(), *should* be added to the newly created
i_dirent_block_empty() function introduced in the 2/3 patch in this
series.
> > Then we'd need to run regression tests on a variety of different ext4
> > configurations to see if there are any regressions, and if so, they
> > would need to be fixed.
>
> Is testing with xfstests sufficient? Or are there any other test
> suites that can be used to test this patch set?
Testing with xfstests are sufficient, but we need to test multiple
file system configurations, but just the default "ext4 using a 4k
blocksize". So I tried to do a quick-and-dirty forward port of the
patches, and they compiled and passed the 15-20 minute smoke test
(running "kvm-xfstests smoke", or "gce-xfstests smoke"). But when I
ran the full set of tests, this is what I found.
ext4/4k: 594 tests, 4 failures, 65 skipped, 5196 seconds
Failures: generic/426 generic/756
Flaky: generic/041: 20% (1/5) generic/049: 20% (1/5)
ext4/1k: 588 tests, 3 failures, 69 skipped, 6306 seconds
Failures: generic/426 generic/756
Flaky: generic/043: 20% (1/5)
ext4/ext3: 586 tests, 3 failures, 149 skipped, 4958 seconds
Flaky: generic/041: 40% (2/5) generic/426: 60% (3/5)
generic/756: 60% (3/5)
ext4/encrypt: 569 tests, 3 failures, 181 skipped, 3263 seconds
Failures: generic/426 generic/756
Flaky: generic/049: 40% (2/5)
ext4/nojournal: 586 tests, 3 failures, 137 skipped, 3918 seconds
Failures: ext4/045 generic/426 generic/756
ext4/ext3conv: 591 tests, 4 failures, 67 skipped, 5408 seconds
Failures: generic/426 generic/756
Flaky: ext4/045: 40% (2/5) generic/040: 60% (3/5)
ext4/adv: 587 tests, 10 failures, 73 skipped, 5487 seconds
Failures: [generic/347] generic/426 generic/756 [generic/757] [generic/764]
[generic/777]
Flaky: generic/047: 40% (2/5) generic/475: 20% (1/5)
[generic/482: 40% (2/5)] generic/547: 20% (1/5)
ext4/dioread_nolock: 592 tests, 3 failures, 65 skipped, 4996 seconds
Failures: generic/426 generic/756
Flaky: generic/047: 40% (2/5)
ext4/data_journal: 587 tests, 6 failures, 141 skipped, 5151 seconds
Failures: [generic/127] generic/426 generic/756
Flaky: generic/049: 60% (3/5) generic/475: 60% (3/5)
generic/645: 40% (2/5)
ext4/bigalloc_4k: 565 tests, 4 failures, 68 skipped, 5066 seconds
Failures: ext4/045
Flaky: generic/320: 60% (3/5) generic/426: 60% (3/5)
generic/756: 60% (3/5)
ext4/bigalloc_1k: 566 tests, 3 failures, 79 skipped, 5096 seconds
Failures: generic/426 generic/756
Flaky: generic/049: 20% (1/5)
ext4/dax: 578 tests, 5 failures, 170 skipped, 3198 seconds
Failures: [generic/344] [generic/363] generic/426 generic/756
Flaky: generic/043: 20% (1/5)
Totals: 7193 tests, 1264 skipped, 190 failures, 0 errors, 53862s
(Tests in square brackets were failing with stock 6.17-rc4, so don't
represent regressions. Ignore them for the purposes of trying to fix
up this patch set.)
Now, some of thsee may be failures caused by my making a mistake when
doing the forward port. Basically, I bashed the patches until they
built; and then I ran the smoke test; and then I kicked of the full
test run, which takes about 2.5 hours using a dozen VM's.
I don't have time to take this any further, so with your permission,
if you're OK with my handing fiishing off this project to you, I'll
send the patches as a reply to this message, and then "Your mission,
should you use to accept it" (quoting from the "Mission Impossible" TV
Show / Movie) would be:
1) Investigate the failures deailed above, and fix them. Again, some
of these failures may not be Harshads; it could be mine when I did the
forward port. Either way, we need to fix them before we can merge the
changes upstream.
2) Do more cleanups on the patches as necessary. While doing the
forward port, I noted the following issues that should really be
fixed. You might find more things that need improvement; if so,
please go for it!
2a) Although we are potentially modifying more metadata blocks in
ext4_try_dir_shrink(), the number of journal credits requested by
callers to ext4_delete_entry() were not increased. This could result
in the handle running out of credits, which will trigger a warning,
and if the transaction runs out of space, could trigger a journal
abort. One way would simply be to increase the credits passed to
ext4_journal_start().
The other way would be to queue up the work and do the directory
shrinking in workqueue, where the changes would be done in a
completely separate journal handle. This has the advantage that it
avoids increasing the latency to the unlink() system call, since there
is no reason why the change needs to happen as part of the system
call, or in the same transaction as the unlink.
2b) Error checking is missing in some of the newly added code. In
particular the function calls in make_unindexed() are ignoring error
returns. More generally, while we do the directory shrnk, we need to
check for potential problems; and if there are any failures, we need
to leave the directory unchanged, possibly calling ext4_error() if
file system corruption is dicovered, or just skipping the directory
shirnk operation if it is some transient error --- for example, a
memory allocation failure.
2c) As mentioned above, I skipped forward porting the 3rd patch
series, and there is code to rigorously check for inconsistent file
system corruptions lest we trigger a BUG or WARN or worse which is
causing the patch application to fail. That checking needs to be
added to is_dirent_block_empty(), and while you're at it, please check
all of the newly added code to make sure it won't misbehave if the
file system structures have been corrupted in a particualrly inventive
way.
2d) Once (2c) is done forward port patch #3.
3) As discussed in this thread, once these patches are landed, the
further work to merge leaf notes; to remove empty htree nodes; and to
shorten the htree depth when necessary.
> > Also, please note that this first set of changes doesn't really make a
> > big difference for real-world use casses, since a directory block
> > won't get dropped when it is completely empty....
>
> Yes, I think the biggest beneficiary is rm -rf-type workloads.
The thing about rm -rf workloads is that if you eventually rmdir() the
directory, all of the blocks will be released anyway. That's probably
why this feature is one that hasn't been high priority. If you delete
75% of the files in a directory, you *could* do something like "mkdir
foo.new ; mv foo/* foo.new ; rmdir foo ; mv foo.new foo", but of
course that won't work if some other process is actively accessing the
files when you are trying to do the "DIY directory shrink operation".
Cheers,
- Ted
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH 1/2] ext4: return lblk from ext4_find_entry
2025-09-28 21:02 ` Theodore Ts'o
@ 2025-09-28 21:23 ` Theodore Ts'o
2025-09-28 21:23 ` [PATCH 2/2] ext4: shrink directories on dentry delete Theodore Ts'o
2025-09-29 2:33 ` [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty Julian Sun
1 sibling, 1 reply; 13+ messages in thread
From: Theodore Ts'o @ 2025-09-28 21:23 UTC (permalink / raw)
To: Ext4 Developers List
Cc: sunjunchao2870, Harshad Shirwadkar, Andreas Dilger,
Theodore Ts'o
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
This patch makes ext4_find_entry and related routines to return
logical block address of the dirent block. This logical block address
is used in the directory shrinking code to perform reverse lookup and
verify that the lookup was successful.
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Message-ID: <20200407064616.221459-1-harshadshirwadkar@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/ext4/namei.c | 55 ++++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 23 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2cd36f59c9e3..9a6d8b87492f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -293,7 +293,7 @@ struct dx_tail {
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
- struct ext4_dir_entry_2 **res_dir);
+ struct ext4_dir_entry_2 **res_dir, ext4_lblk_t *lblk);
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
@@ -1517,7 +1517,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
static struct buffer_head *__ext4_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir,
- int *inlined)
+ int *inlined, ext4_lblk_t *lblk)
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -1558,7 +1558,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
goto restart;
}
if (is_dx(dir)) {
- ret = ext4_dx_find_entry(dir, fname, res_dir);
+ ret = ext4_dx_find_entry(dir, fname, res_dir, lblk);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
@@ -1668,7 +1668,8 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
static struct buffer_head *ext4_find_entry(struct inode *dir,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
- int *inlined)
+ int *inlined,
+ ext4_lblk_t *lblk)
{
int err;
struct ext4_filename fname;
@@ -1680,7 +1681,7 @@ static struct buffer_head *ext4_find_entry(struct inode *dir,
if (err)
return ERR_PTR(err);
- bh = __ext4_find_entry(dir, &fname, res_dir, inlined);
+ bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lblk);
ext4_fname_free_filename(&fname);
return bh;
@@ -1700,7 +1701,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
if (err)
return ERR_PTR(err);
- bh = __ext4_find_entry(dir, &fname, res_dir, NULL);
+ bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL);
ext4_fname_free_filename(&fname);
return bh;
@@ -1708,7 +1709,8 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
- struct ext4_dir_entry_2 **res_dir)
+ struct ext4_dir_entry_2 **res_dir,
+ ext4_lblk_t *lblk)
{
struct super_block * sb = dir->i_sb;
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
@@ -1755,6 +1757,8 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
errout:
dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name));
success:
+ if (lblk)
+ *lblk = block;
dx_release(frames);
return bh;
}
@@ -1821,7 +1825,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL);
+ bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL, NULL);
if (IS_ERR(bh))
return ERR_CAST(bh);
if (!bh)
@@ -2702,7 +2706,7 @@ int ext4_generic_delete_entry(struct inode *dir,
static int ext4_delete_entry(handle_t *handle,
struct inode *dir,
struct ext4_dir_entry_2 *de_del,
- struct buffer_head *bh)
+ struct buffer_head *bh, ext4_lblk_t lblk)
{
int err, csum_size = 0;
@@ -3135,6 +3139,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
+ ext4_lblk_t lblk;
retval = ext4_emergency_state(dir->i_sb);
if (unlikely(retval))
@@ -3150,7 +3155,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
return retval;
retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL, &lblk);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (!bh)
@@ -3177,7 +3182,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- retval = ext4_delete_entry(handle, dir, de, bh);
+ retval = ext4_delete_entry(handle, dir, de, bh, lblk);
if (retval)
goto end_rmdir;
if (!EXT4_DIR_LINK_EMPTY(inode))
@@ -3227,12 +3232,13 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
struct ext4_dir_entry_2 *de;
handle_t *handle;
int skip_remove_dentry = 0;
+ ext4_lblk_t lblk;
/*
* Keep this outside the transaction; it may have to set up the
* directory's encryption key, which isn't GFP_NOFS-safe.
*/
- bh = ext4_find_entry(dir, d_name, &de, NULL);
+ bh = ext4_find_entry(dir, d_name, &de, NULL, &lblk);
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -3262,7 +3268,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
ext4_handle_sync(handle);
if (!skip_remove_dentry) {
- retval = ext4_delete_entry(handle, dir, de, bh);
+ retval = ext4_delete_entry(handle, dir, de, bh, lblk);
if (retval)
goto out_handle;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -3669,7 +3675,7 @@ static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
* before reset old inode info.
*/
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
- &old.inlined);
+ &old.inlined, NULL);
if (IS_ERR(old.bh))
retval = PTR_ERR(old.bh);
if (!old.bh)
@@ -3689,19 +3695,20 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
+ ext4_lblk_t lblk;
- bh = ext4_find_entry(dir, d_name, &de, NULL);
+ bh = ext4_find_entry(dir, d_name, &de, NULL, &lblk);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (bh) {
- retval = ext4_delete_entry(handle, dir, de, bh);
+ retval = ext4_delete_entry(handle, dir, de, bh, lblk);
brelse(bh);
}
return retval;
}
static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
- int force_reread)
+ int force_reread, ext4_lblk_t lblk)
{
int retval;
/*
@@ -3718,7 +3725,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
retval = ext4_find_delete_entry(handle, ent->dir,
&ent->dentry->d_name);
} else {
- retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
+ retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh,
+ lblk);
if (retval == -ENOENT) {
retval = ext4_find_delete_entry(handle, ent->dir,
&ent->dentry->d_name);
@@ -3806,6 +3814,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct inode *whiteout = NULL;
int credits;
u8 old_file_type;
+ ext4_lblk_t lblk;
if (new.inode && new.inode->i_nlink == 0) {
EXT4_ERROR_INODE(new.inode,
@@ -3837,7 +3846,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
- &old.inlined);
+ &old.inlined, &lblk);
if (IS_ERR(old.bh))
return PTR_ERR(old.bh);
@@ -3852,7 +3861,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto release_bh;
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
- &new.de, &new.inlined);
+ &new.de, &new.inlined, NULL);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
new.bh = NULL;
@@ -3952,7 +3961,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
/*
* ok, that's it
*/
- ext4_rename_delete(handle, &old, force_reread);
+ ext4_rename_delete(handle, &old, force_reread, lblk);
}
if (new.inode) {
@@ -4073,7 +4082,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
return retval;
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
- &old.de, &old.inlined);
+ &old.de, &old.inlined, NULL);
if (IS_ERR(old.bh))
return PTR_ERR(old.bh);
/*
@@ -4087,7 +4096,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
- &new.de, &new.inlined);
+ &new.de, &new.inlined, NULL);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
new.bh = NULL;
--
2.51.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 2/2] ext4: shrink directories on dentry delete
2025-09-28 21:23 ` [PATCH 1/2] ext4: return lblk from ext4_find_entry Theodore Ts'o
@ 2025-09-28 21:23 ` Theodore Ts'o
0 siblings, 0 replies; 13+ messages in thread
From: Theodore Ts'o @ 2025-09-28 21:23 UTC (permalink / raw)
To: Ext4 Developers List
Cc: sunjunchao2870, Harshad Shirwadkar, Andreas Dilger,
Theodore Ts'o
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
This patch adds shrinking support for htree based directories. The
high level algorithm is as follows:
* If after dentry removal the dirent block (let's call it B) becomes
empty, then remove its references in its dx parent.
* Swap its contents with that of the last block (L) in directory.
* Update L's parents to point to B instead.
* Remove L
* Repeat this for all the ancestors of B.
We add variants of dx_probe that allow us perform reverse lookups from
a logical block to its dx parents.
Ran kvm-xfstests smoke and verified that no new failures are
introduced. Ran shrinking for directories with following number of
files and then deleted files one by one:
* 1000 (size before deletion 36K, after deletion 4K)
* 10000 (size before deletion 196K, after deletion 4K)
* 100000 (size before deletion 2.1M, after deletion 4K)
* 200000 (size before deletion 4.2M, after deletion 4K)
In all cases directory shrunk significantly. We fallback to linear
directories if the directory becomes empty.
But note that most of the shrinking happens during last 1-2% deletions
in an average case. Therefore, the next step here is to merge dx nodes
when possible. That can be achieved by storing the fullness index in
htree nodes.
Performance Testing:
-------------------
Created 1 million files and unlinked all of them. Did this with and without
directory shrinking. Journalling was on. Used ftrace to measure time
spent in ext4_unlink:
* Without directory shrinking
Size Before: 22M /mnt/1
1000000 files created and deleted.
Average Total Elapsed Time (across 3 runs): 43.787s
Size After: 22M /mnt/1
useconds #count
------------------------
0-9 3690
10-19 2790131
20-29 179209
30-39 14270
40-49 7981
50-59 2212
60-69 1132
70-79 703
80-89 403
90-99 274
Num samples > 40us: 12615
* With directory shrinking
Size Before: 22M /mnt/1
1000000 files created and deleted.
Average Total Elapsed Time(across 3 runs): 44.230s
Size After: 4.0K /mnt/1
useconds #count
-----------------------
0-9 3316
10-19 2786451
20-29 172015
30-39 13259
40-49 17847
50-59 4843
60-69 924
70-79 569
80-89 390
90-99 389
Num samples > 40us: 24962
We see doubled number of samples of >40us in case of directory shrinking.
Because these constitute to < 1% of total samples, the overall effect of
direcotry shrinking on unlink / rmdir performance is negligible. There
was no notable difference in cpu usage.
This patch supersedes the other directory shrinking patch sent in Aug
2019 ("ext4: attempt to shrink directory on dentry removal").
Changes since V1:
* ext4_remove_dx_entry(), dx_probe_dx_node() fixes
* dx_probe_dirent_blk() continuation fix
* Added performance evaluation
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Message-ID: <20200407064616.221459-2-harshadshirwadkar@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
fs/ext4/ext4.h | 3 +-
fs/ext4/ext4_jbd2.h | 7 +
fs/ext4/inline.c | 2 +-
fs/ext4/namei.c | 355 ++++++++++++++++++++++++++++++++++++++++++--
4 files changed, 355 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01a6e2de7fc3..17f3478eea35 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3114,7 +3114,8 @@ extern int ext4_generic_delete_entry(struct inode *dir,
struct buffer_head *bh,
void *entry_buf,
int buf_size,
- int csum_size);
+ int csum_size,
+ bool *empty);
extern bool ext4_empty_dir(struct inode *inode);
/* resize.c */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 63d17c5201b5..d4348de0e6e6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -83,6 +83,13 @@
*/
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U
+/*
+ * Number of credits needed to perform directory shrinking. For each htree
+ * level, we need 6 blocks (block in question, its parent, last block, last
+ * block's parent, bitmap block, bg summary).
+ */
+#define EXT4_DIR_SHRINK_TRANS_BLOCKS 6U
+
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 1b094a4f3866..96c30763be47 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1673,7 +1673,7 @@ int ext4_delete_inline_entry(handle_t *handle,
goto out;
err = ext4_generic_delete_entry(dir, de_del, bh,
- inline_start, inline_size, 0);
+ inline_start, inline_size, 0, NULL);
if (err)
goto out;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9a6d8b87492f..f8f5e7329afc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -296,6 +296,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir, ext4_lblk_t *lblk);
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
+static void dx_release(struct dx_frame *frames);
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+ __u32 *start_hash);
/* checksumming functions */
void ext4_initialize_dirent_tail(struct buffer_head *bh,
@@ -768,15 +773,19 @@ static inline void htree_rep_invariant_check(struct dx_entry *at,
/*
* Probe for a directory leaf block to search.
*
- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+ * __dx_probe can return ERR_BAD_DX_DIR, which means there was a format
* error in the directory index, and the caller should fall back to
- * searching the directory normally. The callers of dx_probe **MUST**
+ * searching the directory normally. The callers of __dx_probe **MUST**
* check for this error code, and make sure it never gets reflected
- * back to userspace.
+ * back to userspace. If lblk_end is set and if __dx_probe hits lblk_end
+ * logical block, it reads it and stops. If strict is true, __dx_probe
+ * performs stronger checks for "count" during the lookup. See other
+ * variants below.
*/
static struct dx_frame *
-dx_probe(struct ext4_filename *fname, struct inode *dir,
- struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+__dx_probe(struct ext4_filename *fname, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in,
+ ext4_lblk_t lblk_end, bool strict)
{
unsigned count, indirect, level, i;
struct dx_entry *at, *entries, *p, *q, *m;
@@ -867,7 +876,9 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
blocks[0] = 0;
while (1) {
count = dx_get_count(entries);
- if (!count || count > dx_get_limit(entries)) {
+ /* If we are called from the shrink path */
+ if (count > dx_get_limit(entries) ||
+ (strict && !count)) {
ext4_warning_inode(dir,
"dx entry: count %u beyond limit %u",
count, dx_get_limit(entries));
@@ -903,7 +914,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
goto fail;
}
}
- if (++level > indirect)
+ if (++level > indirect || dx_get_block(at) == lblk_end)
return frame;
blocks[level] = block;
frame++;
@@ -935,6 +946,139 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
return ret_err;
}
+static inline struct dx_frame *
+dx_probe(struct ext4_filename *fname, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+{
+ return __dx_probe(fname, dir, hinfo, frame_in, 0, true);
+}
+
+/*
+ * dx_probe with relaxed checks. This function is used in the directory
+ * shrinking code since we can run into intermediate states where we have
+ * internal dx nodes with count = 0.
+ */
+static inline struct dx_frame *
+dx_probe_relaxed(struct ext4_filename *fname, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in)
+{
+ return __dx_probe(fname, dir, hinfo, frame_in, 0, false);
+}
+
+/* dx_probe() variant that allows us to lookup frames for a dirent block. */
+static struct dx_frame *dx_probe_dirent_blk(struct inode *dir,
+ struct dx_frame *frames,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct dx_hash_info hinfo;
+ struct dx_frame *frame_ptr;
+ struct ext4_dir_entry_2 *dead_de;
+
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+
+ /* Let's assume that the lblk is a leaf dirent block */
+ dead_de = (struct ext4_dir_entry_2 *)bh->b_data;
+ ext4fs_dirhash(dir, dead_de->name, dead_de->name_len, &hinfo);
+
+ frame_ptr = dx_probe_relaxed(NULL, dir, &hinfo, frames);
+ if (!IS_ERR(frame_ptr)) {
+ /*
+ * See if we need to move our frame_ptr.
+ */
+ while (dx_get_block(frame_ptr->at) != lblk) {
+ if (!ext4_htree_next_block(dir, hinfo.hash, frame_ptr,
+ frames, NULL))
+ break;
+ }
+ if (dx_get_block(frame_ptr->at) == lblk)
+ return frame_ptr;
+ dx_release(frames);
+ frame_ptr = ERR_PTR(-EINVAL);
+ }
+ return frame_ptr;
+}
+
+/* dx_probe() variant that allows us to lookup frames for a dx node. */
+static struct dx_frame *dx_probe_dx_node(struct inode *dir,
+ struct dx_frame *frames,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
+{
+ struct dx_hash_info hinfo;
+ struct dx_frame *frame_ptr;
+ struct dx_entry *entries;
+
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ entries = ((struct dx_node *)(bh->b_data))->entries;
+
+ /* Use the first hash found in the dx node */
+ hinfo.hash = le32_to_cpu(entries[1].hash);
+ frame_ptr = __dx_probe(NULL, dir, &hinfo, frames, lblk, false);
+ if (IS_ERR_OR_NULL(frame_ptr))
+ return frame_ptr;
+ if (dx_get_block(frame_ptr->at) != lblk) {
+ dx_release(frames);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return frame_ptr;
+}
+
+/*
+ * This function tries to remove the entry of a dirent block (which was just
+ * emptied by the caller) from the dx frame. It does so by reducing the count by
+ * 1 and left shifting all the entries after the deleted entry.
+ * ext4_remove_dx_entry() does NOT mark dx_frame dirty, that's left to the
+ * caller. The reason for doing this is that this function removes entry from
+ * a dx_node and can potentially leave dx_node with no entries. Since older
+ * kernels don't allow dx_parent nodes to have 0 count, the caller should
+ * only mark buffer dirty when we are sure that we won't leave dx_node
+ * with 0 count.
+ */
+static int
+ext4_remove_dx_entry(handle_t *handle, struct inode *dir,
+ struct dx_frame *dx_frame)
+{
+ struct dx_entry *entries = dx_frame->entries;
+ int err, i = 0;
+ unsigned int count = dx_get_count(entries);
+
+ for (i = 0; i < count; i++)
+ if (entries[i].block == dx_frame->at->block)
+ break;
+ if (i >= count)
+ return -EINVAL;
+
+ err = ext4_journal_get_write_access(handle, dir->i_sb, dx_frame->bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+
+ if (i == 0) {
+ entries[0].block = entries[1].block;
+ i++;
+ }
+ if (i < count - 1)
+ memmove(&entries[i], &entries[i + 1], (count - i - 1) *
+ sizeof(struct dx_entry));
+ dx_set_count(entries, count - 1);
+
+ return 0;
+out_err:
+ ext4_std_error(dir->i_sb, err);
+ return -EINVAL;
+
+}
+
static void dx_release(struct dx_frame *frames)
{
struct dx_root_info *info;
@@ -2649,6 +2793,24 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
return err;
}
+/*
+ * Checks if dirent block is empty. If de is passed, we
+ * skip directory entries before de in the block.
+ */
+static inline bool is_empty_dirent_block(struct inode *dir,
+ struct buffer_head *bh,
+ struct ext4_dir_entry_2 *de)
+{
+ if (!de)
+ de = (struct ext4_dir_entry_2 *)bh->b_data;
+ while ((u8 *)de < (u8 *)bh->b_data + dir->i_sb->s_blocksize) {
+ if (de->inode)
+ return false;
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
+ }
+ return true;
+}
+
/*
* ext4_generic_delete_entry deletes a directory entry by merging it
* with the previous entry
@@ -2658,7 +2820,8 @@ int ext4_generic_delete_entry(struct inode *dir,
struct buffer_head *bh,
void *entry_buf,
int buf_size,
- int csum_size)
+ int csum_size,
+ bool *empty)
{
struct ext4_dir_entry_2 *de, *pde;
unsigned int blocksize = dir->i_sb->s_blocksize;
@@ -2667,6 +2830,8 @@ int ext4_generic_delete_entry(struct inode *dir,
i = 0;
pde = NULL;
de = entry_buf;
+ if (empty)
+ *empty = false;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
entry_buf, buf_size, i))
@@ -2692,7 +2857,9 @@ int ext4_generic_delete_entry(struct inode *dir,
offsetof(struct ext4_dir_entry_2,
name_len));
}
-
+ if (empty)
+ *empty = is_empty_dirent_block(dir, bh,
+ pde ? pde : de);
inode_inc_iversion(dir);
return 0;
}
@@ -2703,12 +2870,176 @@ int ext4_generic_delete_entry(struct inode *dir,
return -ENOENT;
}
+static void make_unindexed(handle_t *handle, struct inode *dir,
+ struct buffer_head *bh)
+{
+ int parent_ino = le32_to_cpu(
+ ((struct dx_root *)bh->b_data)->dotdot.inode);
+
+ memset(bh->b_data, 0, dir->i_sb->s_blocksize);
+ ext4_init_dirblock(handle, dir, bh, parent_ino, NULL, 0);
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
+ ext4_mark_inode_dirty(handle, dir);
+}
+
+/*
+ * Copy contents from lblk_src to lblk_dst and remap internal dx nodes
+ * such that parent of lblk_src points to lblk_dst.
+ */
+static int ext4_dx_remap_block(handle_t *handle, struct inode *dir,
+ struct buffer_head *bh_dst,
+ ext4_lblk_t lblk_dst, ext4_lblk_t lblk_src)
+{
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame_ptr;
+ struct buffer_head *bh_src;
+ int ret;
+
+ bh_src = ext4_bread(handle, dir, lblk_src, 0);
+ if (IS_ERR_OR_NULL(bh_src))
+ return -EIO;
+
+ memset(frames, 0, sizeof(frames));
+
+ /*
+ * blk_src can either be a dirent block or dx node. Try to search
+ * using both and use whichever one that returns success.
+ */
+ frame_ptr = dx_probe_dirent_blk(dir, frames, bh_src, lblk_src);
+ if (IS_ERR_OR_NULL(frame_ptr)) {
+ frame_ptr = dx_probe_dx_node(dir, frames, bh_src, lblk_src);
+ if (IS_ERR_OR_NULL(frame_ptr)) {
+ brelse(bh_src);
+ return PTR_ERR(frame_ptr);
+ }
+ }
+ WARN_ON(dx_get_block(frame_ptr->at) != lblk_src);
+ memcpy(bh_dst->b_data, bh_src->b_data, bh_dst->b_size);
+ dx_set_block(frame_ptr->at, lblk_dst);
+
+ /* Set pointer in bh_last_parent to bh */
+ ret = ext4_journal_get_write_access(handle, dir->i_sb, bh_src,
+ EXT4_JTR_NONE);
+ if (ret)
+ goto out;
+
+ ret = ext4_journal_get_write_access(handle, dir->i_sb, frame_ptr->bh,
+ EXT4_JTR_NONE);
+ if (ret)
+ goto out;
+
+ ret = ext4_handle_dirty_metadata(handle, dir, bh_src);
+ if (ret) {
+ ext4_std_error(dir->i_sb, ret);
+ goto out;
+ }
+
+ ret = ext4_handle_dirty_dx_node(handle, dir, frame_ptr->bh);
+ if (ret) {
+ ext4_std_error(dir->i_sb, ret);
+ goto out;
+ }
+out:
+ brelse(bh_src);
+ dx_release(frames);
+ return ret;
+}
+
+/*
+ * Try to shrink directory as much as possible. This function
+ * truncates directory to the new size.
+ *
+ * The high level algorithm is as follows:
+ *
+ * - If after dentry removal the dirent block (let's call it B) becomes
+ * empty, then remove its references in its dx parent.
+ * - Swap its contents with that of the last block (L) in directory.
+ * - Update L's parents to point to B instead.
+ * - Remove L
+ * - Repeat this for all the ancestors of B.
+ */
+static int ext4_try_dir_shrink(handle_t *handle, struct inode *dir,
+ ext4_lblk_t lblk, struct buffer_head *bh)
+{
+ struct buffer_head *bh_last = NULL, *parent_bh = NULL;
+ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame_ptr;
+ ext4_lblk_t shrink = 0, last_lblk, parent_lblk;
+ int ret = 0;
+ bool check_empty = true;
+
+ memset(frames, 0, sizeof(frames));
+ frame_ptr = dx_probe_dirent_blk(dir, frames, bh, lblk);
+ if (IS_ERR(frame_ptr))
+ return -EINVAL;
+
+ last_lblk = (dir->i_size >> dir->i_sb->s_blocksize_bits) - 1;
+
+ while (check_empty && frame_ptr >= frames) {
+ parent_lblk = frame_ptr > frames ?
+ dx_get_block((frame_ptr - 1)->at) : 0;
+ lblk = dx_get_block(frame_ptr->at);
+ if (ext4_journal_extend(
+ handle, EXT4_DIR_SHRINK_TRANS_BLOCKS, 0) < 0)
+ break;
+ if (ext4_remove_dx_entry(handle, dir, frame_ptr) < 0) {
+ dxtrace(pr_warn("remove dx entry failed\n"));
+ break;
+ }
+ check_empty = dx_get_count(frame_ptr->entries) == 0;
+ if (lblk != last_lblk) {
+ ret = ext4_dx_remap_block(handle, dir, bh, lblk,
+ last_lblk);
+ if (ret) {
+ dxtrace(pr_warn("remap failed\n"));
+ break;
+ }
+ }
+ ret = ext4_handle_dirty_dx_node(handle, dir, frame_ptr->bh);
+ if (ret)
+ break;
+ if (parent_lblk == last_lblk)
+ parent_lblk = lblk;
+ last_lblk--;
+ shrink++;
+
+ if (parent_lblk == 0)
+ break;
+ if (!IS_ERR_OR_NULL(parent_bh))
+ brelse(parent_bh);
+ parent_bh = ext4_bread(handle, dir, parent_lblk, 0);
+ if (IS_ERR_OR_NULL(parent_bh))
+ break;
+ dx_release(frames);
+ frame_ptr = dx_probe_dx_node(dir, frames, parent_bh,
+ parent_lblk);
+ if (IS_ERR(frame_ptr))
+ break;
+ bh = parent_bh;
+ }
+
+ /* Fallback to linear directories if root node becomes empty */
+ if (dx_get_count(frames[0].entries) == 0)
+ make_unindexed(handle, dir, frames[0].bh);
+ if (!IS_ERR(frame_ptr))
+ dx_release(frames);
+ if (!IS_ERR_OR_NULL(bh_last))
+ brelse(bh_last);
+ if (!IS_ERR_OR_NULL(parent_bh))
+ brelse(parent_bh);
+
+ if (ret || !shrink)
+ return ret;
+
+ dir->i_size -= shrink * dir->i_sb->s_blocksize;
+ return ext4_truncate(dir);
+}
+
static int ext4_delete_entry(handle_t *handle,
struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh, ext4_lblk_t lblk)
{
int err, csum_size = 0;
+ bool block_empty = false;
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
@@ -2728,7 +3059,8 @@ static int ext4_delete_entry(handle_t *handle,
goto out;
err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
- dir->i_sb->s_blocksize, csum_size);
+ dir->i_sb->s_blocksize, csum_size,
+ &block_empty);
if (err)
goto out;
@@ -2737,6 +3069,9 @@ static int ext4_delete_entry(handle_t *handle,
if (unlikely(err))
goto out;
+ if (block_empty && is_dx(dir))
+ ext4_try_dir_shrink(handle, dir, lblk, bh);
+
return 0;
out:
if (err != -ENOENT)
--
2.51.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty
2025-09-28 21:02 ` Theodore Ts'o
2025-09-28 21:23 ` [PATCH 1/2] ext4: return lblk from ext4_find_entry Theodore Ts'o
@ 2025-09-29 2:33 ` Julian Sun
1 sibling, 0 replies; 13+ messages in thread
From: Julian Sun @ 2025-09-29 2:33 UTC (permalink / raw)
To: Theodore Ts'o; +Cc: Harshad Shirwadkar, adilger, jack, Ext4 Developers List
Hi,
I really appreciate the detailed explanation—your explanation made the
current state of this patch set much clearer to me.
I’ll follow up with the work as you suggested.
On Mon, Sep 29, 2025 at 5:02 AM Theodore Ts'o <tytso@mit.edu> wrote:
>
> On Sun, Sep 28, 2025 at 02:51:09PM +0800, Julian Sun wrote:
> > Emm. I checked the code and found that support for 3-level htrees was
> > added in 2017 via commit e08ac99fa2a2 ("ext4: add largedir feature"),
> > but this patch was submitted in 2020. Did I make a mistake somewhere?
>
> I made that statement when I tried forward porting Harshad's patches
> to 6.17-rc4, and one of the patch conflicts seemed to implyyy that
> dx_probe predated the3-level htree change. But I could have been
> mistaken about why the patch conflict eixsts.
>
> > > There are
> > > also some hardening against maliciously fuzzed file systems that will
> > > prevent the patches from applying cleanly.
> >
> > Is this included in the xfstests test suite?
>
> For better or for worse, no. The reason for this is that xfstests
> tends to avoid using pre-generated file systems becase they aren't
> portable to different file system types. And in order to test
> deliberately corrupted, you generally need to include small corrupted
> file systems into xfstests. You *could* try to create them as an
> ext4-specific test using debugfs to introduce the corruption, but
> that's quite painful, and in some cases the only way to corrupt the
> file system in that very specific way would require a hex editor.
>
> That being said, it's generally pretty obvious when bullet-proofing
> code has been added and it causes a merge conflict. For example, I
> skipped forward-porting the third patch, "ext4: reimplement
> ext4_empty_dir() using is_dirent_block_empty" because there was some
> bullet-proofing code added in ext4_empty_dir() which caused the patch
> application to fail, and I was too third try to deal with it, and it
> strictly speaking wasn't necessary for the patch shrinking
> functionality. That being said, the bullet proofing which was added
> to ext4_empty_dir(), *should* be added to the newly created
> i_dirent_block_empty() function introduced in the 2/3 patch in this
> series.
>
> > > Then we'd need to run regression tests on a variety of different ext4
> > > configurations to see if there are any regressions, and if so, they
> > > would need to be fixed.
> >
> > Is testing with xfstests sufficient? Or are there any other test
> > suites that can be used to test this patch set?
>
> Testing with xfstests are sufficient, but we need to test multiple
> file system configurations, but just the default "ext4 using a 4k
> blocksize". So I tried to do a quick-and-dirty forward port of the
> patches, and they compiled and passed the 15-20 minute smoke test
> (running "kvm-xfstests smoke", or "gce-xfstests smoke"). But when I
> ran the full set of tests, this is what I found.
>
> ext4/4k: 594 tests, 4 failures, 65 skipped, 5196 seconds
> Failures: generic/426 generic/756
> Flaky: generic/041: 20% (1/5) generic/049: 20% (1/5)
> ext4/1k: 588 tests, 3 failures, 69 skipped, 6306 seconds
> Failures: generic/426 generic/756
> Flaky: generic/043: 20% (1/5)
> ext4/ext3: 586 tests, 3 failures, 149 skipped, 4958 seconds
> Flaky: generic/041: 40% (2/5) generic/426: 60% (3/5)
> generic/756: 60% (3/5)
> ext4/encrypt: 569 tests, 3 failures, 181 skipped, 3263 seconds
> Failures: generic/426 generic/756
> Flaky: generic/049: 40% (2/5)
> ext4/nojournal: 586 tests, 3 failures, 137 skipped, 3918 seconds
> Failures: ext4/045 generic/426 generic/756
> ext4/ext3conv: 591 tests, 4 failures, 67 skipped, 5408 seconds
> Failures: generic/426 generic/756
> Flaky: ext4/045: 40% (2/5) generic/040: 60% (3/5)
> ext4/adv: 587 tests, 10 failures, 73 skipped, 5487 seconds
> Failures: [generic/347] generic/426 generic/756 [generic/757] [generic/764]
> [generic/777]
> Flaky: generic/047: 40% (2/5) generic/475: 20% (1/5)
> [generic/482: 40% (2/5)] generic/547: 20% (1/5)
> ext4/dioread_nolock: 592 tests, 3 failures, 65 skipped, 4996 seconds
> Failures: generic/426 generic/756
> Flaky: generic/047: 40% (2/5)
> ext4/data_journal: 587 tests, 6 failures, 141 skipped, 5151 seconds
> Failures: [generic/127] generic/426 generic/756
> Flaky: generic/049: 60% (3/5) generic/475: 60% (3/5)
> generic/645: 40% (2/5)
> ext4/bigalloc_4k: 565 tests, 4 failures, 68 skipped, 5066 seconds
> Failures: ext4/045
> Flaky: generic/320: 60% (3/5) generic/426: 60% (3/5)
> generic/756: 60% (3/5)
> ext4/bigalloc_1k: 566 tests, 3 failures, 79 skipped, 5096 seconds
> Failures: generic/426 generic/756
> Flaky: generic/049: 20% (1/5)
> ext4/dax: 578 tests, 5 failures, 170 skipped, 3198 seconds
> Failures: [generic/344] [generic/363] generic/426 generic/756
> Flaky: generic/043: 20% (1/5)
> Totals: 7193 tests, 1264 skipped, 190 failures, 0 errors, 53862s
>
> (Tests in square brackets were failing with stock 6.17-rc4, so don't
> represent regressions. Ignore them for the purposes of trying to fix
> up this patch set.)
>
> Now, some of thsee may be failures caused by my making a mistake when
> doing the forward port. Basically, I bashed the patches until they
> built; and then I ran the smoke test; and then I kicked of the full
> test run, which takes about 2.5 hours using a dozen VM's.
>
> I don't have time to take this any further, so with your permission,
> if you're OK with my handing fiishing off this project to you, I'll
> send the patches as a reply to this message, and then "Your mission,
> should you use to accept it" (quoting from the "Mission Impossible" TV
> Show / Movie) would be:
>
> 1) Investigate the failures deailed above, and fix them. Again, some
> of these failures may not be Harshads; it could be mine when I did the
> forward port. Either way, we need to fix them before we can merge the
> changes upstream.
>
> 2) Do more cleanups on the patches as necessary. While doing the
> forward port, I noted the following issues that should really be
> fixed. You might find more things that need improvement; if so,
> please go for it!
>
> 2a) Although we are potentially modifying more metadata blocks in
> ext4_try_dir_shrink(), the number of journal credits requested by
> callers to ext4_delete_entry() were not increased. This could result
> in the handle running out of credits, which will trigger a warning,
> and if the transaction runs out of space, could trigger a journal
> abort. One way would simply be to increase the credits passed to
> ext4_journal_start().
>
> The other way would be to queue up the work and do the directory
> shrinking in workqueue, where the changes would be done in a
> completely separate journal handle. This has the advantage that it
> avoids increasing the latency to the unlink() system call, since there
> is no reason why the change needs to happen as part of the system
> call, or in the same transaction as the unlink.
>
> 2b) Error checking is missing in some of the newly added code. In
> particular the function calls in make_unindexed() are ignoring error
> returns. More generally, while we do the directory shrnk, we need to
> check for potential problems; and if there are any failures, we need
> to leave the directory unchanged, possibly calling ext4_error() if
> file system corruption is dicovered, or just skipping the directory
> shirnk operation if it is some transient error --- for example, a
> memory allocation failure.
>
> 2c) As mentioned above, I skipped forward porting the 3rd patch
> series, and there is code to rigorously check for inconsistent file
> system corruptions lest we trigger a BUG or WARN or worse which is
> causing the patch application to fail. That checking needs to be
> added to is_dirent_block_empty(), and while you're at it, please check
> all of the newly added code to make sure it won't misbehave if the
> file system structures have been corrupted in a particualrly inventive
> way.
>
> 2d) Once (2c) is done forward port patch #3.
>
> 3) As discussed in this thread, once these patches are landed, the
> further work to merge leaf notes; to remove empty htree nodes; and to
> shorten the htree depth when necessary.
>
> > > Also, please note that this first set of changes doesn't really make a
> > > big difference for real-world use casses, since a directory block
> > > won't get dropped when it is completely empty....
> >
> > Yes, I think the biggest beneficiary is rm -rf-type workloads.
>
> The thing about rm -rf workloads is that if you eventually rmdir() the
> directory, all of the blocks will be released anyway. That's probably
> why this feature is one that hasn't been high priority. If you delete
> 75% of the files in a directory, you *could* do something like "mkdir
> foo.new ; mv foo/* foo.new ; rmdir foo ; mv foo.new foo", but of
> course that won't work if some other process is actively accessing the
> files when you are trying to do the "DIY directory shrink operation".
>
> Cheers,
>
> - Ted
>
Thanks,
--
Julian Sun <sunjunchao2870@gmail.com>
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2025-09-29 2:33 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-04-07 6:46 [PATCH v2 1/3] ext4: return lblk from ext4_find_entry Harshad Shirwadkar
2020-04-07 6:46 ` [PATCH v2 2/3] ext4: shrink directories on dentry delete Harshad Shirwadkar
2020-04-07 16:51 ` Andreas Dilger
2020-04-07 6:46 ` [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty Harshad Shirwadkar
2020-04-07 17:21 ` Andreas Dilger
2020-04-08 9:31 ` Jan Kara
[not found] ` <c7a41ba13a3551fca25d7498b9d4542a104fac74.camel@gmail.com>
2025-09-27 8:11 ` Julian Sun
2025-09-28 3:46 ` Theodore Ts'o
2025-09-28 6:51 ` Julian Sun
2025-09-28 21:02 ` Theodore Ts'o
2025-09-28 21:23 ` [PATCH 1/2] ext4: return lblk from ext4_find_entry Theodore Ts'o
2025-09-28 21:23 ` [PATCH 2/2] ext4: shrink directories on dentry delete Theodore Ts'o
2025-09-29 2:33 ` [PATCH v2 3/3] ext4: reimplement ext4_empty_dir() using is_dirent_block_empty Julian Sun
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).