[PATCH] various allocator optimizations

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] various allocator optimizations
@ 2003-03-11 16:34 Chris Mason
  2003-03-11 16:42 ` Oleg Drokin
  0 siblings, 1 reply; 42+ messages in thread
From: Chris Mason @ 2003-03-11 16:34 UTC (permalink / raw)
  To: reiserfs-list

[-- Attachment #1: Type: text/plain, Size: 1545 bytes --]

Hello all,

There are 3 patches attached

group-alloc-4.diff	    # against 2.4.21-pre + data logging + quota
group-alloc-4-vanilla.diff  # against 2.4.21-pre

These two are block allocator ideas from Jeff Mahoney and I, it:

changes blocknrs_and_prealloc_arrays_from_search_start into three
passes.  pass1 goes from the hint to the end of the disk, pass2 goes
from the border to the hint, and pass3 goes from the start of the disk
to the border.

This was done because the old system would start over at the beginning
of the disk if the first pass failed, which would allocate data blocks
inside the border reserved for formatted nodes.

changes get_left_neighbor to return 1 if it finds a good block number to
use as the allocation hint.  This lets us call the hashing/grouping
schemes only when get_left_neighbor doesn't find something useful in the
path.

Adds alloc=dirid_groups and alloc=oid_groups mount options.  They hash
into the disk for data block allocations, and then round the hash down
to a 64MB boundary within the bitmap block.

changes the default alloc parameters to
dirid_groups:skip_busy:concentrating_formatted_nodes=5

Overall, I believe this will significantly improve fragmentation over
time.  oid_groups should only be used if your FS has a small number of
large files in each directory, or a very large number of files in a
single directory.

Also attached is search_reada-4.diff, which implements readahead on leaf
nodes during search_by_key calls.  It applies to both the data logging
and vanilla kernels.

-chris



[-- Attachment #2: group-alloc-4-vanilla.diff --]
[-- Type: text/plain, Size: 10670 bytes --]

--- 1.18/fs/reiserfs/bitmap.c	Thu Sep 12 04:39:21 2002
+++ edited/fs/reiserfs/bitmap.c	Tue Mar 11 11:13:45 2003
@@ -33,6 +33,8 @@
 #define  _ALLOC_hashed_formatted_nodes 7
 #define  _ALLOC_old_way 8
 #define  _ALLOC_hundredth_slices 9
+#define  _ALLOC_dirid_groups 10
+#define  _ALLOC_oid_groups 11
 
 #define  concentrating_formatted_nodes(s)     test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
 #define  displacing_large_files(s)            test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
@@ -260,8 +262,18 @@
     get_bit_address (s, *start, &bm, &off);
     get_bit_address (s, finish, &end_bm, &end_off);
 
-    // With this option set first we try to find a bitmap that is at least 10%
-    // free, and if that fails, then we fall back to old whole bitmap scanning
+    /* When the bitmap is more than 10% free, anyone can allocate.
+     * When it's less than 10% free, only files that already use the
+     * bitmap are allowed. Once we pass 80% full, this restriction
+     * is lifted.
+     *
+     * We do this so that files that grow later still have space close to
+     * their original allocation. This improves locality, and presumably
+     * performance as a result.
+     *
+     * This is only an allocation policy and does not make up for getting a
+     * bad hint. Decent hinting must be implemented for this to work well.
+     */
     if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) {
 	for (;bm < end_bm; bm++, off = 0) {
 	    if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 )
@@ -392,6 +404,16 @@
   }
 }
 
+void reiserfs_init_alloc_options (struct super_block *s)
+{
+    set_bit (_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s));
+    set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
+    set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
+    s->u.reiserfs_sb.s_alloc_options.border = 5;
+
+    reiserfs_warning ("allocator defaults = [%08x]\n", SB_ALLOC_OPTS(s));
+}
+
 /* block allocator related options are parsed here */
 int reiserfs_parse_alloc_options(struct super_block * s, char * options)
 {
@@ -435,6 +457,15 @@
 	    continue;
 	}
 
+        if (!strcmp(this_char, "dirid_groups")) {
+	    SET_OPTION(dirid_groups);
+	    continue;
+        }
+        if (!strcmp(this_char, "oid_groups")) {
+	    SET_OPTION(oid_groups);
+	    continue;
+        }
+
 	if (!strcmp(this_char, "hashed_formatted_nodes")) {
 	    SET_OPTION(hashed_formatted_nodes);
 	    continue;
@@ -476,6 +507,7 @@
 	return 1;
     }
 
+    reiserfs_warning ("allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
     return 0;
 }
 
@@ -498,17 +530,81 @@
     hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
 }
 
-static void inline get_left_neighbor(reiserfs_blocknr_hint_t *hint)
+/* 
+ * Relocation based on dirid, hashing them into a given bitmap block
+ * files. Formatted nodes are unaffected, a seperate policy covers them 
+ */
+static void 
+dirid_groups (reiserfs_blocknr_hint_t *hint)
+{
+    if (hint->inode) {
+        char * hash_in = NULL;
+	unsigned long hash;
+	unsigned long mask;
+	__u32 dirid;
+
+	dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
+	hash_in = (char *)(&dirid);
+
+	/* effectively turns the disk in 64MB groups (4k blocksize), 
+	 * but the bigger your disk is the less likely hash collisions
+	 * are, leading to dynamically bigger groups based on 
+	 * your disk size
+	 */
+	mask = (hint->inode->i_sb->s_blocksize << 2) - 1;
+	hash = keyed_hash(hash_in, 4);
+	hash = hint->beg + hash % (hint->end - hint->beg);
+	hash &= ~mask;
+
+	hint->search_start = hash;
+    }
+}
+
+/* 
+ * Relocation based on oid, hashing them into a given bitmap block
+ * files. Formatted nodes are unaffected, a seperate policy covers them 
+ */
+static void 
+oid_groups (reiserfs_blocknr_hint_t *hint)
+{
+    if (hint->inode) {
+        char * hash_in = NULL;
+	unsigned long hash;
+	unsigned long mask;
+	__u32 oid;
+
+	oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
+	hash_in = (char *)(&oid);
+
+	/* effectively turns the disk in 64MB groups (4k blocksize), 
+	 * but the bigger your disk is the less likely hash collisions
+	 * are, leading to dynamically bigger groups based on 
+	 * your disk size
+	 */
+	mask = (hint->inode->i_sb->s_blocksize << 2) - 1;
+	hash = keyed_hash(hash_in, 4);
+	hash = hint->beg + hash % (hint->end - hint->beg);
+	hash &= ~mask;
+
+	hint->search_start = hash;
+    }
+}
+
+/* returns 1 if it finds an indirect item and gets valid hint info
+ * from it, otherwise 0
+ */
+static int get_left_neighbor(reiserfs_blocknr_hint_t *hint)
 {
     struct path * path;
     struct buffer_head * bh;
     struct item_head * ih;
     int pos_in_item;
     __u32 * item;
+    int ret = 0;
 
     if (!hint->path)		/* reiserfs code can call this function w/o pointer to path
 				 * structure supplied; then we rely on supplied search_start */
-	return;
+	return 0;
 
     path = hint->path;
     bh = get_last_bh(path);
@@ -529,6 +625,7 @@
 	    int t=get_block_num(item,pos_in_item);
 	    if (t) {
 		hint->search_start = t;
+		ret = 1;
 		break;
 	    }
 	    pos_in_item --;
@@ -537,7 +634,7 @@
     }
 
     /* does result value fit into specified region? */
-    return;
+    return ret;
 }
 
 /* should be, if formatted node, then try to put on first part of the device
@@ -640,6 +737,7 @@
     struct super_block *s = hint->th->t_super;
     hint->beg = 0;
     hint->end = SB_BLOCK_COUNT(s) - 1;
+    int unfm_hint;
 
     /* This is former border algorithm. Now with tunable border offset */
     if (concentrating_formatted_nodes(s))
@@ -668,19 +766,14 @@
 	return;
     }
 
-    /* attempt to copy a feature from old block allocator code */
-    if (TEST_OPTION(old_hashed_relocation, s) && !hint->formatted_node) {
-	old_hashed_relocation(hint);
-    }
-
     /* if none of our special cases is relevant, use the left neighbor in the
        tree order of the new node we are allocating for */
     if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) {
-	hash_formatted_node(hint);
+        hash_formatted_node(hint);
 	return;
-    } 
+    }
 
-    get_left_neighbor(hint);
+    unfm_hint = get_left_neighbor(hint);
 
     /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
        new blocks are displaced based on directory ID. Also, if suggested search_start
@@ -705,10 +798,29 @@
 	return;
     }
 
-    if (TEST_OPTION(old_hashed_relocation, s))
+    /* old_hashed_relocation only works on unformatted */
+    if (!unfm_hint && !hint->formatted_node &&
+        TEST_OPTION(old_hashed_relocation, s))
+    {
 	old_hashed_relocation(hint);
-    if (TEST_OPTION(new_hashed_relocation, s))
+    }
+    /* new_hashed_relocation works with both formatted/unformatted nodes */
+    if ((!unfm_hint || hint->formatted_node) &&
+        TEST_OPTION(new_hashed_relocation, s))
+    {
 	new_hashed_relocation(hint);
+    }
+    /* dirid grouping works only on unformatted nodes */
+    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s))
+    {
+        dirid_groups(hint);
+    }
+
+    /* oid grouping works only on unformatted nodes */
+    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s))
+    {
+        oid_groups(hint);
+    }
     return;
 }
 
@@ -772,28 +884,35 @@
     struct super_block *s = hint->th->t_super;
     b_blocknr_t start = hint->search_start;
     b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
-    int second_pass = 0;
+    int passno = 0;
     int nr_allocated = 0;
 
     determine_prealloc_size(hint);
-    while((nr_allocated
-	  += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish,
-					  amount_needed - nr_allocated, hint->prealloc_size))
-	  < amount_needed) {
-
-	/* not all blocks were successfully allocated yet*/
-	if (second_pass) {	/* it was a second pass; we must free all blocks */
+    do {
+	switch (passno++) {
+        case 0: /* Search from hint->search_start to end of disk */
+            start = hint->search_start;
+            finish = SB_BLOCK_COUNT(s) - 1;
+            break;
+        case 1: /* Search from hint->beg to hint->search_start */
+            start = hint->beg;
+            finish = hint->search_start;
+            break;
+        case 2: /* Last chance: Search from 0 to hint->beg */
+            start = 0;
+            finish = hint->beg;
+	    break;
+        default: /* We've tried searching everywhere, not enough space */
+	    /* not all blocks were successfully allocated yet*/
 	    while (nr_allocated --)
 		reiserfs_free_block(hint->th, new_blocknrs[nr_allocated]);
 
 	    return NO_DISK_SPACE;
-	} else {		/* refine search parameters for next pass */
-	    second_pass = 1;
-	    finish = start;
-	    start = 0;
-	    continue;
-	}
-    }
+        }
+    } while((nr_allocated += allocate_without_wrapping_disk(hint, 
+                          new_blocknrs + nr_allocated, start, finish,
+			  amount_needed - nr_allocated, hint->prealloc_size)) 
+			  < amount_needed) ;
     return CARRY_ON;
 }
 
--- 1.27/fs/reiserfs/super.c	Wed Oct 30 11:42:36 2002
+++ edited/fs/reiserfs/super.c	Tue Mar 11 10:49:36 2003
@@ -1121,17 +1121,16 @@
     int old_magic;
     struct reiserfs_super_block * rs;
 
-
     memset (&s->u.reiserfs_sb, 0, sizeof (struct reiserfs_sb_info));
 
     /* Set default values for options: non-aggressive tails */
     s->u.reiserfs_sb.s_mount_opt = ( 1 << REISERFS_SMALLTAIL );
-    /* default block allocator option: skip_busy */
-    s->u.reiserfs_sb.s_alloc_options.bits = ( 1 << 5);
     /* If file grew past 4 blocks, start preallocation blocks for it. */
     s->u.reiserfs_sb.s_alloc_options.preallocmin = 4;
     /* Preallocate by 8 blocks (9-1) at once */
     s->u.reiserfs_sb.s_alloc_options.preallocsize = 9;
+    /* setup default block allocator options */
+    reiserfs_init_alloc_options(s);
 
     if (reiserfs_parse_options (s, (char *) data, &(s->u.reiserfs_sb.s_mount_opt), &blocks) == 0) {
 	return NULL;
--- 1.26/include/linux/reiserfs_fs.h	Mon Jan 20 05:19:30 2003
+++ edited/include/linux/reiserfs_fs.h	Tue Mar 11 10:52:16 2003
@@ -1953,6 +1953,7 @@
 typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
 
 int reiserfs_parse_alloc_options (struct super_block *, char *);
+void reiserfs_init_alloc_options (struct super_block *s);
 int is_reusable (struct super_block * s, unsigned long block, int bit_value);
 void reiserfs_free_block (struct reiserfs_transaction_handle *th, unsigned long);
 int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);

[-- Attachment #3: group-alloc-4.diff --]
[-- Type: text/plain, Size: 12405 bytes --]

diff -ur linux.beta3.1/fs/reiserfs/bitmap.c linux.beta3/fs/reiserfs/bitmap.c
--- linux.beta3.1/fs/reiserfs/bitmap.c	2003-03-10 20:36:37.000000000 -0500
+++ linux.beta3/fs/reiserfs/bitmap.c	2003-03-11 09:50:26.000000000 -0500
@@ -34,6 +34,8 @@
 #define  _ALLOC_hashed_formatted_nodes 7
 #define  _ALLOC_old_way 8
 #define  _ALLOC_hundredth_slices 9
+#define  _ALLOC_dirid_groups 10
+#define  _ALLOC_oid_groups 11
 
 #define  concentrating_formatted_nodes(s)     test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
 #define  displacing_large_files(s)            test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
@@ -261,8 +263,18 @@
     get_bit_address (s, *start, &bm, &off);
     get_bit_address (s, finish, &end_bm, &end_off);
 
-    // With this option set first we try to find a bitmap that is at least 10%
-    // free, and if that fails, then we fall back to old whole bitmap scanning
+    /* When the bitmap is more than 10% free, anyone can allocate.
+     * When it's less than 10% free, only files that already use the
+     * bitmap are allowed. Once we pass 80% full, this restriction
+     * is lifted.
+     *
+     * We do this so that files that grow later still have space close to
+     * their original allocation. This improves locality, and presumably
+     * performance as a result.
+     *
+     * This is only an allocation policy and does not make up for getting a
+     * bad hint. Decent hinting must be implemented for this to work well.
+     */
     if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) {
 	for (;bm < end_bm; bm++, off = 0) {
 	    if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 )
@@ -408,6 +420,16 @@
   }
 }
 
+void reiserfs_init_alloc_options (struct super_block *s)
+{
+    set_bit (_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s));
+    set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
+    set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
+    s->u.reiserfs_sb.s_alloc_options.border = 5;
+
+    reiserfs_warning ("allocator defaults = [%08x]\n", SB_ALLOC_OPTS(s));
+}
+
 /* block allocator related options are parsed here */
 int reiserfs_parse_alloc_options(struct super_block * s, char * options)
 {
@@ -451,6 +473,15 @@
 	    continue;
 	}
 
+        if (!strcmp(this_char, "dirid_groups")) {
+	    SET_OPTION(dirid_groups);
+	    continue;
+        }
+        if (!strcmp(this_char, "oid_groups")) {
+	    SET_OPTION(oid_groups);
+	    continue;
+        }
+
 	if (!strcmp(this_char, "hashed_formatted_nodes")) {
 	    SET_OPTION(hashed_formatted_nodes);
 	    continue;
@@ -492,6 +523,7 @@
 	return 1;
     }
 
+    reiserfs_warning ("allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
     return 0;
 }
 
@@ -514,17 +546,81 @@
     hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
 }
 
-static void inline get_left_neighbor(reiserfs_blocknr_hint_t *hint)
+/* 
+ * Relocation based on dirid, hashing them into a given bitmap block
+ * files. Formatted nodes are unaffected, a seperate policy covers them 
+ */
+static void 
+dirid_groups (reiserfs_blocknr_hint_t *hint)
+{
+    if (hint->inode) {
+        char * hash_in = NULL;
+	unsigned long hash;
+	unsigned long mask;
+	__u32 dirid;
+
+	dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
+	hash_in = (char *)(&dirid);
+
+	/* effectively turns the disk in 64MB groups (4k blocksize), 
+	 * but the bigger your disk is the less likely hash collisions
+	 * are, leading to dynamically bigger groups based on 
+	 * your disk size
+	 */
+	mask = (hint->inode->i_sb->s_blocksize << 2) - 1;
+	hash = keyed_hash(hash_in, 4);
+	hash = hint->beg + hash % (hint->end - hint->beg);
+	hash &= ~mask;
+
+	hint->search_start = hash;
+    }
+}
+
+/* 
+ * Relocation based on oid, hashing them into a given bitmap block
+ * files. Formatted nodes are unaffected, a seperate policy covers them 
+ */
+static void 
+oid_groups (reiserfs_blocknr_hint_t *hint)
+{
+    if (hint->inode) {
+        char * hash_in = NULL;
+	unsigned long hash;
+	unsigned long mask;
+	__u32 oid;
+
+	oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
+	hash_in = (char *)(&oid);
+
+	/* effectively turns the disk in 64MB groups (4k blocksize), 
+	 * but the bigger your disk is the less likely hash collisions
+	 * are, leading to dynamically bigger groups based on 
+	 * your disk size
+	 */
+	mask = (hint->inode->i_sb->s_blocksize << 2) - 1;
+	hash = keyed_hash(hash_in, 4);
+	hash = hint->beg + hash % (hint->end - hint->beg);
+	hash &= ~mask;
+
+	hint->search_start = hash;
+    }
+}
+
+/* returns 1 if it finds an indirect item and gets valid hint info
+ * from it, otherwise 0
+ */
+static int get_left_neighbor(reiserfs_blocknr_hint_t *hint)
 {
     struct path * path;
     struct buffer_head * bh;
     struct item_head * ih;
     int pos_in_item;
     __u32 * item;
+    int ret = 0;
 
     if (!hint->path)		/* reiserfs code can call this function w/o pointer to path
 				 * structure supplied; then we rely on supplied search_start */
-	return;
+	return 0;
 
     path = hint->path;
     bh = get_last_bh(path);
@@ -545,6 +641,7 @@
 	    int t=get_block_num(item,pos_in_item);
 	    if (t) {
 		hint->search_start = t;
+		ret = 1;
 		break;
 	    }
 	    pos_in_item --;
@@ -553,7 +650,7 @@
     }
 
     /* does result value fit into specified region? */
-    return;
+    return ret;
 }
 
 /* should be, if formatted node, then try to put on first part of the device
@@ -655,6 +752,7 @@
     struct super_block *s = hint->th->t_super;
     hint->beg = 0;
     hint->end = SB_BLOCK_COUNT(s) - 1;
+    int unfm_hint;
 
     /* This is former border algorithm. Now with tunable border offset */
     if (concentrating_formatted_nodes(s))
@@ -683,19 +781,14 @@
 	return;
     }
 
-    /* attempt to copy a feature from old block allocator code */
-    if (TEST_OPTION(old_hashed_relocation, s) && !hint->formatted_node) {
-	old_hashed_relocation(hint);
-    }
-
     /* if none of our special cases is relevant, use the left neighbor in the
        tree order of the new node we are allocating for */
     if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) {
-	hash_formatted_node(hint);
+        hash_formatted_node(hint);
 	return;
-    } 
+    }
 
-    get_left_neighbor(hint);
+    unfm_hint = get_left_neighbor(hint);
 
     /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
        new blocks are displaced based on directory ID. Also, if suggested search_start
@@ -720,10 +813,29 @@
 	return;
     }
 
-    if (TEST_OPTION(old_hashed_relocation, s))
+    /* old_hashed_relocation only works on unformatted */
+    if (!unfm_hint && !hint->formatted_node &&
+        TEST_OPTION(old_hashed_relocation, s))
+    {
 	old_hashed_relocation(hint);
-    if (TEST_OPTION(new_hashed_relocation, s))
+    }
+    /* new_hashed_relocation works with both formatted/unformatted nodes */
+    if ((!unfm_hint || hint->formatted_node) &&
+        TEST_OPTION(new_hashed_relocation, s))
+    {
 	new_hashed_relocation(hint);
+    }
+    /* dirid grouping works only on unformatted nodes */
+    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s))
+    {
+        dirid_groups(hint);
+    }
+
+    /* oid grouping works only on unformatted nodes */
+    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s))
+    {
+        oid_groups(hint);
+    }
     return;
 }
 
@@ -787,7 +899,7 @@
     struct super_block *s = hint->th->t_super;
     b_blocknr_t start = hint->search_start;
     b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
-    int second_pass = 0;
+    int passno = 0;
     int nr_allocated = 0;
 
     determine_prealloc_size(hint);
@@ -809,30 +921,44 @@
 	}
     }
 
-    while((nr_allocated
-	  += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish,
-					  amount_needed - nr_allocated, hint->prealloc_size))
-	  < amount_needed) {
-
-	/* not all blocks were successfully allocated yet*/
-	if (second_pass) {	/* it was a second pass; we must free all blocks */
+    do {
+        switch (passno++) {
+        case 0: /* Search from hint->search_start to end of disk */
+            start = hint->search_start;
+            finish = SB_BLOCK_COUNT(s) - 1;
+            break;
+        case 1: /* Search from hint->beg to hint->search_start */
+            start = hint->beg;
+            finish = hint->search_start;
+            break;
+        case 2: /* Last chance: Search from 0 to hint->beg */
+            start = 0;
+            finish = hint->beg;
+            break;
+        default: /* We've tried searching everywhere, not enough space */
 	    if (!hint->formatted_node) {
 #ifdef REISERQUOTA_DEBUG
-		printk(KERN_DEBUG "reiserquota: freeing (nospace) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid);
+                printk(KERN_DEBUG "reiserquota: freeing (nospace) %d blocks id=%u\n",
+                            amount_needed + hint->prealloc_size - nr_allocated,
+                            hint->inode->i_uid);
 #endif
-		DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated);     /* Free not allocated blocks */
+                /* Free not allocated blocks */
+                DQUOT_FREE_BLOCK_NODIRTY(hint->inode,
+                        amount_needed + hint->prealloc_size - nr_allocated);
 	    }
+            /* Free the blocks */
 	    while (nr_allocated --)
-		reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node);
+		reiserfs_free_block(hint->th, hint->inode,
+                                    new_blocknrs[nr_allocated],
+                                    !hint->formatted_node);
+
+            return NO_DISK_SPACE;
+        }
+    } while ((nr_allocated += allocate_without_wrapping_disk (hint,
+                            new_blocknrs + nr_allocated, start, finish,
+                            amount_needed - nr_allocated, hint->prealloc_size))
+                        < amount_needed);
 
-	    return NO_DISK_SPACE;
-	} else {		/* refine search parameters for next pass */
-	    second_pass = 1;
-	    finish = start;
-	    start = 0;
-	    continue;
-	}
-    }
     if ( !hint->formatted_node && amount_needed + hint->prealloc_size > nr_allocated + INODE_INFO(hint->inode)->i_prealloc_count) {
     /* Some of preallocation blocks were not allocated */
 #ifdef REISERQUOTA_DEBUG
diff -ur linux.beta3.1/fs/reiserfs/super.c linux.beta3/fs/reiserfs/super.c
--- linux.beta3.1/fs/reiserfs/super.c	2003-03-10 20:36:37.000000000 -0500
+++ linux.beta3/fs/reiserfs/super.c	2003-03-10 20:37:31.000000000 -0500
@@ -1288,18 +1288,17 @@
     char *jdev_name;
     struct reiserfs_super_block * rs;
 
-
     memset (&s->u.reiserfs_sb, 0, sizeof (struct reiserfs_sb_info));
     /* Set default values for options: non-aggressive tails */
     s->u.reiserfs_sb.s_mount_opt = ( 1 << REISERFS_SMALLTAIL );
-    /* default block allocator option: skip_busy */
-    s->u.reiserfs_sb.s_alloc_options.bits = ( 1 << 5);
     /* If file grew past 4 blocks, start preallocation blocks for it. */
     s->u.reiserfs_sb.s_alloc_options.preallocmin = 4;
     /* Preallocate by 8 blocks (9-1) at once */
     s->u.reiserfs_sb.s_alloc_options.preallocsize = 9;
     /* Initialize the rwsem for xattr dir */
     init_rwsem(&s->u.reiserfs_sb.xattr_dir_sem);
+    /* Setup default block allocator options */
+    reiserfs_init_alloc_options (s);
 
     if (reiserfs_parse_options (s, (char *) data, &(s->u.reiserfs_sb.s_mount_opt), &blocks) == 0) {
       return NULL;
diff -ur linux.beta3.1/include/linux/reiserfs_fs.h linux.beta3/include/linux/reiserfs_fs.h
--- linux.beta3.1/include/linux/reiserfs_fs.h	2003-03-10 20:36:37.000000000 -0500
+++ linux.beta3/include/linux/reiserfs_fs.h	2003-03-10 20:41:55.000000000 -0500
@@ -2228,6 +2228,7 @@
 typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
 
 int reiserfs_parse_alloc_options (struct super_block *, char *);
+void reiserfs_init_alloc_options (struct super_block *s);
 int is_reusable (struct super_block * s, unsigned long block, int bit_value);
 void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *inode, unsigned long, int);
 int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);

[-- Attachment #4: search_reada-4.diff --]
[-- Type: text/plain, Size: 3136 bytes --]

===== fs/reiserfs/stree.c 1.19 vs edited =====
--- 1.19/fs/reiserfs/stree.c	Tue Jul 16 09:58:57 2002
+++ edited/fs/reiserfs/stree.c	Wed Jul 17 12:02:17 2002
@@ -598,26 +598,32 @@
 
 
 
-#ifdef SEARCH_BY_KEY_READA
+#define SEARCH_BY_KEY_READA 32
 
 /* The function is NOT SCHEDULE-SAFE! */
-static void search_by_key_reada (struct super_block * s, int blocknr)
+static void search_by_key_reada (struct super_block * s, 
+                                 struct buffer_head **bh, 
+				 unsigned long *b, int num)
 {
-    struct buffer_head * bh;
+    int i,j;
   
-    if (blocknr == 0)
-	return;
-
-    bh = getblk (s->s_dev, blocknr, s->s_blocksize);
-  
-    if (!buffer_uptodate (bh)) {
-	ll_rw_block (READA, 1, &bh);
+    for (i = 0 ; i < num ; i++) {
+	bh[i] = sb_getblk (s, b[i]);
+	if (buffer_uptodate(bh[i])) {
+	    brelse(bh[i]);
+	    break;
+	}
+	touch_buffer(bh[i]);
+    } 
+    if (i) {
+	ll_rw_block(READA, i, bh);
+    }
+    for(j = 0 ; j < i ; j++) {
+        if (bh[j])
+	    brelse(bh[j]);
     }
-    bh->b_count --;
 }
 
-#endif
-
 /**************************************************************************
  * Algorithm   SearchByKey                                                *
  *             look for item in the Disk S+Tree by its key                *
@@ -660,6 +666,9 @@
     int				n_node_level, n_retval;
     int 			right_neighbor_of_leaf_node;
     int				fs_gen;
+    struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
+    unsigned long      reada_blocks[SEARCH_BY_KEY_READA];
+    int reada_count = 0;
 
 #ifdef CONFIG_REISERFS_CHECK
     int n_repeat_counter = 0;
@@ -694,11 +703,11 @@
 	fs_gen = get_generation (p_s_sb);
 	expected_level --;
 
-#ifdef SEARCH_BY_KEY_READA
-	/* schedule read of right neighbor */
-	search_by_key_reada (p_s_sb, right_neighbor_of_leaf_node);
-#endif
-
+	/* schedule read of right neighbors */
+	if (reada_count) {
+	    search_by_key_reada (p_s_sb, reada_bh, reada_blocks, reada_count);
+	    reada_count = 0;
+	}
 	/* Read the next tree node, and set the last element in the path to
            have a pointer to it. */
 	if ( ! (p_s_bh = p_s_last_element->pe_buffer =
@@ -786,11 +795,19 @@
 	   position in the node. */
 	n_block_number = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position);
 
-#ifdef SEARCH_BY_KEY_READA
-	/* if we are going to read leaf node, then calculate its right neighbor if possible */
-	if (n_node_level == DISK_LEAF_NODE_LEVEL + 1 && p_s_last_element->pe_position < B_NR_ITEMS (p_s_bh))
-	    right_neighbor_of_leaf_node = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position + 1);
-#endif
+	/* if we are going to read leaf node, then try to find good leaves
+	** for read ahead as well.  Don't bother for stat data though
+	*/
+	if (n_node_level == DISK_LEAF_NODE_LEVEL + 1 && 
+	    p_s_last_element->pe_position < B_NR_ITEMS (p_s_bh) &&
+	    !is_statdata_cpu_key(p_s_key))
+	{
+	    int pos = p_s_last_element->pe_position;
+	    int limit = B_NR_ITEMS(p_s_bh);
+	    while(pos <= limit && reada_count < SEARCH_BY_KEY_READA) { 
+	        reada_blocks[reada_count++] = B_N_CHILD_NUM(p_s_bh, pos++);
+	    }
+        }
     }
 }
 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 16:34 [PATCH] various allocator optimizations Chris Mason
@ 2003-03-11 16:42 ` Oleg Drokin
  2003-03-11 17:32   ` Chris Mason
  2003-03-12 19:57   ` Chris Mason
  0 siblings, 2 replies; 42+ messages in thread
From: Oleg Drokin @ 2003-03-11 16:42 UTC (permalink / raw)
  To: Chris Mason; +Cc: reiserfs-list

Hello!

On Tue, Mar 11, 2003 at 11:34:43AM -0500, Chris Mason wrote:

> changes blocknrs_and_prealloc_arrays_from_search_start into three
> passes.  pass1 goes from the hint to the end of the disk, pass2 goes
> from the border to the hint, and pass3 goes from the start of the disk
> to the border.

As you probably remember, we decided to drop border stiff all together
because of all the extra seeking it incurrs.

> Overall, I believe this will significantly improve fragmentation over
> time.  oid_groups should only be used if your FS has a small number of

I hope we won't have read-access speed degradation with these.

Bye,
    Oleg

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 16:42 ` Oleg Drokin
@ 2003-03-11 17:32   ` Chris Mason
  2003-03-11 18:04     ` Oleg Drokin
  2003-03-11 21:42     ` Hans Reiser
  2003-03-12 19:57   ` Chris Mason
  1 sibling, 2 replies; 42+ messages in thread
From: Chris Mason @ 2003-03-11 17:32 UTC (permalink / raw)
  To: Oleg Drokin; +Cc: reiserfs-list

On Tue, 2003-03-11 at 11:42, Oleg Drokin wrote:
> Hello!
> 
> On Tue, Mar 11, 2003 at 11:34:43AM -0500, Chris Mason wrote:
> 
> > changes blocknrs_and_prealloc_arrays_from_search_start into three
> > passes.  pass1 goes from the hint to the end of the disk, pass2 goes
> > from the border to the hint, and pass3 goes from the start of the disk
> > to the border.
> 
> As you probably remember, we decided to drop border stiff all together
> because of all the extra seeking it incurrs.
> 

The border does do extra seeks for some cases (search_reada helps), but
no border at all spreads tree blocks all over.  That too does a lot of
seeks, since leaves and the formatted nodes that point to them might be
on entirely different areas of the disk.

> > Overall, I believe this will significantly improve fragmentation over
> > time.  oid_groups should only be used if your FS has a small number of
> 
> I hope we won't have read-access speed degradation with these.

It does, but so does skip_busy alone.  You don't see the problem with
skip_busy during a mongo run, but run stress.sh -n 1 <data set that uses
50% of the disk> for a few hours and then run mongo again without
deleting the stress.sh data set.

The 2.4.20 default is great on a clean FS but breaks down over time,
just like the 2.4.19 allocator did.  Various people have demonstrated it
with benchmarks.

-chris

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 17:32   ` Chris Mason
@ 2003-03-11 18:04     ` Oleg Drokin
  2003-03-11 19:00       ` Chris Mason
  2003-03-11 21:42     ` Hans Reiser
  1 sibling, 1 reply; 42+ messages in thread
From: Oleg Drokin @ 2003-03-11 18:04 UTC (permalink / raw)
  To: Chris Mason; +Cc: reiserfs-list

Hello!

On Tue, Mar 11, 2003 at 12:32:48PM -0500, Chris Mason wrote:
> > > changes blocknrs_and_prealloc_arrays_from_search_start into three
> > > passes.  pass1 goes from the hint to the end of the disk, pass2 goes
> > > from the border to the hint, and pass3 goes from the start of the disk
> > > to the border.
> > As you probably remember, we decided to drop border stiff all together
> > because of all the extra seeking it incurrs.
> The border does do extra seeks for some cases (search_reada helps), but
> no border at all spreads tree blocks all over.  That too does a lot of

I'd say that no border makes tree blocks to appear near file data locations
(at least at file time creation, items might be shifted away later).

> seeks, since leaves and the formatted nodes that point to them might be
> on entirely different areas of the disk.

Well, leaves _are_ formatted nodes after all ;)

> > > Overall, I believe this will significantly improve fragmentation over
> > > time.  oid_groups should only be used if your FS has a small number of
> > I hope we won't have read-access speed degradation with these.
> It does, but so does skip_busy alone.  You don't see the problem with

But we save on cpu here, I think. No?
I am surprised this is noticeable at all.

> skip_busy during a mongo run, but run stress.sh -n 1 <data set that uses
> 50% of the disk> for a few hours and then run mongo again without
> deleting the stress.sh data set.

Hm.

> The 2.4.20 default is great on a clean FS but breaks down over time,
> just like the 2.4.19 allocator did.  Various people have demonstrated it
> with benchmarks.

Yes, and this is sad. But it appars that almost every FS suffers this problem ;)

Bye,
    Oleg

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 18:04     ` Oleg Drokin
@ 2003-03-11 19:00       ` Chris Mason
  2003-03-11 21:51         ` Hans Reiser
  0 siblings, 1 reply; 42+ messages in thread
From: Chris Mason @ 2003-03-11 19:00 UTC (permalink / raw)
  To: Oleg Drokin; +Cc: reiserfs-list

On Tue, 2003-03-11 at 13:04, Oleg Drokin wrote:
> Hello!
> 
> On Tue, Mar 11, 2003 at 12:32:48PM -0500, Chris Mason wrote:
> > > > changes blocknrs_and_prealloc_arrays_from_search_start into three
> > > > passes.  pass1 goes from the hint to the end of the disk, pass2 goes
> > > > from the border to the hint, and pass3 goes from the start of the disk
> > > > to the border.
> > > As you probably remember, we decided to drop border stiff all together
> > > because of all the extra seeking it incurrs.
> > The border does do extra seeks for some cases (search_reada helps), but
> > no border at all spreads tree blocks all over.  That too does a lot of
> 
> I'd say that no border makes tree blocks to appear near file data locations
> (at least at file time creation, items might be shifted away later).
> 

Well, we know it puts them near some file, but many tree nodes point to
more than one file (especially as you go higher in the tree).  I'm not
really sure if there is a good spot on the disk for them, it seems like
the leaves would benefit the most from being next to the file data they
point to, except for directory item leaves, which should be near the
stat data they point to.

But, my sense is that spreading them over the disk usually puts tree
nodes far apart from other tree nodes, and the extra seeking is why you
can see the performance difference from debugreiserfs.

> > > > Overall, I believe this will significantly improve fragmentation over
> > > > time.  oid_groups should only be used if your FS has a small number of
> > > I hope we won't have read-access speed degradation with these.
> > It does, but so does skip_busy alone.  You don't see the problem with
> 
> But we save on cpu here, I think. No?
> I am surprised this is noticeable at all.
> 

It really depends on the working data set.  If the new things you are
creating are roughly the same size as the holes from stuff you've
deleted, things tend to work out and skip_busy doesn't do too badly. 

This is especially true when your dataset includes lots of files < 64k
or so, since you tend to get a somewhat fragmented first 16k, followed
by two or three chunks of 9 blocks thanks to preallocation.  The fibmap
histogram shows this kind of thing nicely.

As a test, I did a stress.sh -n 20 -s and let it run for a few
iterations.  This filled my disk roughly 20%.

Then I created two 500MB files with dd and measured the fragmentation on
those files.  With skip_busy the 500MB files were 30% fragmented.  With
dirid_groups the 500MB files were 2% fragmented.

> > skip_busy during a mongo run, but run stress.sh -n 1 <data set that uses
> > 50% of the disk> for a few hours and then run mongo again without
> > deleting the stress.sh data set.
> 
> Hm.
> 

Sorry, that should be stress.sh -n 2 or higher.

> > The 2.4.20 default is great on a clean FS but breaks down over time,
> > just like the 2.4.19 allocator did.  Various people have demonstrated it
> > with benchmarks.
> 
> Yes, and this is sad. But it appars that almost every FS suffers this problem ;)

Very true.  I'm hoping we can improve things slightly though ;-)

-chris

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 19:00       ` Chris Mason
@ 2003-03-11 21:51         ` Hans Reiser
  0 siblings, 0 replies; 42+ messages in thread
From: Hans Reiser @ 2003-03-11 21:51 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>On Tue, 2003-03-11 at 13:04, Oleg Drokin wrote:
>  
>
>>Hello!
>>
>>On Tue, Mar 11, 2003 at 12:32:48PM -0500, Chris Mason wrote:
>>    
>>
>>>>>changes blocknrs_and_prealloc_arrays_from_search_start into three
>>>>>passes.  pass1 goes from the hint to the end of the disk, pass2 goes
>>>>>from the border to the hint, and pass3 goes from the start of the disk
>>>>>to the border.
>>>>>          
>>>>>
>>>>As you probably remember, we decided to drop border stiff all together
>>>>because of all the extra seeking it incurrs.
>>>>        
>>>>
>>>The border does do extra seeks for some cases (search_reada helps), but
>>>no border at all spreads tree blocks all over.  That too does a lot of
>>>      
>>>
>>I'd say that no border makes tree blocks to appear near file data locations
>>(at least at file time creation, items might be shifted away later).
>>
>>    
>>
>
>Well, we know it puts them near some file, but many tree nodes point to
>more than one file (especially as you go higher in the tree).  I'm not
>really sure if there is a good spot on the disk for them, it seems like
>the leaves would benefit the most from being next to the file data they
>point to, except for directory item leaves, which should be near the
>stat data they point to.
>
>But, my sense is that spreading them over the disk usually puts tree
>nodes far apart from other tree nodes, and the extra seeking is why you
>can see the performance difference from debugreiserfs.
>
>  
>
>>>>>Overall, I believe this will significantly improve fragmentation over
>>>>>time.  oid_groups should only be used if your FS has a small number of
>>>>>          
>>>>>
>>>>I hope we won't have read-access speed degradation with these.
>>>>        
>>>>
>>>It does, but so does skip_busy alone.  You don't see the problem with
>>>      
>>>
>>But we save on cpu here, I think. No?
>>I am surprised this is noticeable at all.
>>
>>    
>>
>
>It really depends on the working data set.  If the new things you are
>creating are roughly the same size as the holes from stuff you've
>deleted, things tend to work out and skip_busy doesn't do too badly. 
>
>This is especially true when your dataset includes lots of files < 64k
>or so, since you tend to get a somewhat fragmented first 16k, followed
>by two or three chunks of 9 blocks thanks to preallocation.  The fibmap
>histogram shows this kind of thing nicely.
>
>As a test, I did a stress.sh -n 20 -s and let it run for a few
>iterations.  This filled my disk roughly 20%.
>
>Then I created two 500MB files with dd and measured the fragmentation on
>those files.  With skip_busy the 500MB files were 30% fragmented.  With
>dirid_groups the 500MB files were 2% fragmented.
>
>  
>
>>>skip_busy during a mongo run, but run stress.sh -n 1 <data set that uses
>>>50% of the disk> for a few hours and then run mongo again without
>>>deleting the stress.sh data set.
>>>      
>>>
>>Hm.
>>
>>    
>>
>
>Sorry, that should be stress.sh -n 2 or higher.
>
>  
>
>>>The 2.4.20 default is great on a clean FS but breaks down over time,
>>>just like the 2.4.19 allocator did.  Various people have demonstrated it
>>>with benchmarks.
>>>      
>>>
>>Yes, and this is sad. But it appars that almost every FS suffers this problem ;)
>>    
>>
>
>Very true.  I'm hoping we can improve things slightly though ;-)
>
>-chris
>
>
>
>
>
>
>  
>
Let me just add a few words of encouragement.  Perform lots of 
measurements.  They tend to surprise.  Second, it might be that what is 
best depends on the workload, in which case making an option out of it 
is good.

As for why I chose the defaults that I did, which were to do nothing 
complicated except for jeff/oleg's skipping of full bitmaps, keeping 
things simple was the decider wherever nothing was clearcut.

More data would be interesting...., and I will defer judgement on the 
patches until I see empirics.

-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 17:32   ` Chris Mason
  2003-03-11 18:04     ` Oleg Drokin
@ 2003-03-11 21:42     ` Hans Reiser
  2003-03-11 22:25       ` Chris Mason
  2003-03-12  7:14       ` Oleg Drokin
  1 sibling, 2 replies; 42+ messages in thread
From: Hans Reiser @ 2003-03-11 21:42 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>On Tue, 2003-03-11 at 11:42, Oleg Drokin wrote:
>  
>
>>Hello!
>>
>>On Tue, Mar 11, 2003 at 11:34:43AM -0500, Chris Mason wrote:
>>
>>    
>>
>>>changes blocknrs_and_prealloc_arrays_from_search_start into three
>>>passes.  pass1 goes from the hint to the end of the disk, pass2 goes
>>>from the border to the hint, and pass3 goes from the start of the disk
>>>to the border.
>>>      
>>>
>>As you probably remember, we decided to drop border stiff all together
>>because of all the extra seeking it incurrs.
>>
>>    
>>
>
>The border does do extra seeks for some cases (search_reada helps), but
>no border at all spreads tree blocks all over.  That too does a lot of
>seeks, since leaves and the formatted nodes that point to them might be
>on entirely different areas of the disk.
>  
>  
>
>>>Overall, I believe this will significantly improve fragmentation over
>>>time.  oid_groups should only be used if your FS has a small number of
>>>      
>>>
>>I hope we won't have read-access speed degradation with these.
>>    
>>
>
>It does, but so does skip_busy alone.  You don't see the problem with
>skip_busy during a mongo run, but run stress.sh -n 1 <data set that uses
>50% of the disk> for a few hours and then run mongo again without
>deleting the stress.sh data set.
>
>The 2.4.20 default is great on a clean FS but breaks down over time,
>just like the 2.4.19 allocator did.  Various people have demonstrated it
>with benchmarks.
>
>-chris
>
>
>
>
>  
>
Chris, don't you think the right answer would be to take zam's resizer 
and make a defragmenter out of it?

-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 21:42     ` Hans Reiser
@ 2003-03-11 22:25       ` Chris Mason
  2003-03-11 22:39         ` Anders Widman
  2003-03-11 22:46         ` Hans Reiser
  2003-03-12  7:14       ` Oleg Drokin
  1 sibling, 2 replies; 42+ messages in thread
From: Chris Mason @ 2003-03-11 22:25 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Oleg Drokin, reiserfs-list

On Tue, 2003-03-11 at 16:42, Hans Reiser wrote:

> Chris, don't you think the right answer would be to take zam's resizer 
> and make a defragmenter out of it?

Yes and no, for a defrag program to fix things we'd have to agree on an
optimal layout ;-)  Also it assumes the machine has idle time when a
defragment cycle is possible.  For many servers this is entirely
untrue...the oracle boxes I ran didn't have a spare second for something
like a defrag.

We can all agree that fragmentation is bad, but the real question is how
do we group the blocks.  Lets pretend for a minute that fragmentation
isn't an issue at all, and our allocator is perfect.

The optimal grouping for reading/writing files is to have the files you
are going to read/write together in the same area of the disk.

The current default uses the start of the disk as a starting point for
each new file.  This roughly translates to files that are created
together end up in the same part of the disk.  As long as you always
access files in roughly the same order that you create them, it performs
pretty well.

But if a process creates dirA/file1 and then dirB/file2, file1 and file2
are going to be together on the disk.  If file1 tends to be used along
with all the other files in dirA, performance will suffer because we've
got to seek from all the other files in dirA over to file1.

And this is what we see over time, our performance decreases as people
add files onto their directories and shift things around.  Especially on
multi-user systems files are rarely accessed in the same order they were
created.

What we need is a knob for the admin to use to suggest 'I'm probably
going to access these files together'.  The only one I can think if is
the directory itself, but it isn't optimal either since subdirectories
are frequently accessed with their parents and with other subdirs.

-chris

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 22:25       ` Chris Mason
@ 2003-03-11 22:39         ` Anders Widman
  2003-03-11 22:54           ` Hans Reiser
  2003-03-11 22:46         ` Hans Reiser
  1 sibling, 1 reply; 42+ messages in thread
From: Anders Widman @ 2003-03-11 22:39 UTC (permalink / raw)
  To: Chris Mason; +Cc: Hans Reiser, Oleg Drokin, reiserfs-list

> On Tue, 2003-03-11 at 16:42, Hans Reiser wrote:

>> Chris, don't you think the right answer would be to take zam's resizer
>> and make a defragmenter out of it?

> Yes and no, for a defrag program to fix things we'd have to agree on an
> optimal layout ;-)  Also it assumes the machine has idle time when a
> defragment cycle is possible.  For many servers this is entirely
> untrue...the oracle boxes I ran didn't have a spare second for something
> like a defrag.

True,  But  then the server is to heavily loaded and you have probably
other issues you need to attend to?

> We can all agree that fragmentation is bad, but the real question is how
> do we group the blocks.  Lets pretend for a minute that fragmentation
> isn't an issue at all, and our allocator is perfect.

If that only was true :)

> The optimal grouping for reading/writing files is to have the files you
> are going to read/write together in the same area of the disk.

> The current default uses the start of the disk as a starting point for
> each new file.  This roughly translates to files that are created
> together end up in the same part of the disk.  As long as you always
> access files in roughly the same order that you create them, it performs
> pretty well.

> But if a process creates dirA/file1 and then dirB/file2, file1 and file2
> are going to be together on the disk.  If file1 tends to be used along
> with all the other files in dirA, performance will suffer because we've
> got to seek from all the other files in dirA over to file1.

> And this is what we see over time, our performance decreases as people
> add files onto their directories and shift things around.  Especially on
> multi-user systems files are rarely accessed in the same order they were
> created.

> What we need is a knob for the admin to use to suggest 'I'm probably
> going to access these files together'.  The only one I can think if is
> the directory itself, but it isn't optimal either since subdirectories
> are frequently accessed with their parents and with other subdirs.

> -chris

Would  it not be possible to have a daemon running and collecting data
on   how   files  are  being  accessed  and  written.  Then  either  a
defragmenter or the fs itself can optimize the on-disk layout?

Also,  would  it  be  possible  to  implement  some  form  of  command
reordering  in  the  fs? It might increase performance where there are
multiple accesses to smaller files on the same volume...

//Anders


   



--------
PGP public key: https://tnonline.net/secure/pgp_key.txt


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 22:39         ` Anders Widman
@ 2003-03-11 22:54           ` Hans Reiser
  2003-03-11 23:19             ` Anders Widman
  0 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-11 22:54 UTC (permalink / raw)
  To: Anders Widman; +Cc: Chris Mason, Oleg Drokin, reiserfs-list

Anders Widman wrote:

>Would  it not be possible to have a daemon running and collecting data
>on   how   files  are  being  accessed  and  written.  Then  either  a
>defragmenter or the fs itself can optimize the on-disk layout?
>

There is an excellent Usenix paper on this that Nikita found, Nikita can 
you find it again?

I would love to have that approach built into reiser4, I think it would 
kick the butt of any other approach.

>
>Also,  would  it  be  possible  to  implement  some  form  of  command
>reordering  in  the  fs? It might increase performance where there are
>multiple accesses to smaller files on the same volume...
>
Can you define where that is better than letting the disk driver do the 
reordering?  (I am not saying yes or no, it is a genuine question.)

>
>//Anders
>
>
>   
>
>
>
>--------
>PGP public key: https://tnonline.net/secure/pgp_key.txt
>
>
>
>  
>


-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 22:54           ` Hans Reiser
@ 2003-03-11 23:19             ` Anders Widman
  2003-03-12  7:15               ` Oleg Drokin
  0 siblings, 1 reply; 42+ messages in thread
From: Anders Widman @ 2003-03-11 23:19 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Chris Mason, Oleg Drokin, reiserfs-list

>>Would  it not be possible to have a daemon running and collecting data
>>on   how   files  are  being  accessed  and  written.  Then  either  a
>>defragmenter or the fs itself can optimize the on-disk layout?
>>
> 
> There is an excellent Usenix paper on this that Nikita found, Nikita can 
> you find it again?
> 
> I would love to have that approach built into reiser4, I think it would 
> kick the butt of any other approach.

>>
>>Also,  would  it  be  possible  to  implement  some  form  of  command
>>reordering  in  the  fs? It might increase performance where there are
>>multiple accesses to smaller files on the same volume...
>>
> Can you define where that is better than letting the disk driver do the
> reordering?  (I am not saying yes or no, it is a genuine question.)

1) Not  all device drivers have command queuing and reordering. Mainly
   on  IDE  systems there is little support for TCQ - At least at this
   moment.  Also,  there  might  be cases where users run over devices
   other  than hard drives or on logical volumes such as MD or LVM. If
   writing and reading operations were aware of the physical layout of
   a  spanned volume, then speed under multi user environment could be
   considerably enhanced.

2) The  disk  driver  has  no  knowledge  of  the above filesystem and
   its  files.  In  case  of several concurrent writes of lots smaller
   files  the  fs  could  allow for caching them and write them out in
   sequence instead of parallel. Space might be more efficiently used,
   and  fragments avoided to some extent. It could perhaps be combined
   with the usage logging deamon suggested in earlier mail.

Maybe I am totally off here :)

//Anders

--------
PGP public key: https://tnonline.net/secure/pgp_key.txt

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 23:19             ` Anders Widman
@ 2003-03-12  7:15               ` Oleg Drokin
  0 siblings, 0 replies; 42+ messages in thread
From: Oleg Drokin @ 2003-03-12  7:15 UTC (permalink / raw)
  To: Anders Widman; +Cc: Hans Reiser, Chris Mason, reiserfs-list

Hello!

On Wed, Mar 12, 2003 at 12:19:37AM +0100, Anders Widman wrote:
    
> 2) The  disk  driver  has  no  knowledge  of  the above filesystem and
>    its  files.  In  case  of several concurrent writes of lots smaller
>    files  the  fs  could  allow for caching them and write them out in
>    sequence instead of parallel. Space might be more efficiently used,

Hm, I thought that's what we have page cache for ;)

Bye,
    Oleg

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 22:25       ` Chris Mason
  2003-03-11 22:39         ` Anders Widman
@ 2003-03-11 22:46         ` Hans Reiser
  2003-03-12  1:48           ` Chris Mason
  1 sibling, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-11 22:46 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>On Tue, 2003-03-11 at 16:42, Hans Reiser wrote:
>
>  
>
>>Chris, don't you think the right answer would be to take zam's resizer 
>>and make a defragmenter out of it?
>>    
>>
>
>Yes and no, for a defrag program to fix things we'd have to agree on an
>optimal layout ;-)  Also it assumes the machine has idle time when a
>defragment cycle is possible. 
>
No, it assumes that 80% of files don't move during the course of a week, 
so if defrag takes a week, it still adds value.

> For many servers this is entirely
>untrue...the oracle boxes I ran didn't have a spare second for something
>like a defrag.
>
>We can all agree that fragmentation is bad, but the real question is how
>do we group the blocks.  Lets pretend for a minute that fragmentation
>isn't an issue at all, and our allocator is perfect.
>
>The optimal grouping for reading/writing files is to have the files you
>are going to read/write together in the same area of the disk.
>
>The current default uses the start of the disk as a starting point for
>each new file.
>
No, it uses the left neighbor in the tree.  Please correct me if I am 
wrong, because if I am wrong we have a bug.

>  This roughly translates to files that are created
>together end up in the same part of the disk.  As long as you always
>access files in roughly the same order that you create them, it performs
>pretty well.
>
>But if a process creates dirA/file1 and then dirB/file2, file1 and file2
>are going to be together on the disk.  If file1 tends to be used along
>with all the other files in dirA, performance will suffer because we've
>got to seek from all the other files in dirA over to file1.
>
If I understand your intended statement, you meant to say

If file1 tends to be used along
with all the other files in dirA, performance will suffer because we've
got to seek over all  other files in dirB when going from file1 to the next file in dirA..

>
>And this is what we see over time, our performance decreases as people
>add files onto their directories and shift things around.  Especially on
>multi-user systems files are rarely accessed in the same order they were
>created.
>
>What we need is a knob for the admin to use to suggest 'I'm probably
>going to access these files together'.  The only one I can think if is
>the directory itself, but it isn't optimal either since subdirectories
>are frequently accessed with their parents and with other subdirs.
>
In 1994, we realized that putting the grandparent directory into the key 
was infeasible, and decided we would just leave it for some future 
repacker to try to locate subdirectories of the same directory 
together.  We decided that locating files within the same directory near 
each other was good enough.  I still think this is correct.

>
>-chris
>
>
>
>  
>


-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 22:46         ` Hans Reiser
@ 2003-03-12  1:48           ` Chris Mason
  2003-03-12  7:12             ` Oleg Drokin
  2003-03-12 11:12             ` Hans Reiser
  0 siblings, 2 replies; 42+ messages in thread
From: Chris Mason @ 2003-03-12  1:48 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Oleg Drokin, reiserfs-list

On Tue, 2003-03-11 at 17:46, Hans Reiser wrote:
> Chris Mason wrote:
> 
> >On Tue, 2003-03-11 at 16:42, Hans Reiser wrote:
> >
> >  
> >
> >>Chris, don't you think the right answer would be to take zam's resizer 
> >>and make a defragmenter out of it?
> >>    
> >>
> >
> >Yes and no, for a defrag program to fix things we'd have to agree on an
> >optimal layout ;-)  Also it assumes the machine has idle time when a
> >defragment cycle is possible. 
> >
> No, it assumes that 80% of files don't move during the course of a week, 
> so if defrag takes a week, it still adds value.
> 
I ran a number of systems where this wasn't true.  Spool files are
constantly appended to or deleted by pop/imap servers.  Idle time is
important, between normal operations and backups it is almost impossible
to find a time when big servers have disk bandwidth to spare.

I like the idea of a dynamic in-kernel fragmentation tool though, you
mark a file as being in need of reallocation, and it happens before io
or something (hand waving is fun).

> > For many servers this is entirely
> >untrue...the oracle boxes I ran didn't have a spare second for something
> >like a defrag.
> >
> >We can all agree that fragmentation is bad, but the real question is how
> >do we group the blocks.  Lets pretend for a minute that fragmentation
> >isn't an issue at all, and our allocator is perfect.
> >
> >The optimal grouping for reading/writing files is to have the files you
> >are going to read/write together in the same area of the disk.
> >
> >The current default uses the start of the disk as a starting point for
> >each new file.
> >
> No, it uses the left neighbor in the tree.  Please correct me if I am 
> wrong, because if I am wrong we have a bug.
> 

It uses the left neighbor in the tree once the file has a starting
block.  But the first starting block comes by searching from block #0.

> In 1994, we realized that putting the grandparent directory into the key 
> was infeasible, and decided we would just leave it for some future 
> repacker to try to locate subdirectories of the same directory 
> together.  We decided that locating files within the same directory near 
> each other was good enough.  I still think this is correct.

I agree, the grandparent alone doesn't help...it's a recursive problem.

-chris



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12  1:48           ` Chris Mason
@ 2003-03-12  7:12             ` Oleg Drokin
  2003-03-12 13:31               ` Chris Mason
  2003-03-12 11:12             ` Hans Reiser
  1 sibling, 1 reply; 42+ messages in thread
From: Oleg Drokin @ 2003-03-12  7:12 UTC (permalink / raw)
  To: Chris Mason; +Cc: Hans Reiser, reiserfs-list

Hello!

On Tue, Mar 11, 2003 at 08:48:59PM -0500, Chris Mason wrote:

> > >The current default uses the start of the disk as a starting point for
> > >each new file.
> > No, it uses the left neighbor in the tree.  Please correct me if I am 
> > wrong, because if I am wrong we have a bug.
> It uses the left neighbor in the tree once the file has a starting
> block.  But the first starting block comes by searching from block #0.

No, this is not correct.
We look at the left neighbor.
See the get_left_neighbor() in bitmap.c,
first we make the search_start to be a block number of last path
element (if there is path at all of course which may be not there in some
cases), ans then check if we are allocating indirect block and if there is
non zero element in current indirect item in path, if there is non-zero element,
we take it as new search_start, otherwise it still contains blocknumber of
left neighbor.

Bye,
    Oleg

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12  7:12             ` Oleg Drokin
@ 2003-03-12 13:31               ` Chris Mason
  2003-03-12 14:00                 ` Hans Reiser
  0 siblings, 1 reply; 42+ messages in thread
From: Chris Mason @ 2003-03-12 13:31 UTC (permalink / raw)
  To: Oleg Drokin; +Cc: Hans Reiser, reiserfs-list

On Wed, 2003-03-12 at 02:12, Oleg Drokin wrote:

> > > No, it uses the left neighbor in the tree.  Please correct me if I am 
> > > wrong, because if I am wrong we have a bug.
> > It uses the left neighbor in the tree once the file has a starting
> > block.  But the first starting block comes by searching from block #0.
> 
> No, this is not correct.
> We look at the left neighbor.
> See the get_left_neighbor() in bitmap.c,
> first we make the search_start to be a block number of last path
> element (if there is path at all of course which may be not there in some
> cases), ans then check if we are allocating indirect block and if there is
> non zero element in current indirect item in path, if there is non-zero element,
> we take it as new search_start, otherwise it still contains blocknumber of
> left neighbor.

Right, I missed the hint->search_start = bh->b_blocknr for when we don't
find an indirect item.

This is better, so the search will start from the bh with the stat data
for a new file, but it depends on the last path element being in a good
position relative to the rest of the files in the directory.

-chris





^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 13:31               ` Chris Mason
@ 2003-03-12 14:00                 ` Hans Reiser
  2003-03-12 14:05                   ` Oleg Drokin
  0 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-12 14:00 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>On Wed, 2003-03-12 at 02:12, Oleg Drokin wrote:
>
>  
>
>>>>No, it uses the left neighbor in the tree.  Please correct me if I am 
>>>>wrong, because if I am wrong we have a bug.
>>>>        
>>>>
>>>It uses the left neighbor in the tree once the file has a starting
>>>block.  But the first starting block comes by searching from block #0.
>>>      
>>>
>>No, this is not correct.
>>We look at the left neighbor.
>>See the get_left_neighbor() in bitmap.c,
>>first we make the search_start to be a block number of last path
>>element (if there is path at all of course which may be not there in some
>>cases), ans then check if we are allocating indirect block and if there is
>>non zero element in current indirect item in path, if there is non-zero element,
>>we take it as new search_start, otherwise it still contains blocknumber of
>>left neighbor.
>>    
>>
>
>Right, I missed the hint->search_start = bh->b_blocknr for when we don't
>find an indirect item.
>
>This is better, so the search will start from the bh with the stat data
>for a new file, but it depends on the last path element being in a good
>position relative to the rest of the files in the directory.
>
Where other than near the stat data do you propose?

>
>-chris
>
>
>
>
>
>
>  
>


-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 14:00                 ` Hans Reiser
@ 2003-03-12 14:05                   ` Oleg Drokin
  2003-03-12 14:08                     ` Hans Reiser
  0 siblings, 1 reply; 42+ messages in thread
From: Oleg Drokin @ 2003-03-12 14:05 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Chris Mason, reiserfs-list

Hello!

On Wed, Mar 12, 2003 at 05:00:32PM +0300, Hans Reiser wrote:

> >Right, I missed the hint->search_start = bh->b_blocknr for when we don't
> >find an indirect item.
> >This is better, so the search will start from the bh with the stat data
> >for a new file, but it depends on the last path element being in a good
> >position relative to the rest of the files in the directory.
> Where other than near the stat data do you propose?

The permanent problem is that metadata may move inside of the tree from one
block to another.
So after some time, the blockin which stat data was inserted may move away from
optimal file location.

Bye,
    Oleg 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 14:05                   ` Oleg Drokin
@ 2003-03-12 14:08                     ` Hans Reiser
  2003-03-12 14:17                       ` Oleg Drokin
  0 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-12 14:08 UTC (permalink / raw)
  To: Oleg Drokin; +Cc: Chris Mason, reiserfs-list

Oleg Drokin wrote:

>Hello!
>
>On Wed, Mar 12, 2003 at 05:00:32PM +0300, Hans Reiser wrote:
>
>  
>
>>>Right, I missed the hint->search_start = bh->b_blocknr for when we don't
>>>find an indirect item.
>>>This is better, so the search will start from the bh with the stat data
>>>for a new file, but it depends on the last path element being in a good
>>>position relative to the rest of the files in the directory.
>>>      
>>>
>>Where other than near the stat data do you propose?
>>    
>>
>
>The permanent problem is that metadata may move inside of the tree from one
>block to another.
>So after some time, the blockin which stat data was inserted may move away from
>optimal file location.
>
>Bye,
>    Oleg 
>
>
>  
>
Sounds like you need a repacker and allocate on flush....

-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 14:08                     ` Hans Reiser
@ 2003-03-12 14:17                       ` Oleg Drokin
  2003-03-12 19:22                         ` Hans Reiser
  0 siblings, 1 reply; 42+ messages in thread
From: Oleg Drokin @ 2003-03-12 14:17 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Chris Mason, reiserfs-list

Hello!

On Wed, Mar 12, 2003 at 05:08:21PM +0300, Hans Reiser wrote:
> >>>Right, I missed the hint->search_start = bh->b_blocknr for when we don't
> >>>find an indirect item.
> >>>This is better, so the search will start from the bh with the stat data
> >>>for a new file, but it depends on the last path element being in a good
> >>>position relative to the rest of the files in the directory.
> >>Where other than near the stat data do you propose?
> >The permanent problem is that metadata may move inside of the tree from one
> >block to another.
> >So after some time, the blockin which stat data was inserted may move away 
> >from
> >optimal file location.
> Sounds like you need a repacker and allocate on flush....

True for repacker. I do not see for allocate on flush will help.
I see how relocate on flush may help ;)

Bye,
    Oleg

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 14:17                       ` Oleg Drokin
@ 2003-03-12 19:22                         ` Hans Reiser
  2003-03-13  6:11                           ` Oleg Drokin
  0 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-12 19:22 UTC (permalink / raw)
  To: Oleg Drokin; +Cc: Chris Mason, reiserfs-list

Oleg Drokin wrote:

>Hello!
>
>On Wed, Mar 12, 2003 at 05:08:21PM +0300, Hans Reiser wrote:
>  
>
>>>>>Right, I missed the hint->search_start = bh->b_blocknr for when we don't
>>>>>find an indirect item.
>>>>>This is better, so the search will start from the bh with the stat data
>>>>>for a new file, but it depends on the last path element being in a good
>>>>>position relative to the rest of the files in the directory.
>>>>>          
>>>>>
>>>>Where other than near the stat data do you propose?
>>>>        
>>>>
>>>The permanent problem is that metadata may move inside of the tree from one
>>>block to another.
>>>So after some time, the blockin which stat data was inserted may move away 
>>>from
>>>optimal file location.
>>>      
>>>
>>Sounds like you need a repacker and allocate on flush....
>>    
>>
>
>True for repacker. I do not see for allocate on flush will help.
>I see how relocate on flush may help ;)
>
>Bye,
>    Oleg
>
>
>  
>
If you don't allocate it until flush, you don't need to reallocate at 
flush time...;-)

-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 19:22                         ` Hans Reiser
@ 2003-03-13  6:11                           ` Oleg Drokin
  2003-03-13 12:06                             ` Hans Reiser
  0 siblings, 1 reply; 42+ messages in thread
From: Oleg Drokin @ 2003-03-13  6:11 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Chris Mason, reiserfs-list

Hello!

On Wed, Mar 12, 2003 at 10:22:49PM +0300, Hans Reiser wrote:

> >True for repacker. I do not see for allocate on flush will help.
> >I see how relocate on flush may help ;)
> If you don't allocate it until flush, you don't need to reallocate at 
> flush time...;-)

But what if we have some free space in already existing node, and new file information
perfectly fits in there?

Bye,
    Oleg

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-13  6:11                           ` Oleg Drokin
@ 2003-03-13 12:06                             ` Hans Reiser
  2003-03-13 12:10                               ` Oleg Drokin
  0 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-13 12:06 UTC (permalink / raw)
  To: Oleg Drokin; +Cc: Chris Mason, reiserfs-list, Alexander Zarochentcev

Oleg Drokin wrote:

>Hello!
>
>On Wed, Mar 12, 2003 at 10:22:49PM +0300, Hans Reiser wrote:
>
>  
>
>>>True for repacker. I do not see for allocate on flush will help.
>>>I see how relocate on flush may help ;)
>>>      
>>>
>>If you don't allocate it until flush, you don't need to reallocate at 
>>flush time...;-)
>>    
>>
>
>But what if we have some free space in already existing node, and new file information
>perfectly fits in there?
>
>Bye,
>    Oleg
>
>
>  
>
We reallocate dirty nodes that are not in the most optimal location when 
we flush them.

-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-13 12:06                             ` Hans Reiser
@ 2003-03-13 12:10                               ` Oleg Drokin
  0 siblings, 0 replies; 42+ messages in thread
From: Oleg Drokin @ 2003-03-13 12:10 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Chris Mason, reiserfs-list, Alexander Zarochentcev

Hello!

On Thu, Mar 13, 2003 at 03:06:33PM +0300, Hans Reiser wrote:
> >>>True for repacker. I do not see for allocate on flush will help.
> >>>I see how relocate on flush may help ;)
> >>If you don't allocate it until flush, you don't need to reallocate at 
> >>flush time...;-)
> >But what if we have some free space in already existing node, and new file 
> >information
> >perfectly fits in there?
> We reallocate dirty nodes that are not in the most optimal location when 
> we flush them.

Yes, that's what I am saying. Relocation on flush (for existing node).
Allocation on flush (for new nodes) won't help in above-described scenario.

Bye,
    Oleg

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12  1:48           ` Chris Mason
  2003-03-12  7:12             ` Oleg Drokin
@ 2003-03-12 11:12             ` Hans Reiser
  2003-03-12 13:35               ` Chris Mason
  1 sibling, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-12 11:12 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>On Tue, 2003-03-11 at 17:46, Hans Reiser wrote:
>  
>
>>Chris Mason wrote:
>>
>>    
>>
>>>On Tue, 2003-03-11 at 16:42, Hans Reiser wrote:
>>>
>>> 
>>>
>>>      
>>>
>>>>Chris, don't you think the right answer would be to take zam's resizer 
>>>>and make a defragmenter out of it?
>>>>   
>>>>
>>>>        
>>>>
>>>Yes and no, for a defrag program to fix things we'd have to agree on an
>>>optimal layout ;-)  Also it assumes the machine has idle time when a
>>>defragment cycle is possible. 
>>>
>>>      
>>>
>>No, it assumes that 80% of files don't move during the course of a week, 
>>so if defrag takes a week, it still adds value.
>>
>>    
>>
>I ran a number of systems where this wasn't true.  Spool files are
>constantly appended to or deleted by pop/imap servers.  Idle time is
>important, between normal operations and backups it is almost impossible
>to find a time when big servers have disk bandwidth to spare.
>
>I like the idea of a dynamic in-kernel fragmentation tool though, you
>mark a file as being in need of reallocation, and it happens before io
>or something (hand waving is fun).
>
You mean you like the allocate on flush we do in reiser4?

Perhaps I should say only that for 80% of machines 80% of the files 
never move during the course of a week.;-)

allocate on flush will improve a lot of the cases, and an online 
repacker will improve a lot of the ones that allocate on flush does not 
improve.

-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 11:12             ` Hans Reiser
@ 2003-03-12 13:35               ` Chris Mason
  2003-03-12 14:03                 ` Hans Reiser
  0 siblings, 1 reply; 42+ messages in thread
From: Chris Mason @ 2003-03-12 13:35 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Oleg Drokin, reiserfs-list

On Wed, 2003-03-12 at 06:12, Hans Reiser wrote:

> >I like the idea of a dynamic in-kernel fragmentation tool though, you
> >mark a file as being in need of reallocation, and it happens before io
> >or something (hand waving is fun).
> >
> You mean you like the allocate on flush we do in reiser4?
> 
> Perhaps I should say only that for 80% of machines 80% of the files 
> never move during the course of a week.;-)
> 
> allocate on flush will improve a lot of the cases, and an online 
> repacker will improve a lot of the ones that allocate on flush does not 
> improve.

None of which solves the question of how do we lay things out on disk
;-)

-chris



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 13:35               ` Chris Mason
@ 2003-03-12 14:03                 ` Hans Reiser
  0 siblings, 0 replies; 42+ messages in thread
From: Hans Reiser @ 2003-03-12 14:03 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>On Wed, 2003-03-12 at 06:12, Hans Reiser wrote:
>
>  
>
>>>I like the idea of a dynamic in-kernel fragmentation tool though, you
>>>mark a file as being in need of reallocation, and it happens before io
>>>or something (hand waving is fun).
>>>
>>>      
>>>
>>You mean you like the allocate on flush we do in reiser4?
>>
>>Perhaps I should say only that for 80% of machines 80% of the files 
>>never move during the course of a week.;-)
>>
>>allocate on flush will improve a lot of the cases, and an online 
>>repacker will improve a lot of the ones that allocate on flush does not 
>>improve.
>>    
>>
>
>None of which solves the question of how do we lay things out on disk
>;-)
>
>-chris
>
>
>
>
>  
>
Well, okay, so what are you proposing and what are your benchmarks?  
This is an area where the algorithms are simple but their consequences 
are just guesses until they are measured.

-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 21:42     ` Hans Reiser
  2003-03-11 22:25       ` Chris Mason
@ 2003-03-12  7:14       ` Oleg Drokin
  1 sibling, 0 replies; 42+ messages in thread
From: Oleg Drokin @ 2003-03-12  7:14 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Chris Mason, reiserfs-list

Hello!

On Wed, Mar 12, 2003 at 12:42:37AM +0300, Hans Reiser wrote:

> Chris, don't you think the right answer would be to take zam's resizer 
> and make a defragmenter out of it?

Well, this should be in kernel, but current resizer (the one that can actually move
the blocks) requires unmounted fs.
Actually we can just create trivial interface for "take this block and move it there
atomically (through journal)", just like ext3 guys did. But that did not bought them
online defragmenter yet, I think ;) 

Bye,
    Oleg

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-11 16:42 ` Oleg Drokin
  2003-03-11 17:32   ` Chris Mason
@ 2003-03-12 19:57   ` Chris Mason
  2003-03-12 20:51     ` Hans Reiser
  1 sibling, 1 reply; 42+ messages in thread
From: Chris Mason @ 2003-03-12 19:57 UTC (permalink / raw)
  To: Oleg Drokin; +Cc: reiserfs-list

[ dirid based grouping spreads things out too much on the disk ]

So, trying to summarize so far, we know we want to group things on the
disk somehow, but we don't really have a good metric to use.  Using the
parent directory objectid as the grouping metric spreads things evenly
out on the disk, resulting in lots of seeks when we read a given
directory tree.  There are some cases where this breaks down badly,
like  lots of directories with just a few files each.

defragmenting tools are nice and all, but before we can write one we
actually have to pick a disk layout for the tool to create ;-)

The btree already has a metric for grouping, it's the packing locality. 
I realized this morning that we just aren't being very smart about how
we use it.  If one directory has 1 thousand subdirs with 1 file each,
should they all be in different packing localities?  probably not.

So I played around with the patch below, it tries to define a span over
which a given packing locality used.  When new files or directories are
created, their packing locality is chosen based on how much of the tree
the packing locality of their parent directory is using.

If the span is small, the packing locality of the parent dir is used as
the packing locality of the new file/dir.  If the span is large, the
objectid of the parent dir is used as the packing locality of the new
file/dir.

The patch below is really rough, but when used with the dirid_group
allocator, it performs the same as skip_busy alone for reading a
recently created directory.  I need to make sure it doesn't have the
same fragmentation problems as skip_busy alone though.

(this was against a data logging tree, it is meant for review only and
not to be applied)

--- linux.beta3.2/fs/reiserfs/inode.c	2003-03-12 14:32:41.000000000
-0500
+++ linux.beta3/fs/reiserfs/inode.c	2003-03-12 14:30:35.000000000 -0500
@@ -1646,6 +1646,8 @@
     struct cpu_key key;
     struct item_head ih;
     struct stat_data sd;
+    __u32 packing;
+    __u32 checked_packing = 0;
     int retval;
     int err ;
   
@@ -1663,9 +1665,12 @@
     if( S_ISLNK( inode -> i_mode ) )
 	    inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
 
-    /* item head of new item */
-    ih.ih_key.k_dir_id = INODE_PKEY (dir)->k_objectid;
+    packing = INODE_PKEY(dir)->k_dir_id;
     ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid
(th));
+
+packing_restart:
+    /* item head of new item */
+    ih.ih_key.k_dir_id = packing;
     if (!ih.ih_key.k_objectid) {
 	err = -ENOMEM ;
 	goto out_bad_inode ;
@@ -1718,6 +1723,14 @@
 	err = -EEXIST;
 	goto out_bad_inode;
     }
+    if (!checked_packing) {
+	checked_packing = choose_packing(dir, &path_to_key);
+	if (checked_packing != packing) {
+	    packing = checked_packing;
+	    pathrelse(&path_to_key);
+	    goto packing_restart;
+	}
+    }
 
     if (old_format_only (sb)) {
 	if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
diff -ur linux.beta3.2/fs/reiserfs/stree.c
linux.beta3/fs/reiserfs/stree.c
--- linux.beta3.2/fs/reiserfs/stree.c	2003-03-12 14:32:41.000000000
-0500
+++ linux.beta3/fs/reiserfs/stree.c	2003-03-12 14:31:43.000000000 -0500
@@ -296,6 +296,56 @@
 /* Maximal possible key. It is never in the tree. */
 const struct key  MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff,
0xffffffff},}};
 
+static int packing_locality_span(struct super_block *sb, 
+                                 struct path *path, __u32 cpu_packing) 
+{
+    int path_offset;
+    int pos;
+    int levels = 1;
+    struct buffer_head *bh;
+    struct key *key;
+
+    path_offset = path->path_length;
+
+    while ( path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) {
+        pos = PATH_OFFSET_POSITION(path, path_offset);
+	bh = PATH_OFFSET_PBUFFER(path, path_offset);
+
+	if (!B_IS_IN_TREE(bh))
+	    break;
+
+	if (pos) {
+	    /* check left */
+	    key = B_N_PDELIM_KEY(bh, pos - 1);
+	    if (le32_to_cpu(key->k_dir_id) != cpu_packing) {
+	        return levels;
+	    }
+	}
+	if (pos != B_NR_ITEMS(bh)) {
+	    key = B_N_PDELIM_KEY(bh, pos + 1);
+	    if (le32_to_cpu(key->k_dir_id) != cpu_packing) {
+	        return levels;
+	    }
+	}
+        levels++;
+    }
+    return levels;
+}
+
+/* returns an le32 packing locality */
+__u32 choose_packing(struct inode *parent, struct path *path)
+{
+    __u32 grandparent;
+    int levels;
+
+    grandparent = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
+
+    levels = packing_locality_span(parent->i_sb, path, grandparent);
+    if (levels <= 2) {
+        return INODE_PKEY(parent)->k_dir_id;
+    }
+    return INODE_PKEY(parent)->k_objectid;
+}
 
 /* Get delimiting key of the buffer by looking for it in the buffers in
the path, starting from the bottom
    of the path, and going upwards.  We must check the path's validity
at each step.  If the key is not in




^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 19:57   ` Chris Mason
@ 2003-03-12 20:51     ` Hans Reiser
  2003-03-13 15:59       ` Chris Mason
  2003-03-16 16:25       ` Anders Widman
  0 siblings, 2 replies; 42+ messages in thread
From: Hans Reiser @ 2003-03-12 20:51 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>[ dirid based grouping spreads things out too much on the disk ]
>
>So, trying to summarize so far, we know we want to group things on the
>disk somehow, but we don't really have a good metric to use.  Using the
>parent directory objectid as the grouping metric spreads things evenly
>out on the disk, resulting in lots of seeks when we read a given
>directory tree.  There are some cases where this breaks down badly,
>like  lots of directories with just a few files each.
>
>defragmenting tools are nice and all, but before we can write one we
>actually have to pick a disk layout for the tool to create ;-)
>
>The btree already has a metric for grouping, it's the packing locality. 
>I realized this morning that we just aren't being very smart about how
>we use it.  If one directory has 1 thousand subdirs with 1 file each,
>should they all be in different packing localities? 
>
> probably not.
>
I would also like an API for letting users/apps choose packing 
localities, that way an rpm package can all go into the same packing 
locality even though semantically it scatters files across ten 
directories.  What do you think?

>
>So I played around with the patch below, it tries to define a span over
>which a given packing locality used.  When new files or directories are
>created, their packing locality is chosen based on how much of the tree
>the packing locality of their parent directory is using.
>
This is cool.  Good heuristic!

>
>If the span is small, the packing locality of the parent dir is used as
>the packing locality of the new file/dir.  If the span is large, the
>objectid of the parent dir is used as the packing locality of the new
>file/dir.
>
>The patch below is really rough, but when used with the dirid_group
>allocator, it performs the same as skip_busy alone for reading a
>recently created directory.  I need to make sure it doesn't have the
>same fragmentation problems as skip_busy alone though.
>
>(this was against a data logging tree, it is meant for review only and
>not to be applied)
>
>--- linux.beta3.2/fs/reiserfs/inode.c	2003-03-12 14:32:41.000000000
>-0500
>+++ linux.beta3/fs/reiserfs/inode.c	2003-03-12 14:30:35.000000000 -0500
>@@ -1646,6 +1646,8 @@
>     struct cpu_key key;
>     struct item_head ih;
>     struct stat_data sd;
>+    __u32 packing;
>+    __u32 checked_packing = 0;
>     int retval;
>     int err ;
>   
>@@ -1663,9 +1665,12 @@
>     if( S_ISLNK( inode -> i_mode ) )
> 	    inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
> 
>-    /* item head of new item */
>-    ih.ih_key.k_dir_id = INODE_PKEY (dir)->k_objectid;
>+    packing = INODE_PKEY(dir)->k_dir_id;
>     ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid
>(th));
>+
>+packing_restart:
>+    /* item head of new item */
>+    ih.ih_key.k_dir_id = packing;
>     if (!ih.ih_key.k_objectid) {
> 	err = -ENOMEM ;
> 	goto out_bad_inode ;
>@@ -1718,6 +1723,14 @@
> 	err = -EEXIST;
> 	goto out_bad_inode;
>     }
>+    if (!checked_packing) {
>+	checked_packing = choose_packing(dir, &path_to_key);
>+	if (checked_packing != packing) {
>+	    packing = checked_packing;
>+	    pathrelse(&path_to_key);
>+	    goto packing_restart;
>+	}
>+    }
> 
>     if (old_format_only (sb)) {
> 	if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
>diff -ur linux.beta3.2/fs/reiserfs/stree.c
>linux.beta3/fs/reiserfs/stree.c
>--- linux.beta3.2/fs/reiserfs/stree.c	2003-03-12 14:32:41.000000000
>-0500
>+++ linux.beta3/fs/reiserfs/stree.c	2003-03-12 14:31:43.000000000 -0500
>@@ -296,6 +296,56 @@
> /* Maximal possible key. It is never in the tree. */
> const struct key  MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff,
>0xffffffff},}};
> 
>+static int packing_locality_span(struct super_block *sb, 
>+                                 struct path *path, __u32 cpu_packing) 
>+{
>+    int path_offset;
>+    int pos;
>+    int levels = 1;
>+    struct buffer_head *bh;
>+    struct key *key;
>+
>+    path_offset = path->path_length;
>+
>+    while ( path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) {
>+        pos = PATH_OFFSET_POSITION(path, path_offset);
>+	bh = PATH_OFFSET_PBUFFER(path, path_offset);
>+
>+	if (!B_IS_IN_TREE(bh))
>+	    break;
>+
>+	if (pos) {
>+	    /* check left */
>+	    key = B_N_PDELIM_KEY(bh, pos - 1);
>+	    if (le32_to_cpu(key->k_dir_id) != cpu_packing) {
>+	        return levels;
>+	    }
>+	}
>+	if (pos != B_NR_ITEMS(bh)) {
>+	    key = B_N_PDELIM_KEY(bh, pos + 1);
>+	    if (le32_to_cpu(key->k_dir_id) != cpu_packing) {
>+	        return levels;
>+	    }
>+	}
>+        levels++;
>+    }
>+    return levels;
>+}
>+
>+/* returns an le32 packing locality */
>+__u32 choose_packing(struct inode *parent, struct path *path)
>+{
>+    __u32 grandparent;
>+    int levels;
>+
>+    grandparent = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
>+
>+    levels = packing_locality_span(parent->i_sb, path, grandparent);
>+    if (levels <= 2) {
>+        return INODE_PKEY(parent)->k_dir_id;
>+    }
>+    return INODE_PKEY(parent)->k_objectid;
>+}
> 
> /* Get delimiting key of the buffer by looking for it in the buffers in
>the path, starting from the bottom
>    of the path, and going upwards.  We must check the path's validity
>at each step.  If the key is not in
>
>
>
>
>
>  
>


-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 20:51     ` Hans Reiser
@ 2003-03-13 15:59       ` Chris Mason
  2003-03-14  0:15         ` Hans Reiser
  2003-03-16 16:25       ` Anders Widman
  1 sibling, 1 reply; 42+ messages in thread
From: Chris Mason @ 2003-03-13 15:59 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Oleg Drokin, reiserfs-list

On Wed, 2003-03-12 at 15:51, Hans Reiser wrote:
> Chris Mason wrote:
> 
> >[ dirid based grouping spreads things out too much on the disk ]
> >
> >The btree already has a metric for grouping, it's the packing locality. 
> >I realized this morning that we just aren't being very smart about how
> >we use it.  If one directory has 1 thousand subdirs with 1 file each,
> >should they all be in different packing localities? 
> >
> > probably not.
> >
> I would also like an API for letting users/apps choose packing 
> localities, that way an rpm package can all go into the same packing 
> locality even though semantically it scatters files across ten 
> directories.  What do you think?
> 
More knobs for the admin are good, something like setattr
INHERIT_PACKING dir

> >
> >So I played around with the patch below, it tries to define a span over
> >which a given packing locality used.  When new files or directories are
> >created, their packing locality is chosen based on how much of the tree
> >the packing locality of their parent directory is using.
> >
> This is cool.  Good heuristic!

Ok, more testing showed that patch wasn't bad, but wasn't great either. 
Under a multiprocess test, the buffers in the path would get moved away
and there wasn't enough data to make a good decision.  So the patch
below changes things around a little, and records the span during
search_by_key instead of trying to compute it after the search is over.

The end result is that if the keys in level 3 or higher of the tree have
the same packing locality in adjacent keys, we use the dirid as the
packing locality.  Otherwise we reuse the parent directory packing
locality.

On a 100MB subset of /usr/share/doc, this results in 4 packing
localities for 737 directories.  In combination with the dirid grouping
allocator, it does a nice job of preventing fragmentation in the
multi-writer and deletion cases.

The results are really interesting.  For reading a fresh copy of /usr,
skip_busy alone needs 1m50s.  dirid_groups needs 2m40s and the packing
locality code brings that down to 2m10s.  Not too bad against
skip_busy's best case.

When I use 10 procs to fill a directory tree, (~1GB of data total),
skip_busy alone needs 3m7s to read it, and the external fragmentation is
10%.

If I run the same test with the new packing locality code +
dirid_groups, it only takes 2m to read it and the fragmentation is 3%.

In other words, I think this is a really good compromise between the
current defaults and a more optimal case for fragmentation, and I expect
its performance to hold up much better as the filesystem ages.

The patch is below against 2.4.21-pre is below.  It should be considered
extremely risky and is really only meant for review by those interested.

(note, this changes the default mount alloc option to dirid_groups + 5%
border + skip_busy)

--- 1.18/fs/reiserfs/bitmap.c	Thu Sep 12 04:39:21 2002
+++ edited/fs/reiserfs/bitmap.c	Thu Mar 13 10:18:37 2003
@@ -33,6 +33,8 @@
 #define  _ALLOC_hashed_formatted_nodes 7
 #define  _ALLOC_old_way 8
 #define  _ALLOC_hundredth_slices 9
+#define  _ALLOC_dirid_groups 10
+#define  _ALLOC_oid_groups 11
 
 #define  concentrating_formatted_nodes(s)     test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
 #define  displacing_large_files(s)            test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
@@ -260,8 +262,18 @@
     get_bit_address (s, *start, &bm, &off);
     get_bit_address (s, finish, &end_bm, &end_off);
 
-    // With this option set first we try to find a bitmap that is at least 10%
-    // free, and if that fails, then we fall back to old whole bitmap scanning
+    /* When the bitmap is more than 10% free, anyone can allocate.
+     * When it's less than 10% free, only files that already use the
+     * bitmap are allowed. Once we pass 80% full, this restriction
+     * is lifted.
+     *
+     * We do this so that files that grow later still have space close to
+     * their original allocation. This improves locality, and presumably
+     * performance as a result.
+     *
+     * This is only an allocation policy and does not make up for getting a
+     * bad hint. Decent hinting must be implemented for this to work well.
+     */
     if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) {
 	for (;bm < end_bm; bm++, off = 0) {
 	    if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 )
@@ -392,6 +404,16 @@
   }
 }
 
+void reiserfs_init_alloc_options (struct super_block *s)
+{
+    set_bit (_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s));
+    set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
+    set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
+    s->u.reiserfs_sb.s_alloc_options.border = 20;
+
+    reiserfs_warning ("allocator defaults = [%08x]\n", SB_ALLOC_OPTS(s));
+}
+
 /* block allocator related options are parsed here */
 int reiserfs_parse_alloc_options(struct super_block * s, char * options)
 {
@@ -435,6 +457,15 @@
 	    continue;
 	}
 
+        if (!strcmp(this_char, "dirid_groups")) {
+	    SET_OPTION(dirid_groups);
+	    continue;
+        }
+        if (!strcmp(this_char, "oid_groups")) {
+	    SET_OPTION(oid_groups);
+	    continue;
+        }
+
 	if (!strcmp(this_char, "hashed_formatted_nodes")) {
 	    SET_OPTION(hashed_formatted_nodes);
 	    continue;
@@ -476,6 +507,7 @@
 	return 1;
     }
 
+    reiserfs_warning ("allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
     return 0;
 }
 
@@ -498,17 +530,87 @@
     hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
 }
 
-static void inline get_left_neighbor(reiserfs_blocknr_hint_t *hint)
+/* 
+ * Relocation based on dirid, hashing them into a given bitmap block
+ * files. Formatted nodes are unaffected, a seperate policy covers them 
+ */
+static void 
+dirid_groups (reiserfs_blocknr_hint_t *hint)
+{
+    if (hint->inode) {
+        char * hash_in = NULL;
+	unsigned long hash;
+	unsigned long mask;
+	__u32 dirid;
+
+	/* effectively turns the disk in 64MB groups (4k blocksize), 
+	 * but the bigger your disk is the less likely hash collisions
+	 * are, leading to dynamically bigger groups based on 
+	 * your disk size
+	 */
+	dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
+	/* keep the root dir and it's first set of subdirs close to
+	 * the start of the disk
+	 */
+	if (dirid <= 2) {
+	    hash = 0;
+	} else {
+	    hash_in = (char *)(&dirid);
+	    hash = keyed_hash(hash_in, 4);
+	}
+	mask = (hint->inode->i_sb->s_blocksize << 2) - 1;
+	hash = hint->beg + hash % (hint->end - hint->beg);
+	hash &= ~mask;
+
+	hint->search_start = hash;
+    }
+}
+
+/* 
+ * Relocation based on oid, hashing them into a given bitmap block
+ * files. Formatted nodes are unaffected, a seperate policy covers them 
+ */
+static void 
+oid_groups (reiserfs_blocknr_hint_t *hint)
+{
+    if (hint->inode) {
+        char * hash_in = NULL;
+	unsigned long hash;
+	unsigned long mask;
+	__u32 oid;
+
+	oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
+	hash_in = (char *)(&oid);
+
+	/* effectively turns the disk in 64MB groups (4k blocksize), 
+	 * but the bigger your disk is the less likely hash collisions
+	 * are, leading to dynamically bigger groups based on 
+	 * your disk size
+	 */
+	mask = (hint->inode->i_sb->s_blocksize << 2) - 1;
+	hash = keyed_hash(hash_in, 4);
+	hash = hint->beg + hash % (hint->end - hint->beg);
+	hash &= ~mask;
+
+	hint->search_start = hash;
+    }
+}
+
+/* returns 1 if it finds an indirect item and gets valid hint info
+ * from it, otherwise 0
+ */
+static int get_left_neighbor(reiserfs_blocknr_hint_t *hint)
 {
     struct path * path;
     struct buffer_head * bh;
     struct item_head * ih;
     int pos_in_item;
     __u32 * item;
+    int ret = 0;
 
     if (!hint->path)		/* reiserfs code can call this function w/o pointer to path
 				 * structure supplied; then we rely on supplied search_start */
-	return;
+	return 0;
 
     path = hint->path;
     bh = get_last_bh(path);
@@ -529,6 +631,7 @@
 	    int t=get_block_num(item,pos_in_item);
 	    if (t) {
 		hint->search_start = t;
+		ret = 1;
 		break;
 	    }
 	    pos_in_item --;
@@ -537,7 +640,7 @@
     }
 
     /* does result value fit into specified region? */
-    return;
+    return ret;
 }
 
 /* should be, if formatted node, then try to put on first part of the device
@@ -640,6 +743,7 @@
     struct super_block *s = hint->th->t_super;
     hint->beg = 0;
     hint->end = SB_BLOCK_COUNT(s) - 1;
+    int unfm_hint;
 
     /* This is former border algorithm. Now with tunable border offset */
     if (concentrating_formatted_nodes(s))
@@ -668,19 +772,14 @@
 	return;
     }
 
-    /* attempt to copy a feature from old block allocator code */
-    if (TEST_OPTION(old_hashed_relocation, s) && !hint->formatted_node) {
-	old_hashed_relocation(hint);
-    }
-
     /* if none of our special cases is relevant, use the left neighbor in the
        tree order of the new node we are allocating for */
     if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) {
-	hash_formatted_node(hint);
+        hash_formatted_node(hint);
 	return;
-    } 
+    }
 
-    get_left_neighbor(hint);
+    unfm_hint = get_left_neighbor(hint);
 
     /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
        new blocks are displaced based on directory ID. Also, if suggested search_start
@@ -705,10 +804,29 @@
 	return;
     }
 
-    if (TEST_OPTION(old_hashed_relocation, s))
+    /* old_hashed_relocation only works on unformatted */
+    if (!unfm_hint && !hint->formatted_node &&
+        TEST_OPTION(old_hashed_relocation, s))
+    {
 	old_hashed_relocation(hint);
-    if (TEST_OPTION(new_hashed_relocation, s))
+    }
+    /* new_hashed_relocation works with both formatted/unformatted nodes */
+    if ((!unfm_hint || hint->formatted_node) &&
+        TEST_OPTION(new_hashed_relocation, s))
+    {
 	new_hashed_relocation(hint);
+    }
+    /* dirid grouping works only on unformatted nodes */
+    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s))
+    {
+        dirid_groups(hint);
+    }
+
+    /* oid grouping works only on unformatted nodes */
+    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s))
+    {
+        oid_groups(hint);
+    }
     return;
 }
 
@@ -772,28 +890,35 @@
     struct super_block *s = hint->th->t_super;
     b_blocknr_t start = hint->search_start;
     b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
-    int second_pass = 0;
+    int passno = 0;
     int nr_allocated = 0;
 
     determine_prealloc_size(hint);
-    while((nr_allocated
-	  += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish,
-					  amount_needed - nr_allocated, hint->prealloc_size))
-	  < amount_needed) {
-
-	/* not all blocks were successfully allocated yet*/
-	if (second_pass) {	/* it was a second pass; we must free all blocks */
+    do {
+	switch (passno++) {
+        case 0: /* Search from hint->search_start to end of disk */
+            start = hint->search_start;
+            finish = SB_BLOCK_COUNT(s) - 1;
+            break;
+        case 1: /* Search from hint->beg to hint->search_start */
+            start = hint->beg;
+            finish = hint->search_start;
+            break;
+        case 2: /* Last chance: Search from 0 to hint->beg */
+            start = 0;
+            finish = hint->beg;
+	    break;
+        default: /* We've tried searching everywhere, not enough space */
+	    /* not all blocks were successfully allocated yet*/
 	    while (nr_allocated --)
 		reiserfs_free_block(hint->th, new_blocknrs[nr_allocated]);
 
 	    return NO_DISK_SPACE;
-	} else {		/* refine search parameters for next pass */
-	    second_pass = 1;
-	    finish = start;
-	    start = 0;
-	    continue;
-	}
-    }
+        }
+    } while((nr_allocated += allocate_without_wrapping_disk(hint, 
+                          new_blocknrs + nr_allocated, start, finish,
+			  amount_needed - nr_allocated, hint->prealloc_size)) 
+			  < amount_needed) ;
     return CARRY_ON;
 }
 
--- 1.41/fs/reiserfs/inode.c	Mon Jan 20 05:19:30 2003
+++ edited/fs/reiserfs/inode.c	Thu Mar 13 10:17:09 2003
@@ -1485,6 +1485,8 @@
     struct cpu_key key;
     struct item_head ih;
     struct stat_data sd;
+    __u32 packing;
+    __u32 checked_packing = 0;
     int retval;
     int err ;
   
@@ -1503,7 +1505,8 @@
 	    inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
 
     /* item head of new item */
-    ih.ih_key.k_dir_id = INODE_PKEY (dir)->k_objectid;
+    packing = INODE_PKEY(dir)->k_dir_id;
+    ih.ih_key.k_dir_id = packing;
     ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
     if (!ih.ih_key.k_objectid) {
 	err = -ENOMEM ;
@@ -1541,6 +1544,7 @@
     else
 	make_le_item_head (&ih, 0, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
 
+packing_restart:
     /* key to search for correct place for new stat data */
     _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
 		   le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
@@ -1555,6 +1559,14 @@
 	pathrelse (&path_to_key);
 	err = -EEXIST;
 	goto out_bad_inode;
+    }
+    if (!checked_packing) {
+	checked_packing = choose_packing(dir, &path_to_key);
+	if (checked_packing != packing) {
+	    ih.ih_key.k_dir_id = checked_packing;
+	    pathrelse(&path_to_key);
+	    goto packing_restart;
+	}
     }
 
     if (old_format_only (sb)) {
--- 1.20/fs/reiserfs/stree.c	Fri Aug  9 11:22:33 2002
+++ edited/fs/reiserfs/stree.c	Thu Mar 13 10:16:06 2003
@@ -298,6 +298,47 @@
 /* Maximal possible key. It is never in the tree. */
 const struct key  MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff, 0xffffffff},}};
 
+static int packing_locality_span(struct super_block *sb, 
+                                 struct path *path, __u32 cpu_packing) 
+{
+    int path_offset;
+    int pos;
+    struct buffer_head *bh;
+    struct key *key;
+
+    path_offset = path->path_length;
+    pos = PATH_OFFSET_POSITION(path, path_offset); 
+    bh = PATH_OFFSET_PBUFFER(path, path_offset);
+
+    if (pos) {
+	/* check left */
+	key = B_N_PDELIM_KEY(bh, pos - 1);
+	if (le32_to_cpu(key->k_dir_id) != cpu_packing) {
+	    return 0;
+	}
+    }
+    if (pos != B_NR_ITEMS(bh)) {
+	key = B_N_PDELIM_KEY(bh, pos + 1);
+	if (le32_to_cpu(key->k_dir_id) != cpu_packing) {
+	    return 0;
+	}
+    }
+    return 1;
+}
+
+/* returns an le32 packing locality */
+__u32 choose_packing(struct inode *parent, struct path *path)
+{
+    __u32 grandparent;
+    int levels = path->span;
+
+    grandparent = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
+
+    if (levels <= (DISK_LEAF_NODE_LEVEL + 1)) {
+        return INODE_PKEY(parent)->k_dir_id;
+    }
+    return INODE_PKEY(parent)->k_objectid;
+}
 
 /* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom
    of the path, and going upwards.  We must check the path's validity at each step.  If the key is not in
@@ -757,6 +807,16 @@
 	/* ok, we have acquired next formatted node in the tree */
 	n_node_level = B_LEVEL (p_s_bh);
 
+	if (n_node_level != DISK_LEAF_NODE_LEVEL &&
+	    !packing_locality_span(p_s_sb, p_s_search_path, 
+	                           p_s_key->on_disk_key.k_dir_id)) 
+        {
+	    /* span is the deepest level in the tree where the same packing
+	     * locality wasn't found in the adjacent pointers
+	     */
+	    p_s_search_path->span = n_node_level;
+	}
+
 	PROC_INFO_BH_STAT( p_s_sb, p_s_bh, n_node_level - 1 );
 
 	RFALSE( n_node_level < n_stop_level,
--- 1.27/fs/reiserfs/super.c	Wed Oct 30 11:42:36 2002
+++ edited/fs/reiserfs/super.c	Tue Mar 11 10:49:36 2003
@@ -1121,17 +1121,16 @@
     int old_magic;
     struct reiserfs_super_block * rs;
 
-
     memset (&s->u.reiserfs_sb, 0, sizeof (struct reiserfs_sb_info));
 
     /* Set default values for options: non-aggressive tails */
     s->u.reiserfs_sb.s_mount_opt = ( 1 << REISERFS_SMALLTAIL );
-    /* default block allocator option: skip_busy */
-    s->u.reiserfs_sb.s_alloc_options.bits = ( 1 << 5);
     /* If file grew past 4 blocks, start preallocation blocks for it. */
     s->u.reiserfs_sb.s_alloc_options.preallocmin = 4;
     /* Preallocate by 8 blocks (9-1) at once */
     s->u.reiserfs_sb.s_alloc_options.preallocsize = 9;
+    /* setup default block allocator options */
+    reiserfs_init_alloc_options(s);
 
     if (reiserfs_parse_options (s, (char *) data, &(s->u.reiserfs_sb.s_mount_opt), &blocks) == 0) {
 	return NULL;
===== include/linux/reiserfs_fs.h 1.26 vs edited =====
--- 1.26/include/linux/reiserfs_fs.h	Mon Jan 20 05:19:30 2003
+++ edited/include/linux/reiserfs_fs.h	Thu Mar 13 10:16:06 2003
@@ -1106,6 +1106,7 @@
   int                   path_length;                      	/* Length of the array above.   */
   struct  path_element  path_elements[EXTENDED_MAX_HEIGHT];	/* Array of the path elements.  */
   int			pos_in_item;
+  int			span;
 };
 
 #define pos_in_item(path) ((path)->pos_in_item)
@@ -1684,6 +1685,7 @@
 								const struct cpu_key * p_s_cpu_key, 
 								struct path * p_s_search_path);
 extern inline void decrement_bcount (struct buffer_head * p_s_bh);
+__u32 choose_packing(struct inode *, struct path *);
 void decrement_counters_in_path (struct path * p_s_search_path);
 void pathrelse (struct path * p_s_search_path);
 int reiserfs_check_path(struct path *p) ;
@@ -1953,6 +1955,7 @@
 typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
 
 int reiserfs_parse_alloc_options (struct super_block *, char *);
+void reiserfs_init_alloc_options (struct super_block *s);
 int is_reusable (struct super_block * s, unsigned long block, int bit_value);
 void reiserfs_free_block (struct reiserfs_transaction_handle *th, unsigned long);
 int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);







^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-13 15:59       ` Chris Mason
@ 2003-03-14  0:15         ` Hans Reiser
  2003-03-14  1:34           ` Chris Mason
  0 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-14  0:15 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>On Wed, 2003-03-12 at 15:51, Hans Reiser wrote:
>  
>
>>Chris Mason wrote:
>>
>>    
>>
>>>[ dirid based grouping spreads things out too much on the disk ]
>>>
>>>The btree already has a metric for grouping, it's the packing locality. 
>>>I realized this morning that we just aren't being very smart about how
>>>we use it.  If one directory has 1 thousand subdirs with 1 file each,
>>>should they all be in different packing localities? 
>>>
>>>probably not.
>>>
>>>      
>>>
>>I would also like an API for letting users/apps choose packing 
>>localities, that way an rpm package can all go into the same packing 
>>locality even though semantically it scatters files across ten 
>>directories.  What do you think?
>>
>>    
>>
>More knobs for the admin are good, something like setattr
>INHERIT_PACKING dir
>
No, I don't like that one.

>
>  
>
>>>So I played around with the patch below, it tries to define a span over
>>>which a given packing locality used.  When new files or directories are
>>>created, their packing locality is chosen based on how much of the tree
>>>the packing locality of their parent directory is using.
>>>
>>>      
>>>
>>This is cool.  Good heuristic!
>>    
>>
>
>Ok, more testing showed that patch wasn't bad, but wasn't great either. 
>Under a multiprocess test, the buffers in the path would get moved away
>and there wasn't enough data to make a good decision.  So the patch
>below changes things around a little, and records the span during
>search_by_key instead of trying to compute it after the search is over.
>
Hunh?  Why don't you maintain a counter in the directory of the number 
of nodes in it?  Or are you afraid of causing extra IO?

>
>The end result is that if the keys in level 3 or higher of the tree have
>the same packing locality in adjacent keys, we use the dirid as the
>packing locality.  Otherwise we reuse the parent directory packing
>locality.
>
>On a 100MB subset of /usr/share/doc, this results in 4 packing
>localities for 737 directories.  In combination with the dirid grouping
>allocator, it does a nice job of preventing fragmentation in the
>multi-writer and deletion cases.
>
>The results are really interesting.  For reading a fresh copy of /usr,
>skip_busy alone needs 1m50s.  dirid_groups needs 2m40s and the packing
>locality code brings that down to 2m10s.  Not too bad against
>skip_busy's best case.
>
>When I use 10 procs to fill a directory tree, (~1GB of data total),
>skip_busy alone needs 3m7s to read it, and the external fragmentation is
>10%.
>
>If I run the same test with the new packing locality code +
>dirid_groups, it only takes 2m to read it and the fragmentation is 3%.
>
>In other words, I think this is a really good compromise between the
>current defaults and a more optimal case for fragmentation, and I expect
>its performance to hold up much better as the filesystem ages.
>
Let's get lots of different testers.  You may have a nice heuristic here 
though....

How big are your packing localities tending to be?

>
>The patch is below against 2.4.21-pre is below.  It should be considered
>extremely risky and is really only meant for review by those interested.
>
>(note, this changes the default mount alloc option to dirid_groups + 5%
>border + skip_busy)
>
>--- 1.18/fs/reiserfs/bitmap.c	Thu Sep 12 04:39:21 2002
>+++ edited/fs/reiserfs/bitmap.c	Thu Mar 13 10:18:37 2003
>@@ -33,6 +33,8 @@
> #define  _ALLOC_hashed_formatted_nodes 7
> #define  _ALLOC_old_way 8
> #define  _ALLOC_hundredth_slices 9
>+#define  _ALLOC_dirid_groups 10
>+#define  _ALLOC_oid_groups 11
> 
> #define  concentrating_formatted_nodes(s)     test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
> #define  displacing_large_files(s)            test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
>@@ -260,8 +262,18 @@
>     get_bit_address (s, *start, &bm, &off);
>     get_bit_address (s, finish, &end_bm, &end_off);
> 
>-    // With this option set first we try to find a bitmap that is at least 10%
>-    // free, and if that fails, then we fall back to old whole bitmap scanning
>+    /* When the bitmap is more than 10% free, anyone can allocate.
>+     * When it's less than 10% free, only files that already use the
>+     * bitmap are allowed. Once we pass 80% full, this restriction
>+     * is lifted.
>+     *
>+     * We do this so that files that grow later still have space close to
>+     * their original allocation. This improves locality, and presumably
>+     * performance as a result.
>+     *
>+     * This is only an allocation policy and does not make up for getting a
>+     * bad hint. Decent hinting must be implemented for this to work well.
>+     */
>     if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) {
> 	for (;bm < end_bm; bm++, off = 0) {
> 	    if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 )
>@@ -392,6 +404,16 @@
>   }
> }
> 
>+void reiserfs_init_alloc_options (struct super_block *s)
>+{
>+    set_bit (_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s));
>+    set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
>+    set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
>+    s->u.reiserfs_sb.s_alloc_options.border = 20;
>+
>+    reiserfs_warning ("allocator defaults = [%08x]\n", SB_ALLOC_OPTS(s));
>+}
>+
> /* block allocator related options are parsed here */
> int reiserfs_parse_alloc_options(struct super_block * s, char * options)
> {
>@@ -435,6 +457,15 @@
> 	    continue;
> 	}
> 
>+        if (!strcmp(this_char, "dirid_groups")) {
>+	    SET_OPTION(dirid_groups);
>+	    continue;
>+        }
>+        if (!strcmp(this_char, "oid_groups")) {
>+	    SET_OPTION(oid_groups);
>+	    continue;
>+        }
>+
> 	if (!strcmp(this_char, "hashed_formatted_nodes")) {
> 	    SET_OPTION(hashed_formatted_nodes);
> 	    continue;
>@@ -476,6 +507,7 @@
> 	return 1;
>     }
> 
>+    reiserfs_warning ("allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
>     return 0;
> }
> 
>@@ -498,17 +530,87 @@
>     hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
> }
> 
>-static void inline get_left_neighbor(reiserfs_blocknr_hint_t *hint)
>+/* 
>+ * Relocation based on dirid, hashing them into a given bitmap block
>+ * files. Formatted nodes are unaffected, a seperate policy covers them 
>+ */
>+static void 
>+dirid_groups (reiserfs_blocknr_hint_t *hint)
>+{
>+    if (hint->inode) {
>+        char * hash_in = NULL;
>+	unsigned long hash;
>+	unsigned long mask;
>+	__u32 dirid;
>+
>+	/* effectively turns the disk in 64MB groups (4k blocksize), 
>+	 * but the bigger your disk is the less likely hash collisions
>+	 * are, leading to dynamically bigger groups based on 
>+	 * your disk size
>+	 */
>+	dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
>+	/* keep the root dir and it's first set of subdirs close to
>+	 * the start of the disk
>+	 */
>+	if (dirid <= 2) {
>+	    hash = 0;
>+	} else {
>+	    hash_in = (char *)(&dirid);
>+	    hash = keyed_hash(hash_in, 4);
>+	}
>+	mask = (hint->inode->i_sb->s_blocksize << 2) - 1;
>+	hash = hint->beg + hash % (hint->end - hint->beg);
>+	hash &= ~mask;
>+
>+	hint->search_start = hash;
>+    }
>+}
>+
>+/* 
>+ * Relocation based on oid, hashing them into a given bitmap block
>+ * files. Formatted nodes are unaffected, a seperate policy covers them 
>+ */
>+static void 
>+oid_groups (reiserfs_blocknr_hint_t *hint)
>+{
>+    if (hint->inode) {
>+        char * hash_in = NULL;
>+	unsigned long hash;
>+	unsigned long mask;
>+	__u32 oid;
>+
>+	oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
>+	hash_in = (char *)(&oid);
>+
>+	/* effectively turns the disk in 64MB groups (4k blocksize), 
>+	 * but the bigger your disk is the less likely hash collisions
>+	 * are, leading to dynamically bigger groups based on 
>+	 * your disk size
>+	 */
>+	mask = (hint->inode->i_sb->s_blocksize << 2) - 1;
>+	hash = keyed_hash(hash_in, 4);
>+	hash = hint->beg + hash % (hint->end - hint->beg);
>+	hash &= ~mask;
>+
>+	hint->search_start = hash;
>+    }
>+}
>+
>+/* returns 1 if it finds an indirect item and gets valid hint info
>+ * from it, otherwise 0
>+ */
>+static int get_left_neighbor(reiserfs_blocknr_hint_t *hint)
> {
>     struct path * path;
>     struct buffer_head * bh;
>     struct item_head * ih;
>     int pos_in_item;
>     __u32 * item;
>+    int ret = 0;
> 
>     if (!hint->path)		/* reiserfs code can call this function w/o pointer to path
> 				 * structure supplied; then we rely on supplied search_start */
>-	return;
>+	return 0;
> 
>     path = hint->path;
>     bh = get_last_bh(path);
>@@ -529,6 +631,7 @@
> 	    int t=get_block_num(item,pos_in_item);
> 	    if (t) {
> 		hint->search_start = t;
>+		ret = 1;
> 		break;
> 	    }
> 	    pos_in_item --;
>@@ -537,7 +640,7 @@
>     }
> 
>     /* does result value fit into specified region? */
>-    return;
>+    return ret;
> }
> 
> /* should be, if formatted node, then try to put on first part of the device
>@@ -640,6 +743,7 @@
>     struct super_block *s = hint->th->t_super;
>     hint->beg = 0;
>     hint->end = SB_BLOCK_COUNT(s) - 1;
>+    int unfm_hint;
> 
>     /* This is former border algorithm. Now with tunable border offset */
>     if (concentrating_formatted_nodes(s))
>@@ -668,19 +772,14 @@
> 	return;
>     }
> 
>-    /* attempt to copy a feature from old block allocator code */
>-    if (TEST_OPTION(old_hashed_relocation, s) && !hint->formatted_node) {
>-	old_hashed_relocation(hint);
>-    }
>-
>     /* if none of our special cases is relevant, use the left neighbor in the
>        tree order of the new node we are allocating for */
>     if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) {
>-	hash_formatted_node(hint);
>+        hash_formatted_node(hint);
> 	return;
>-    } 
>+    }
> 
>-    get_left_neighbor(hint);
>+    unfm_hint = get_left_neighbor(hint);
> 
>     /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
>        new blocks are displaced based on directory ID. Also, if suggested search_start
>@@ -705,10 +804,29 @@
> 	return;
>     }
> 
>-    if (TEST_OPTION(old_hashed_relocation, s))
>+    /* old_hashed_relocation only works on unformatted */
>+    if (!unfm_hint && !hint->formatted_node &&
>+        TEST_OPTION(old_hashed_relocation, s))
>+    {
> 	old_hashed_relocation(hint);
>-    if (TEST_OPTION(new_hashed_relocation, s))
>+    }
>+    /* new_hashed_relocation works with both formatted/unformatted nodes */
>+    if ((!unfm_hint || hint->formatted_node) &&
>+        TEST_OPTION(new_hashed_relocation, s))
>+    {
> 	new_hashed_relocation(hint);
>+    }
>+    /* dirid grouping works only on unformatted nodes */
>+    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s))
>+    {
>+        dirid_groups(hint);
>+    }
>+
>+    /* oid grouping works only on unformatted nodes */
>+    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s))
>+    {
>+        oid_groups(hint);
>+    }
>     return;
> }
> 
>@@ -772,28 +890,35 @@
>     struct super_block *s = hint->th->t_super;
>     b_blocknr_t start = hint->search_start;
>     b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
>-    int second_pass = 0;
>+    int passno = 0;
>     int nr_allocated = 0;
> 
>     determine_prealloc_size(hint);
>-    while((nr_allocated
>-	  += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish,
>-					  amount_needed - nr_allocated, hint->prealloc_size))
>-	  < amount_needed) {
>-
>-	/* not all blocks were successfully allocated yet*/
>-	if (second_pass) {	/* it was a second pass; we must free all blocks */
>+    do {
>+	switch (passno++) {
>+        case 0: /* Search from hint->search_start to end of disk */
>+            start = hint->search_start;
>+            finish = SB_BLOCK_COUNT(s) - 1;
>+            break;
>+        case 1: /* Search from hint->beg to hint->search_start */
>+            start = hint->beg;
>+            finish = hint->search_start;
>+            break;
>+        case 2: /* Last chance: Search from 0 to hint->beg */
>+            start = 0;
>+            finish = hint->beg;
>+	    break;
>+        default: /* We've tried searching everywhere, not enough space */
>+	    /* not all blocks were successfully allocated yet*/
> 	    while (nr_allocated --)
> 		reiserfs_free_block(hint->th, new_blocknrs[nr_allocated]);
> 
> 	    return NO_DISK_SPACE;
>-	} else {		/* refine search parameters for next pass */
>-	    second_pass = 1;
>-	    finish = start;
>-	    start = 0;
>-	    continue;
>-	}
>-    }
>+        }
>+    } while((nr_allocated += allocate_without_wrapping_disk(hint, 
>+                          new_blocknrs + nr_allocated, start, finish,
>+			  amount_needed - nr_allocated, hint->prealloc_size)) 
>+			  < amount_needed) ;
>     return CARRY_ON;
> }
> 
>--- 1.41/fs/reiserfs/inode.c	Mon Jan 20 05:19:30 2003
>+++ edited/fs/reiserfs/inode.c	Thu Mar 13 10:17:09 2003
>@@ -1485,6 +1485,8 @@
>     struct cpu_key key;
>     struct item_head ih;
>     struct stat_data sd;
>+    __u32 packing;
>+    __u32 checked_packing = 0;
>     int retval;
>     int err ;
>   
>@@ -1503,7 +1505,8 @@
> 	    inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
> 
>     /* item head of new item */
>-    ih.ih_key.k_dir_id = INODE_PKEY (dir)->k_objectid;
>+    packing = INODE_PKEY(dir)->k_dir_id;
>+    ih.ih_key.k_dir_id = packing;
>     ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
>     if (!ih.ih_key.k_objectid) {
> 	err = -ENOMEM ;
>@@ -1541,6 +1544,7 @@
>     else
> 	make_le_item_head (&ih, 0, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
> 
>+packing_restart:
>     /* key to search for correct place for new stat data */
>     _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
> 		   le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
>@@ -1555,6 +1559,14 @@
> 	pathrelse (&path_to_key);
> 	err = -EEXIST;
> 	goto out_bad_inode;
>+    }
>+    if (!checked_packing) {
>+	checked_packing = choose_packing(dir, &path_to_key);
>+	if (checked_packing != packing) {
>+	    ih.ih_key.k_dir_id = checked_packing;
>+	    pathrelse(&path_to_key);
>+	    goto packing_restart;
>+	}
>     }
> 
>     if (old_format_only (sb)) {
>--- 1.20/fs/reiserfs/stree.c	Fri Aug  9 11:22:33 2002
>+++ edited/fs/reiserfs/stree.c	Thu Mar 13 10:16:06 2003
>@@ -298,6 +298,47 @@
> /* Maximal possible key. It is never in the tree. */
> const struct key  MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff, 0xffffffff},}};
> 
>+static int packing_locality_span(struct super_block *sb, 
>+                                 struct path *path, __u32 cpu_packing) 
>+{
>+    int path_offset;
>+    int pos;
>+    struct buffer_head *bh;
>+    struct key *key;
>+
>+    path_offset = path->path_length;
>+    pos = PATH_OFFSET_POSITION(path, path_offset); 
>+    bh = PATH_OFFSET_PBUFFER(path, path_offset);
>+
>+    if (pos) {
>+	/* check left */
>+	key = B_N_PDELIM_KEY(bh, pos - 1);
>+	if (le32_to_cpu(key->k_dir_id) != cpu_packing) {
>+	    return 0;
>+	}
>+    }
>+    if (pos != B_NR_ITEMS(bh)) {
>+	key = B_N_PDELIM_KEY(bh, pos + 1);
>+	if (le32_to_cpu(key->k_dir_id) != cpu_packing) {
>+	    return 0;
>+	}
>+    }
>+    return 1;
>+}
>+
>+/* returns an le32 packing locality */
>+__u32 choose_packing(struct inode *parent, struct path *path)
>+{
>+    __u32 grandparent;
>+    int levels = path->span;
>+
>+    grandparent = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
>+
>+    if (levels <= (DISK_LEAF_NODE_LEVEL + 1)) {
>+        return INODE_PKEY(parent)->k_dir_id;
>+    }
>+    return INODE_PKEY(parent)->k_objectid;
>+}
> 
> /* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom
>    of the path, and going upwards.  We must check the path's validity at each step.  If the key is not in
>@@ -757,6 +807,16 @@
> 	/* ok, we have acquired next formatted node in the tree */
> 	n_node_level = B_LEVEL (p_s_bh);
> 
>+	if (n_node_level != DISK_LEAF_NODE_LEVEL &&
>+	    !packing_locality_span(p_s_sb, p_s_search_path, 
>+	                           p_s_key->on_disk_key.k_dir_id)) 
>+        {
>+	    /* span is the deepest level in the tree where the same packing
>+	     * locality wasn't found in the adjacent pointers
>+	     */
>+	    p_s_search_path->span = n_node_level;
>+	}
>+
> 	PROC_INFO_BH_STAT( p_s_sb, p_s_bh, n_node_level - 1 );
> 
> 	RFALSE( n_node_level < n_stop_level,
>--- 1.27/fs/reiserfs/super.c	Wed Oct 30 11:42:36 2002
>+++ edited/fs/reiserfs/super.c	Tue Mar 11 10:49:36 2003
>@@ -1121,17 +1121,16 @@
>     int old_magic;
>     struct reiserfs_super_block * rs;
> 
>-
>     memset (&s->u.reiserfs_sb, 0, sizeof (struct reiserfs_sb_info));
> 
>     /* Set default values for options: non-aggressive tails */
>     s->u.reiserfs_sb.s_mount_opt = ( 1 << REISERFS_SMALLTAIL );
>-    /* default block allocator option: skip_busy */
>-    s->u.reiserfs_sb.s_alloc_options.bits = ( 1 << 5);
>     /* If file grew past 4 blocks, start preallocation blocks for it. */
>     s->u.reiserfs_sb.s_alloc_options.preallocmin = 4;
>     /* Preallocate by 8 blocks (9-1) at once */
>     s->u.reiserfs_sb.s_alloc_options.preallocsize = 9;
>+    /* setup default block allocator options */
>+    reiserfs_init_alloc_options(s);
> 
>     if (reiserfs_parse_options (s, (char *) data, &(s->u.reiserfs_sb.s_mount_opt), &blocks) == 0) {
> 	return NULL;
>===== include/linux/reiserfs_fs.h 1.26 vs edited =====
>--- 1.26/include/linux/reiserfs_fs.h	Mon Jan 20 05:19:30 2003
>+++ edited/include/linux/reiserfs_fs.h	Thu Mar 13 10:16:06 2003
>@@ -1106,6 +1106,7 @@
>   int                   path_length;                      	/* Length of the array above.   */
>   struct  path_element  path_elements[EXTENDED_MAX_HEIGHT];	/* Array of the path elements.  */
>   int			pos_in_item;
>+  int			span;
> };
> 
> #define pos_in_item(path) ((path)->pos_in_item)
>@@ -1684,6 +1685,7 @@
> 								const struct cpu_key * p_s_cpu_key, 
> 								struct path * p_s_search_path);
> extern inline void decrement_bcount (struct buffer_head * p_s_bh);
>+__u32 choose_packing(struct inode *, struct path *);
> void decrement_counters_in_path (struct path * p_s_search_path);
> void pathrelse (struct path * p_s_search_path);
> int reiserfs_check_path(struct path *p) ;
>@@ -1953,6 +1955,7 @@
> typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
> 
> int reiserfs_parse_alloc_options (struct super_block *, char *);
>+void reiserfs_init_alloc_options (struct super_block *s);
> int is_reusable (struct super_block * s, unsigned long block, int bit_value);
> void reiserfs_free_block (struct reiserfs_transaction_handle *th, unsigned long);
> int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);
>
>
>
>
>
>
>
>
>  
>


-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-14  0:15         ` Hans Reiser
@ 2003-03-14  1:34           ` Chris Mason
  2003-03-14 10:26             ` Hans Reiser
  2003-03-14 13:59             ` Manuel Krause
  0 siblings, 2 replies; 42+ messages in thread
From: Chris Mason @ 2003-03-14  1:34 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Oleg Drokin, reiserfs-list

On Thu, 2003-03-13 at 19:15, Hans Reiser wrote:

> >
> >Ok, more testing showed that patch wasn't bad, but wasn't great either. 
> >Under a multiprocess test, the buffers in the path would get moved away
> >and there wasn't enough data to make a good decision.  So the patch
> >below changes things around a little, and records the span during
> >search_by_key instead of trying to compute it after the search is over.
> >
> Hunh?  Why don't you maintain a counter in the directory of the number 
> of nodes in it?  Or are you afraid of causing extra IO?
> 

That would mean the parent directory counter would have to be updated
every time we allocated a block in any sub directory.  Plus the counter
would have to be inherited down the chain in deep directory structures. 
More importantly, I'd rather not waste space in the stat data to store
the information when we can get it during a search ;-)

Basing the span on the tree allows us to detect a highly used locality
regardless of what kind of tree object was using it, and we can do it
with almost zero overhead.

> >In other words, I think this is a really good compromise between the
> >current defaults and a more optimal case for fragmentation, and I expect
> >its performance to hold up much better as the filesystem ages.
> >
> Let's get lots of different testers.  You may have a nice heuristic here 
> though....
> 

If everyone agrees the approach is worth trying, I'll make a patch that
enables it via a mount option.

> How big are your packing localities tending to be?

Not more than can be pointed to by the leaf level and the level directly
above it.  I know that's not very specific, but it varies by the
dataset.  packed tails and long directory names lead to more packing
localities per MB.

My 100MB subset of /usr/share/doc has 732 directories in it, and the
copy phase of stress.sh -n 10 -s for that directory tree will produce
1400 packing localities, so it depends on size of the tree in general.

-chris

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-14  1:34           ` Chris Mason
@ 2003-03-14 10:26             ` Hans Reiser
  2003-03-14 13:51               ` Chris Mason
  2003-03-14 13:59             ` Manuel Krause
  1 sibling, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-14 10:26 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>On Thu, 2003-03-13 at 19:15, Hans Reiser wrote:
>
>  
>
>>>Ok, more testing showed that patch wasn't bad, but wasn't great either. 
>>>Under a multiprocess test, the buffers in the path would get moved away
>>>and there wasn't enough data to make a good decision.  So the patch
>>>below changes things around a little, and records the span during
>>>search_by_key instead of trying to compute it after the search is over.
>>>
>>>      
>>>
>>Hunh?  Why don't you maintain a counter in the directory of the number 
>>of nodes in it?  Or are you afraid of causing extra IO?
>>
>>    
>>
>
>That would mean the parent directory counter would have to be updated
>every time we allocated a block in any sub directory.  Plus the counter
>would have to be inherited down the chain in deep directory structures. 
>More importantly, I'd rather not waste space in the stat data to store
>the information when we can get it during a search ;-)
>
The space usage is trivial.

>
>Basing the span on the tree allows us to detect a highly used locality
>regardless of what kind of tree object was using it, and we can do it
>with almost zero overhead.
>
>  
>
>>>In other words, I think this is a really good compromise between the
>>>current defaults and a more optimal case for fragmentation, and I expect
>>>its performance to hold up much better as the filesystem ages.
>>>
>>>      
>>>
>>Let's get lots of different testers.  You may have a nice heuristic here 
>>though....
>>
>>    
>>
>
>If everyone agrees the approach is worth trying, I'll make a patch that
>enables it via a mount option.
>
>  
>
>>How big are your packing localities tending to be?
>>    
>>
>
>Not more than can be pointed to by the leaf level and the level directly
>above it.  I know that's not very specific, but it varies by the
>dataset.  packed tails and long directory names lead to more packing
>localities per MB.
>
Which is why it is the wrong measure, yes?

>
>My 100MB subset of /usr/share/doc has 732 directories in it, and the
>copy phase of stress.sh -n 10 -s for that directory tree will produce
>1400 packing localities, so it depends on size of the tree in general.
>
>-chris
>
>
>
>
>  
>
I am not saying that storing it in the directory is the right thing, 
because of the risk of extra IO, but I think it should be tried.

-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-14 10:26             ` Hans Reiser
@ 2003-03-14 13:51               ` Chris Mason
  2003-03-14 18:59                 ` Hans Reiser
  0 siblings, 1 reply; 42+ messages in thread
From: Chris Mason @ 2003-03-14 13:51 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Oleg Drokin, reiserfs-list

On Fri, 2003-03-14 at 05:26, Hans Reiser wrote:

> >That would mean the parent directory counter would have to be updated
> >every time we allocated a block in any sub directory.  Plus the counter
> >would have to be inherited down the chain in deep directory structures. 
> >More importantly, I'd rather not waste space in the stat data to store
> >the information when we can get it during a search ;-)
> >
> The space usage is trivial.
> 

Grin, who are you and what have you done with the real hans ;-)  It's
two fields, one for the counter and one to point up the chain to the
real owner.  It's yet another field to maintain as objects are deleted
and created, a minor format change since old filesystem stat data won't
have the field, and requires support from fsck.

All of which is a lot of work when we can get similar info directly from
the tree.

> >
> >>How big are your packing localities tending to be?
> >
> >Not more than can be pointed to by the leaf level and the level directly
> >above it.  I know that's not very specific, but it varies by the
> >dataset.  packed tails and long directory names lead to more packing
> >localities per MB.
> >
> Which is why it is the wrong measure, yes?
> 

Well, yes and no.  The packing locality groups tree objects, and so the
idea behind the patch is to group all tree objects when they are part of
a directory tree that isn't very large.  A smart block allocator for the
tree nodes can use this information too.

In other words, my hope is this patch also makes btree searches more
efficient while walking a given directory tree, since we aren't jumping
all over the btree for each subdirectory.

-chris

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-14 13:51               ` Chris Mason
@ 2003-03-14 18:59                 ` Hans Reiser
  2003-03-14 20:40                   ` Chris Mason
  0 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-03-14 18:59 UTC (permalink / raw)
  To: Chris Mason; +Cc: Oleg Drokin, reiserfs-list

Chris Mason wrote:

>On Fri, 2003-03-14 at 05:26, Hans Reiser wrote:
>
>  
>
>>>That would mean the parent directory counter would have to be updated
>>>every time we allocated a block in any sub directory.  Plus the counter
>>>would have to be inherited down the chain in deep directory structures. 
>>>More importantly, I'd rather not waste space in the stat data to store
>>>the information when we can get it during a search ;-)
>>>
>>>      
>>>
>>The space usage is trivial.
>>
>>    
>>
>
>Grin, who are you and what have you done with the real hans ;-) 
>
You don't need it for every file, you need it for every directory.

> It's
>two fields, one for the counter and one to point up the chain to the
>real owner.  It's yet another field to maintain as objects are deleted
>and created, 
>
or written to or truncated, yes, the cost of lots of updates to this are 
worrying.    It might be better done in the repacker than dynamically, 
in fact I just convinced myself of that, how about you.....?

>a minor format change since old filesystem stat data won't
>have the field, and requires support from fsck.
>
Nobody will mind if we change reiser4 format now....

>
>All of which is a lot of work when we can get similar info directly from
>the tree.
>
>  
>
>>>>How big are your packing localities tending to be?
>>>>        
>>>>
>>>Not more than can be pointed to by the leaf level and the level directly
>>>above it.  I know that's not very specific, but it varies by the
>>>dataset.  packed tails and long directory names lead to more packing
>>>localities per MB.
>>>
>>>      
>>>
>>Which is why it is the wrong measure, yes?
>>
>>    
>>
>
>Well, yes and no.  The packing locality groups tree objects, and so the
>idea behind the patch is to group all tree objects when they are part of
>a directory tree that isn't very large.  A smart block allocator for the
>tree nodes can use this information too.
>
>In other words, my hope is this patch also makes btree searches more
>efficient while walking a given directory tree, since we aren't jumping
>all over the btree for each subdirectory.
>
>-chris
>
>
>
>
>  
>


-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-14 18:59                 ` Hans Reiser
@ 2003-03-14 20:40                   ` Chris Mason
  0 siblings, 0 replies; 42+ messages in thread
From: Chris Mason @ 2003-03-14 20:40 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Oleg Drokin, reiserfs-list

On Fri, 2003-03-14 at 13:59, Hans Reiser wrote:

> >Grin, who are you and what have you done with the real hans ;-) 
> >
> You don't need it for every file, you need it for every directory.
> 

How does a given file know which directory to update, you'd need a
pointer in each file to the corresponding key of the directory where the
accounting is actually done.

> > It's
> >two fields, one for the counter and one to point up the chain to the
> >real owner.  It's yet another field to maintain as objects are deleted
> >and created, 
> >
> or written to or truncated, yes, the cost of lots of updates to this are 
> worrying.    It might be better done in the repacker than dynamically, 
> in fact I just convinced myself of that, how about you.....?
> 
> >a minor format change since old filesystem stat data won't
> >have the field, and requires support from fsck.
> >
> Nobody will mind if we change reiser4 format now....
> 

I'm also concerned with improving the block allocator for reiserfs v3,
there's no reason to keep the existing one if minor changes can make it
substantially better.

-chris



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-14  1:34           ` Chris Mason
  2003-03-14 10:26             ` Hans Reiser
@ 2003-03-14 13:59             ` Manuel Krause
  2003-03-14 14:10               ` Chris Mason
  1 sibling, 1 reply; 42+ messages in thread
From: Manuel Krause @ 2003-03-14 13:59 UTC (permalink / raw)
  To: Chris Mason; +Cc: reiserfs-list

On 03/14/2003 02:34 AM, Chris Mason wrote:
> On Thu, 2003-03-13 at 19:15, Hans Reiser wrote:
> 

[ discussion on how to implement lower fragmentation on ReiserFS ]

>>
>>Let's get lots of different testers.  You may have a nice heuristic here 
>>though....
>>
> 
> 
> If everyone agrees the approach is worth trying, I'll make a patch that
> enables it via a mount option.
> 
[...]

A dumb question inbetween: How do we - possible testers, users - get 
information about fragmentation on our ReiserFS partitions?

Thanks,

Manuel


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-14 13:59             ` Manuel Krause
@ 2003-03-14 14:10               ` Chris Mason
  0 siblings, 0 replies; 42+ messages in thread
From: Chris Mason @ 2003-03-14 14:10 UTC (permalink / raw)
  To: Manuel Krause; +Cc: reiserfs-list

On Fri, 2003-03-14 at 08:59, Manuel Krause wrote:
> On 03/14/2003 02:34 AM, Chris Mason wrote:
> > On Thu, 2003-03-13 at 19:15, Hans Reiser wrote:
> > 
> 
> [ discussion on how to implement lower fragmentation on ReiserFS ]
> 
> >>
> >>Let's get lots of different testers.  You may have a nice heuristic here 
> >>though....
> >>
> > 
> > 
> > If everyone agrees the approach is worth trying, I'll make a patch that
> > enables it via a mount option.
> > 
> [...]
> 
> A dumb question inbetween: How do we - possible testers, users - get 
> information about fragmentation on our ReiserFS partitions?

The best tool I've seen so far originally came from Vladimir and was
modified for a study on fragmentation of reiserfs and ext2, Jeff found
the link somewhere in his archives:

http://www.informatik.uni-frankfurt.de/~loizides/reiserfs/index.html

There is also a filesystem aging tools there that I haven't played with
yet.

-chris



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-12 20:51     ` Hans Reiser
  2003-03-13 15:59       ` Chris Mason
@ 2003-03-16 16:25       ` Anders Widman
  2003-08-18 16:15         ` Hans Reiser
  1 sibling, 1 reply; 42+ messages in thread
From: Anders Widman @ 2003-03-16 16:25 UTC (permalink / raw)
  To: reiserfs-list

...
>defragmenting tools are nice and all, but before we can write one we
>actually have to pick a disk layout for the tool to create ;-)

   Yes  and no :). A simple packer that defragments files, and perhaps
   move  files  that  are  in the same directory together. This can be
   done   without  knowing  usage  statistics  or  need  any  special
   algorithms?

   //Anders


--------
PGP public key: https://tnonline.net/secure/pgp_key.txt


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-03-16 16:25       ` Anders Widman
@ 2003-08-18 16:15         ` Hans Reiser
  2003-08-18 16:20           ` Yury Umanets
  0 siblings, 1 reply; 42+ messages in thread
From: Hans Reiser @ 2003-08-18 16:15 UTC (permalink / raw)
  To: Anders Widman; +Cc: reiserfs-list

Anders Widman wrote:

>...
>  
>
>>defragmenting tools are nice and all, but before we can write one we
>>actually have to pick a disk layout for the tool to create ;-)
>>    
>>
>
>   Yes  and no :). A simple packer that defragments files, and perhaps
>   move  files  that  are  in the same directory together. This can be
>   done   without  knowing  usage  statistics  or  need  any  special
>   algorithms?
>
>   //Anders
>
>
>--------
>PGP public key: https://tnonline.net/secure/pgp_key.txt
>
>
>
>  
>
for reiser4 we simply put nodes in tree order

-- 
Hans



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] various allocator optimizations
  2003-08-18 16:15         ` Hans Reiser
@ 2003-08-18 16:20           ` Yury Umanets
  0 siblings, 0 replies; 42+ messages in thread
From: Yury Umanets @ 2003-08-18 16:20 UTC (permalink / raw)
  To: Hans Reiser; +Cc: Anders Widman, reiserfs-list

On Mon, 2003-08-18 at 20:15, Hans Reiser wrote:
> Anders Widman wrote:
> 
> >...
> >  
> >
> >>defragmenting tools are nice and all, but before we can write one we
> >>actually have to pick a disk layout for the tool to create ;-)
> >>    
> >>
> >
> >   Yes  and no :). A simple packer that defragments files, and perhaps
> >   move  files  that  are  in the same directory together. This can be
> >   done   without  knowing  usage  statistics  or  need  any  special
> >   algorithms?
> >
> >   //Anders
> >
> >
> >--------
> >PGP public key: https://tnonline.net/secure/pgp_key.txt

If you need this for reiser3, pay to Hans and we will implement it. Or
implement it by yourself using reiserfsprogs or progsreiserfs sources.

> >
> >
> >
> >  
> >
> for reiser4 we simply put nodes in tree order


^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2003-08-18 16:20 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2003-03-11 16:34 [PATCH] various allocator optimizations Chris Mason
2003-03-11 16:42 ` Oleg Drokin
2003-03-11 17:32   ` Chris Mason
2003-03-11 18:04     ` Oleg Drokin
2003-03-11 19:00       ` Chris Mason
2003-03-11 21:51         ` Hans Reiser
2003-03-11 21:42     ` Hans Reiser
2003-03-11 22:25       ` Chris Mason
2003-03-11 22:39         ` Anders Widman
2003-03-11 22:54           ` Hans Reiser
2003-03-11 23:19             ` Anders Widman
2003-03-12  7:15               ` Oleg Drokin
2003-03-11 22:46         ` Hans Reiser
2003-03-12  1:48           ` Chris Mason
2003-03-12  7:12             ` Oleg Drokin
2003-03-12 13:31               ` Chris Mason
2003-03-12 14:00                 ` Hans Reiser
2003-03-12 14:05                   ` Oleg Drokin
2003-03-12 14:08                     ` Hans Reiser
2003-03-12 14:17                       ` Oleg Drokin
2003-03-12 19:22                         ` Hans Reiser
2003-03-13  6:11                           ` Oleg Drokin
2003-03-13 12:06                             ` Hans Reiser
2003-03-13 12:10                               ` Oleg Drokin
2003-03-12 11:12             ` Hans Reiser
2003-03-12 13:35               ` Chris Mason
2003-03-12 14:03                 ` Hans Reiser
2003-03-12  7:14       ` Oleg Drokin
2003-03-12 19:57   ` Chris Mason
2003-03-12 20:51     ` Hans Reiser
2003-03-13 15:59       ` Chris Mason
2003-03-14  0:15         ` Hans Reiser
2003-03-14  1:34           ` Chris Mason
2003-03-14 10:26             ` Hans Reiser
2003-03-14 13:51               ` Chris Mason
2003-03-14 18:59                 ` Hans Reiser
2003-03-14 20:40                   ` Chris Mason
2003-03-14 13:59             ` Manuel Krause
2003-03-14 14:10               ` Chris Mason
2003-03-16 16:25       ` Anders Widman
2003-08-18 16:15         ` Hans Reiser
2003-08-18 16:20           ` Yury Umanets

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.