* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Bob Peterson @ 2011-10-04 16:39 UTC
To: cluster-devel.redhat.com
Hi,
This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.
Regards,
Bob Peterson
Red Hat File Systems
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
--
fs/gfs2/dir.c | 33 +++++++++++++++++++++++++++++++++
1 files changed, 33 insertions(+), 0 deletions(-)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..9b4262e 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1376,6 +1376,37 @@ out:
return error;
}
+static void gfs2_dir_readahead(struct inode *inode, __be64 *ht, unsigned hsize,
+ u32 index)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count = 0;
+
+ while (index < hsize) {
+ last = blocknr;
+ blocknr = be64_to_cpu(ht[index++]);
+ if (blocknr == last)
+ continue;
+ count++;
+ if (count > 128)
+ break;
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}
/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1406,6 +1437,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
if (IS_ERR(lp))
return PTR_ERR(lp);
+ gfs2_dir_readahead(inode, lp, hsize, index);
+
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
&copied, &depth,
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Steven Whitehouse @ 2011-10-05 9:09 UTC
To: cluster-devel.redhat.com
Hi,
Were we not intending to make this turn itself off in cases where it
produces no benefit? I thought the plan was to track the state via the
file descriptor in order to avoid reading ahead the same blocks over and
over again too,
Steve.
On Tue, 2011-10-04 at 12:39 -0400, Bob Peterson wrote:
> Hi,
>
> This patch adds read-ahead capability to GFS2's
> directory hash table management. It greatly improves
> performance for some directory operations. For example:
> In one of my file systems that has 1000 directories, each
> of which has 1000 files, time to execute a recursive
> ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
> from 2m2.814s on a stock kernel to 0m45.938s.
>
> Regards,
>
> Bob Peterson
> Red Hat File Systems
>
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> --
> fs/gfs2/dir.c | 33 +++++++++++++++++++++++++++++++++
> 1 files changed, 33 insertions(+), 0 deletions(-)
>
> diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
> index 2045d70..9b4262e 100644
> --- a/fs/gfs2/dir.c
> +++ b/fs/gfs2/dir.c
> @@ -1376,6 +1376,37 @@ out:
> return error;
> }
>
> +static void gfs2_dir_readahead(struct inode *inode, __be64 *ht, unsigned hsize,
> + u32 index)
> +{
> + struct gfs2_inode *ip = GFS2_I(inode);
> + struct gfs2_glock *gl = ip->i_gl;
> + struct buffer_head *bh;
> + u64 blocknr = 0, last;
> + unsigned count = 0;
> +
> + while (index < hsize) {
> + last = blocknr;
> + blocknr = be64_to_cpu(ht[index++]);
> + if (blocknr == last)
> + continue;
> + count++;
> + if (count > 128)
> + break;
> + bh = gfs2_getbuf(gl, blocknr, 1);
> + if (trylock_buffer(bh)) {
> + if (buffer_uptodate(bh)) {
> + unlock_buffer(bh);
> + brelse(bh);
> + continue;
> + }
> + bh->b_end_io = end_buffer_read_sync;
> + submit_bh(READA | REQ_META, bh);
> + continue;
> + }
> + brelse(bh);
> + }
> +}
>
> /**
> * dir_e_read - Reads the entries from a directory into a filldir buffer
> @@ -1406,6 +1437,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
> if (IS_ERR(lp))
> return PTR_ERR(lp);
>
> + gfs2_dir_readahead(inode, lp, hsize, index);
> +
> while (index < hsize) {
> error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
> &copied, &depth,
>
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Bob Peterson @ 2011-10-06 16:15 UTC
To: cluster-devel.redhat.com
----- Original Message -----
| Hi,
|
| Were we not intending to make this turn itself off in cases where it
| produces no benefit? I thought the plan was to track the state via
| the
| file descriptor in order to avoid reading ahead the same blocks over
| and
| over again too,
|
| Steve.
Hi Steve,
Based on our discussion, I revised the patch to implement a
scheme whereby we don't duplicate read-ahead requests. However,
I decided against using the file descriptor in favor of a
simple bitmap that keeps track of where we've done read-ahead.
So now instead of a hash table in the inode, I have a structure
that contains two pointers: (1) hash table as before, and
(2) the bitmap to keep track of read-ahead requests.
I'm very happy with the results. I've got a file system with
500,000 files in a single directory. On my RHEL5 system, when
I perform the command: "time ls -fR /mnt/gfs2 > /dev/null" I
get these times:
Stock pre-release 5.8: 0m34.481s
Previously posted patch: 0m18.811s
With this patch: 0m9.843s
which is more than three times faster, and nearly twice as
fast as the version I previously posted. I haven't tried my
"1000x1000" test on it yet.
The upstream version of the patch is given below.
There is one noticeable difference between RHEL5 and upstream:
I changed kmalloc to vmalloc. I'm not sure of your feelings on that.
Also, I have a feeling you're going to ask me to remove the
#define MAX_RA_BLOCKS and hard-code it to 32. What can I say?
Regards,
Bob Peterson
Red Hat File Systems
--
commit 19ec5ffb5eaf915944861f8ebd4c065818a238c5
Author: Bob Peterson <rpeterso@redhat.com>
Date: Thu Oct 6 10:34:05 2011 -0500
GFS2: Add readahead to sequential directory traversal
This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..1c89ae0 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -329,48 +329,58 @@ fail:
* gfs2_dir_get_hash_table - Get pointer to the dir hash table
* @ip: The inode in question
*
- * Returns: The hash table or an error
+ * Returns: an error code
*/
-static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
+static int gfs2_dir_get_hash_table(struct gfs2_inode *ip)
{
struct inode *inode = &ip->i_inode;
int ret;
- u32 hsize;
+ u32 hsize, rasize;
__be64 *hc;
+ u8 *ra;
BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH));
- hc = ip->i_hash_cache;
- if (hc)
- return hc;
+ if (ip->i_hash_cache.h_ht)
+ return 0;
hsize = 1 << ip->i_depth;
+ rasize = hsize / 8; /* bit array */
hsize *= sizeof(__be64);
if (hsize != i_size_read(&ip->i_inode)) {
gfs2_consist_inode(ip);
- return ERR_PTR(-EIO);
+ return -EIO;
}
- hc = kmalloc(hsize, GFP_NOFS);
- ret = -ENOMEM;
+ hc = vmalloc(hsize);
if (hc == NULL)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
+
+ ra = vzalloc(rasize);
+ if (ra == NULL) {
+ vfree(hc);
+ return -ENOMEM;
+ }
ret = gfs2_dir_read_data(ip, hc, hsize);
if (ret < 0) {
- kfree(hc);
- return ERR_PTR(ret);
+ vfree(hc);
+ vfree(ra);
+ return ret;
}
spin_lock(&inode->i_lock);
- if (ip->i_hash_cache)
- kfree(hc);
- else
- ip->i_hash_cache = hc;
+ if (ip->i_hash_cache.h_ht) {
+ vfree(hc);
+ vfree(ra);
+ } else {
+ ip->i_hash_cache.h_ht = hc;
+ ip->i_hash_cache.h_ra = ra;
+ }
spin_unlock(&inode->i_lock);
- return ip->i_hash_cache;
+ return 0;
}
/**
@@ -381,9 +391,12 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
*/
void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
- __be64 *hc = ip->i_hash_cache;
- ip->i_hash_cache = NULL;
- kfree(hc);
+ __be64 *hc = ip->i_hash_cache.h_ht;
+ u8 *h_ra = ip->i_hash_cache.h_ra;
+ ip->i_hash_cache.h_ht = NULL;
+ ip->i_hash_cache.h_ra = NULL;
+ vfree(hc);
+ vfree(h_ra);
}
static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -730,14 +743,15 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
* Returns: 0 on success, error code otherwise
*/
-static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
- u64 *leaf_out)
+static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out)
{
+ int error;
__be64 *hash;
- hash = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(hash))
- return PTR_ERR(hash);
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
+ hash = dip->i_hash_cache.h_ht;
*leaf_out = be64_to_cpu(*(hash + index));
return 0;
}
@@ -1097,27 +1111,35 @@ fail_brelse:
static int dir_double_exhash(struct gfs2_inode *dip)
{
struct buffer_head *dibh;
- u32 hsize;
+ u32 hsize, rasize;
u32 hsize_bytes;
- __be64 *hc;
- __be64 *hc2, *h;
+ __be64 *hc, *hc2, *h;
+ u8 *ra;
int x;
- int error = 0;
+ int error;
hsize = 1 << dip->i_depth;
+ rasize = hsize / 8; /* bit array */
hsize_bytes = hsize * sizeof(__be64);
- hc = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(hc))
- return PTR_ERR(hc);
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
- h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS);
- if (!hc2)
+ hc = dip->i_hash_cache.h_ht;
+ h = hc2 = vmalloc(hsize_bytes * 2);
+ if (h == NULL)
return -ENOMEM;
+ ra = vzalloc(rasize);
+ if (ra == NULL) {
+ error = -ENOMEM;
+ goto out_vfree;
+ }
+
error = gfs2_meta_inode_buffer(dip, &dibh);
if (error)
- goto out_kfree;
+ goto out_rafree;
for (x = 0; x < hsize; x++) {
*h++ = *hc;
@@ -1130,7 +1152,8 @@ static int dir_double_exhash(struct gfs2_inode *dip)
goto fail;
gfs2_dir_hash_inval(dip);
- dip->i_hash_cache = hc2;
+ dip->i_hash_cache.h_ht = hc2;
+ dip->i_hash_cache.h_ra = ra;
dip->i_depth++;
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
@@ -1142,8 +1165,11 @@ fail:
i_size_write(&dip->i_inode, hsize_bytes);
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
-out_kfree:
- kfree(hc2);
+
+out_rafree:
+ vfree(ra);
+out_vfree:
+ vfree(hc2);
return error;
}
@@ -1376,6 +1402,53 @@ out:
return error;
}
+#define MAX_RA_BLOCKS 32
+
+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count = 0;
+ int ra_byte, ra_bit;
+
+ while (index < hsize) {
+ ra_byte = index / 8;
+ ra_bit = index % 8;
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache.h_ht[index++]);
+ if (blocknr == last)
+ continue;
+ count++;
+ if (count > MAX_RA_BLOCKS)
+ break;
+
+ /* figure out if we've already requested this block */
+ if ((ip->i_hash_cache.h_ra[ra_byte] >> ra_bit) & 0x01)
+ continue;
+
+ ip->i_hash_cache.h_ra[ra_byte] |= (1 << ra_bit);
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}
/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1391,20 +1464,23 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
filldir_t filldir)
{
struct gfs2_inode *dip = GFS2_I(inode);
- u32 hsize, len = 0;
+ u32 hsize, len;
u32 hash, index;
__be64 *lp;
int copied = 0;
- int error = 0;
+ int error;
unsigned depth = 0;
hsize = 1 << dip->i_depth;
hash = gfs2_dir_offset2hash(*offset);
index = hash >> (32 - dip->i_depth);
- lp = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(lp))
- return PTR_ERR(lp);
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
+
+ lp = dip->i_hash_cache.h_ht;
+ gfs2_dir_readahead(inode, hsize, index);
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
@@ -1925,14 +2001,13 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
u32 index = 0, next_index;
__be64 *lp;
u64 leaf_no;
- int error = 0, last;
+ int error, last;
hsize = 1 << dip->i_depth;
-
- lp = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(lp))
- return PTR_ERR(lp);
-
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
+ lp = dip->i_hash_cache.h_ht;
while (index < hsize) {
leaf_no = be64_to_cpu(lp[index]);
if (leaf_no) {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37..731d763 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -271,6 +271,11 @@ enum {
};
+struct gfs2_hash_table {
+ __be64 *h_ht; /* hash table data */
+ u8 *h_ra; /* array indicating whether hash table block was read */
+};
+
struct gfs2_inode {
struct inode i_inode;
u64 i_no_addr;
@@ -285,7 +290,7 @@ struct gfs2_inode {
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
struct list_head i_trunc_list;
- __be64 *i_hash_cache;
+ struct gfs2_hash_table i_hash_cache;
u32 i_entries;
u32 i_diskflags;
u8 i_height;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 8a139ff..a834b88 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -41,7 +41,8 @@ static void gfs2_init_inode_once(void *foo)
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
ip->i_alloc = NULL;
- ip->i_hash_cache = NULL;
+ ip->i_hash_cache.h_ht = NULL;
+ ip->i_hash_cache.h_ra = NULL;
}
static void gfs2_init_glock_once(void *foo)
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Steven Whitehouse @ 2011-10-07 11:02 UTC
To: cluster-devel.redhat.com
Hi,
On Thu, 2011-10-06 at 12:15 -0400, Bob Peterson wrote:
> ----- Original Message -----
> | Hi,
> |
> | Were we not intending to make this turn itself off in cases where it
> | produces no benefit? I thought the plan was to track the state via
> | the
> | file descriptor in order to avoid reading ahead the same blocks over
> | and
> | over again too,
> |
> | Steve.
>
> Hi Steve,
>
> Based on our discussion, I revised the patch to implement a
> scheme whereby we don't duplicate read-ahead requests. However,
> I decided against using the file descriptor in favor of a
> simple bitmap that keeps track of where we've done read-ahead.
>
> So now instead of a hash table in the inode, I have a structure
> that contains two pointers: (1) hash table as before, and
> (2) the bitmap to keep track of read-ahead requests.
>
> I'm very happy with the results. I've got a file system with
> 500,000 files in a single directory. On my RHEL5 system, when
> I perform the command: "time ls -fR /mnt/gfs2 > /dev/null" I
> get these times:
>
> Stock pre-release 5.8: 0m34.481s
> Previously posted patch: 0m18.811s
> With this patch: 0m9.843s
>
OK, that's going very much in the right direction.
> which is more than three times faster, and nearly twice as
> fast as the version I previously posted. I haven't tried my
> "1000x1000" test on it yet.
>
> The upstream version of the patch is given below.
> There is one noticeable difference between RHEL5 and upstream:
> I changed kmalloc to vmalloc. Not sure your feelings on that.
I don't want to use vmalloc in the fast path of any function. We've
already run into trouble on that before and Linus has said that we
should not use it in any performance critical paths.
There is no problem using kmalloc in upstream for larger allocations
because (a) there is no 128k limit any more and (b) the VM is much
better at keeping larger orders of pages usable under low memory
conditions.
Also, it is not a problem to return -ENOMEM if we really have to.
> Also, I have a feeling you're going to ask me to remove the
> #define MAX_RA_BLOCKS and hard-code it to 32. What can I say?
>
Using a #define for a constant like this is perfectly ok. It would be
better if it could be adjusted on the fly, since not all storage devices
will have the same characteristics as yours, but that is probably
another step beyond what is required in this patch.
> Regards,
>
> Bob Peterson
> Red Hat File Systems
> --
> commit 19ec5ffb5eaf915944861f8ebd4c065818a238c5
> Author: Bob Peterson <rpeterso@redhat.com>
> Date: Thu Oct 6 10:34:05 2011 -0500
>
> GFS2: Add readahead to sequential directory traversal
>
> This patch adds read-ahead capability to GFS2's
> directory hash table management. It greatly improves
> performance for some directory operations. For example:
> In one of my file systems that has 1000 directories, each
> of which has 1000 files, time to execute a recursive
> ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
> from 2m2.814s on a stock kernel to 0m45.938s.
>
> diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
> index 2045d70..1c89ae0 100644
> --- a/fs/gfs2/dir.c
> +++ b/fs/gfs2/dir.c
> @@ -329,48 +329,58 @@ fail:
> * gfs2_dir_get_hash_table - Get pointer to the dir hash table
> * @ip: The inode in question
> *
> - * Returns: The hash table or an error
> + * Returns: an error code
> */
>
> -static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
> +static int gfs2_dir_get_hash_table(struct gfs2_inode *ip)
> {
> struct inode *inode = &ip->i_inode;
> int ret;
> - u32 hsize;
> + u32 hsize, rasize;
> __be64 *hc;
> + u8 *ra;
>
> BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH));
>
> - hc = ip->i_hash_cache;
> - if (hc)
> - return hc;
> + if (ip->i_hash_cache.h_ht)
> + return 0;
>
> hsize = 1 << ip->i_depth;
> + rasize = hsize / 8; /* bit array */
> hsize *= sizeof(__be64);
> if (hsize != i_size_read(&ip->i_inode)) {
> gfs2_consist_inode(ip);
> - return ERR_PTR(-EIO);
> + return -EIO;
> }
>
> - hc = kmalloc(hsize, GFP_NOFS);
> - ret = -ENOMEM;
> + hc = vmalloc(hsize);
> if (hc == NULL)
> - return ERR_PTR(-ENOMEM);
> + return -ENOMEM;
> +
> + ra = vzalloc(rasize);
> + if (ra == NULL) {
> + vfree(hc);
> + return -ENOMEM;
> + }
>
> ret = gfs2_dir_read_data(ip, hc, hsize);
> if (ret < 0) {
> - kfree(hc);
> - return ERR_PTR(ret);
> + vfree(hc);
> + vfree(ra);
> + return ret;
> }
>
> spin_lock(&inode->i_lock);
> - if (ip->i_hash_cache)
> - kfree(hc);
> - else
> - ip->i_hash_cache = hc;
> + if (ip->i_hash_cache.h_ht) {
> + vfree(hc);
> + vfree(ra);
> + } else {
> + ip->i_hash_cache.h_ht = hc;
> + ip->i_hash_cache.h_ra = ra;
> + }
> spin_unlock(&inode->i_lock);
>
> - return ip->i_hash_cache;
> + return 0;
> }
Why not make the bitmap and the hash table part of the same allocation?
That way we only need a single pointer in the inode, and we halve the
number of allocations required.
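Something along these lines, say (just an untested sketch, reusing the
existing variable names, with hsize being the number of table entries):

	/* one allocation: the hash table followed by the readahead bitmap */
	hc = kmalloc(hsize * sizeof(__be64) + hsize / 8, GFP_NOFS);
	if (hc == NULL)
		return -ENOMEM;
	ra = (u8 *)(hc + hsize);	/* bitmap sits just after the table */
	memset(ra, 0, hsize / 8);

Then gfs2_dir_hash_inval() only has a single free to do, and the inode
keeps a single pointer.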
What is the lifetime of the bitmap though? I'm not sure that I like the
idea of making it an inode-specific data structure, but see later...
>
> /**
> @@ -381,9 +391,12 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
> */
> void gfs2_dir_hash_inval(struct gfs2_inode *ip)
> {
> - __be64 *hc = ip->i_hash_cache;
> - ip->i_hash_cache = NULL;
> - kfree(hc);
> + __be64 *hc = ip->i_hash_cache.h_ht;
> + u8 *h_ra = ip->i_hash_cache.h_ra;
> + ip->i_hash_cache.h_ht = NULL;
> + ip->i_hash_cache.h_ra = NULL;
> + vfree(hc);
> + vfree(h_ra);
> }
This then wouldn't need to be changed.
>
> static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
> @@ -730,14 +743,15 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
> * Returns: 0 on success, error code otherwise
> */
>
> -static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
> - u64 *leaf_out)
> +static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out)
> {
> + int error;
> __be64 *hash;
>
> - hash = gfs2_dir_get_hash_table(dip);
> - if (IS_ERR(hash))
> - return PTR_ERR(hash);
> + error = gfs2_dir_get_hash_table(dip);
> + if (error)
> + return error;
> + hash = dip->i_hash_cache.h_ht;
> *leaf_out = be64_to_cpu(*(hash + index));
> return 0;
> }
> @@ -1097,27 +1111,35 @@ fail_brelse:
> static int dir_double_exhash(struct gfs2_inode *dip)
> {
> struct buffer_head *dibh;
> - u32 hsize;
> + u32 hsize, rasize;
> u32 hsize_bytes;
> - __be64 *hc;
> - __be64 *hc2, *h;
> + __be64 *hc, *hc2, *h;
> + u8 *ra;
> int x;
> - int error = 0;
> + int error;
>
> hsize = 1 << dip->i_depth;
> + rasize = hsize / 8; /* bit array */
> hsize_bytes = hsize * sizeof(__be64);
>
> - hc = gfs2_dir_get_hash_table(dip);
> - if (IS_ERR(hc))
> - return PTR_ERR(hc);
> + error = gfs2_dir_get_hash_table(dip);
> + if (error)
> + return error;
>
> - h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS);
> - if (!hc2)
> + hc = dip->i_hash_cache.h_ht;
> + h = hc2 = vmalloc(hsize_bytes * 2);
> + if (h == NULL)
> return -ENOMEM;
>
> + ra = vzalloc(rasize);
> + if (ra == NULL) {
> + error = -ENOMEM;
> + goto out_vfree;
> + }
> +
> error = gfs2_meta_inode_buffer(dip, &dibh);
> if (error)
> - goto out_kfree;
> + goto out_rafree;
>
> for (x = 0; x < hsize; x++) {
> *h++ = *hc;
> @@ -1130,7 +1152,8 @@ static int dir_double_exhash(struct gfs2_inode *dip)
> goto fail;
>
> gfs2_dir_hash_inval(dip);
> - dip->i_hash_cache = hc2;
> + dip->i_hash_cache.h_ht = hc2;
> + dip->i_hash_cache.h_ra = ra;
> dip->i_depth++;
> gfs2_dinode_out(dip, dibh->b_data);
> brelse(dibh);
> @@ -1142,8 +1165,11 @@ fail:
> i_size_write(&dip->i_inode, hsize_bytes);
> gfs2_dinode_out(dip, dibh->b_data);
> brelse(dibh);
> -out_kfree:
> - kfree(hc2);
> +
> +out_rafree:
> + vfree(ra);
> +out_vfree:
> + vfree(hc2);
> return error;
> }
>
> @@ -1376,6 +1402,53 @@ out:
> return error;
> }
>
> +#define MAX_RA_BLOCKS 32
> +
> +/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
> + *
> + * Note: we can't calculate each index like dir_e_read can because we don't
> + * have the leaf, and therefore we don't have the depth, and therefore we
> + * don't have the length. So we have to just read enough ahead to make up
> + * for the loss of information. */
> +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index)
> +{
> + struct gfs2_inode *ip = GFS2_I(inode);
> + struct gfs2_glock *gl = ip->i_gl;
> + struct buffer_head *bh;
> + u64 blocknr = 0, last;
> + unsigned count = 0;
> + int ra_byte, ra_bit;
> +
> + while (index < hsize) {
> + ra_byte = index / 8;
> + ra_bit = index % 8;
> + last = blocknr;
> + blocknr = be64_to_cpu(ip->i_hash_cache.h_ht[index++]);
> + if (blocknr == last)
> + continue;
We don't need to calculate ra_byte and ra_bit in the loop here, since
they are not used until later on...
> + count++;
> + if (count > MAX_RA_BLOCKS)
> + break;
> +
> + /* figure out if we've already requested this block */
> + if ((ip->i_hash_cache.h_ra[ra_byte] >> ra_bit) & 0x01)
> + continue;
If there are multiple readdirs running at the same time (bear in mind
that we are under a read lock here) then this has no locking. I know
that we are only after a hint, so the cost of it being wrong is not
high, but why not use test_and_set_bit() at least?
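For instance, something like this (assuming h_ra were switched to an
unsigned long based bitmap so that the standard bitops apply, and using
the pre-increment index):

	/* one atomic test-and-mark replaces the open-coded byte/bit maths */
	if (test_and_set_bit(index, ip->i_hash_cache.h_ra))
		continue;	/* readahead already issued for this slot */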
The other question I have is: where do you reset the readahead bitmap? Do we
really have to wait for the inode to be pushed out of cache in order to
drop the dir readahead state? If so then I don't think this will cover
all the cases which we want it to.
I think we would be much better off storing the readahead state in the
struct file where it ought to be, so that every caller will use
state that is personal to that file descriptor rather than a global
quantity,
Steve.
> +
> + ip->i_hash_cache.h_ra[ra_byte] |= (1 << ra_bit);
> + bh = gfs2_getbuf(gl, blocknr, 1);
> + if (trylock_buffer(bh)) {
> + if (buffer_uptodate(bh)) {
> + unlock_buffer(bh);
> + brelse(bh);
> + continue;
> + }
> + bh->b_end_io = end_buffer_read_sync;
> + submit_bh(READA | REQ_META, bh);
> + continue;
> + }
> + brelse(bh);
> + }
> +}
>
> /**
> * dir_e_read - Reads the entries from a directory into a filldir buffer
> @@ -1391,20 +1464,23 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
> filldir_t filldir)
> {
> struct gfs2_inode *dip = GFS2_I(inode);
> - u32 hsize, len = 0;
> + u32 hsize, len;
> u32 hash, index;
> __be64 *lp;
> int copied = 0;
> - int error = 0;
> + int error;
> unsigned depth = 0;
>
> hsize = 1 << dip->i_depth;
> hash = gfs2_dir_offset2hash(*offset);
> index = hash >> (32 - dip->i_depth);
>
> - lp = gfs2_dir_get_hash_table(dip);
> - if (IS_ERR(lp))
> - return PTR_ERR(lp);
> + error = gfs2_dir_get_hash_table(dip);
> + if (error)
> + return error;
> +
> + lp = dip->i_hash_cache.h_ht;
> + gfs2_dir_readahead(inode, hsize, index);
>
> while (index < hsize) {
> error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
> @@ -1925,14 +2001,13 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
> u32 index = 0, next_index;
> __be64 *lp;
> u64 leaf_no;
> - int error = 0, last;
> + int error, last;
>
> hsize = 1 << dip->i_depth;
> -
> - lp = gfs2_dir_get_hash_table(dip);
> - if (IS_ERR(lp))
> - return PTR_ERR(lp);
> -
> + error = gfs2_dir_get_hash_table(dip);
> + if (error)
> + return error;
> + lp = dip->i_hash_cache.h_ht;
> while (index < hsize) {
> leaf_no = be64_to_cpu(lp[index]);
> if (leaf_no) {
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 892ac37..731d763 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -271,6 +271,11 @@ enum {
> };
>
>
> +struct gfs2_hash_table {
> + __be64 *h_ht; /* hash table data */
> + u8 *h_ra; /* array indicating whether hash table block was read */
> +};
> +
> struct gfs2_inode {
> struct inode i_inode;
> u64 i_no_addr;
> @@ -285,7 +290,7 @@ struct gfs2_inode {
> u64 i_goal; /* goal block for allocations */
> struct rw_semaphore i_rw_mutex;
> struct list_head i_trunc_list;
> - __be64 *i_hash_cache;
> + struct gfs2_hash_table i_hash_cache;
> u32 i_entries;
> u32 i_diskflags;
> u8 i_height;
> diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
> index 8a139ff..a834b88 100644
> --- a/fs/gfs2/main.c
> +++ b/fs/gfs2/main.c
> @@ -41,7 +41,8 @@ static void gfs2_init_inode_once(void *foo)
> init_rwsem(&ip->i_rw_mutex);
> INIT_LIST_HEAD(&ip->i_trunc_list);
> ip->i_alloc = NULL;
> - ip->i_hash_cache = NULL;
> + ip->i_hash_cache.h_ht = NULL;
> + ip->i_hash_cache.h_ra = NULL;
> }
>
> static void gfs2_init_glock_once(void *foo)
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Bob Peterson @ 2011-10-07 16:01 UTC
To: cluster-devel.redhat.com
Hi,
Thanks for the comments, Steve. Here is another version
that shows the same performance benefit (better, actually).
It's much simpler than the previous one. Instead of keeping
a bitmap, it simply uses a u32 in the gfs2_inode to keep
track of where its last read-ahead left off. That avoids a lot of
the issues you wrote about.
I couldn't use the file struct because of the way function
gfs2_dir_read is called from NFS.
Regards,
Bob Peterson
Red Hat File Systems
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
--
fs/gfs2/dir.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
fs/gfs2/incore.h | 1 +
2 files changed, 51 insertions(+), 0 deletions(-)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..31888bd 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -76,6 +76,8 @@
#define IS_LEAF 1 /* Hashed (leaf) directory */
#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
+#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
+
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
@@ -345,6 +347,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
if (hc)
return hc;
+ ip->i_ra_index = 0;
hsize = 1 << ip->i_depth;
hsize *= sizeof(__be64);
if (hsize != i_size_read(&ip->i_inode)) {
@@ -382,6 +385,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
__be64 *hc = ip->i_hash_cache;
+ ip->i_ra_index = 0;
ip->i_hash_cache = NULL;
kfree(hc);
}
@@ -1377,6 +1381,50 @@ out:
}
+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count;
+
+ /* First check if we've already read-ahead for the whole range. */
+ if (index + MAX_RA_BLOCKS < ip->i_ra_index)
+ return;
+
+ ip->i_ra_index = max(index, ip->i_ra_index);
+ for (count = 0; count < MAX_RA_BLOCKS; count++) {
+ if (ip->i_ra_index >= hsize) /* if exceeded the hash table */
+ break;
+
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache[ip->i_ra_index]);
+ ip->i_ra_index++;
+ if (blocknr == last)
+ continue;
+
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}
+
/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
* @dip: dinode pointer
@@ -1406,6 +1454,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
if (IS_ERR(lp))
return PTR_ERR(lp);
+ gfs2_dir_readahead(inode, hsize, index);
+
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
&copied, &depth,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37..50c3bcb 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -286,6 +286,7 @@ struct gfs2_inode {
struct rw_semaphore i_rw_mutex;
struct list_head i_trunc_list;
__be64 *i_hash_cache;
+ u32 i_ra_index; /* read-ahead index */
u32 i_entries;
u32 i_diskflags;
u8 i_height;
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Steven Whitehouse @ 2011-10-08 11:13 UTC
To: cluster-devel.redhat.com
Hi,
On Fri, 2011-10-07 at 12:01 -0400, Bob Peterson wrote:
> Hi,
>
> Thanks for the comments, Steve. Here is another version
> that shows the same performance benefit (better, actually).
> It's much simpler than the previous one. Instead of keeping
> a bitmap, it simply uses a u32 in the gfs2_inode to keep
> track of where its last read-ahead left off. That avoids a lot of
> the issues you wrote about.
>
> I couldn't use the file struct because of the way function
> gfs2_dir_read is called from NFS.
>
Thanks for fixing that up... it looks much simpler now. It's a pity about
the NFS issue. We still need to figure out how to reset the
readahead index correctly though, but I think we can leave that for a
future patch. One possible solution would be to reset it on
lseek(SEEK_SET, 0) for example, or if the readahead index is miles away
from the actual index,
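For the rewind case, a rough and untested sketch (the gfs2_dir_llseek
name is hypothetical, such a helper doesn't exist yet):

	static loff_t gfs2_dir_llseek(struct file *file, loff_t offset, int origin)
	{
		struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode);

		if (origin == SEEK_SET && offset == 0)
			ip->i_ra_index = 0;	/* restart read-ahead from the top */
		return generic_file_llseek(file, offset, origin);
	}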
Steve.
> Regards,
>
> Bob Peterson
> Red Hat File Systems
>
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> --
> fs/gfs2/dir.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/gfs2/incore.h | 1 +
> 2 files changed, 51 insertions(+), 0 deletions(-)
>
> diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
> index 2045d70..31888bd 100644
> --- a/fs/gfs2/dir.c
> +++ b/fs/gfs2/dir.c
> @@ -76,6 +76,8 @@
> #define IS_LEAF 1 /* Hashed (leaf) directory */
> #define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
>
> +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
> +
> #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
> #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
>
> @@ -345,6 +347,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
> if (hc)
> return hc;
>
> + ip->i_ra_index = 0;
> hsize = 1 << ip->i_depth;
> hsize *= sizeof(__be64);
> if (hsize != i_size_read(&ip->i_inode)) {
> @@ -382,6 +385,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
> void gfs2_dir_hash_inval(struct gfs2_inode *ip)
> {
> __be64 *hc = ip->i_hash_cache;
> + ip->i_ra_index = 0;
> ip->i_hash_cache = NULL;
> kfree(hc);
> }
> @@ -1377,6 +1381,50 @@ out:
> }
>
>
> +/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
> + *
> + * Note: we can't calculate each index like dir_e_read can because we don't
> + * have the leaf, and therefore we don't have the depth, and therefore we
> + * don't have the length. So we have to just read enough ahead to make up
> + * for the loss of information. */
> +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index)
> +{
> + struct gfs2_inode *ip = GFS2_I(inode);
> + struct gfs2_glock *gl = ip->i_gl;
> + struct buffer_head *bh;
> + u64 blocknr = 0, last;
> + unsigned count;
> +
> + /* First check if we've already read-ahead for the whole range. */
> + if (index + MAX_RA_BLOCKS < ip->i_ra_index)
> + return;
> +
> + ip->i_ra_index = max(index, ip->i_ra_index);
> + for (count = 0; count < MAX_RA_BLOCKS; count++) {
> + if (ip->i_ra_index >= hsize) /* if exceeded the hash table */
> + break;
> +
> + last = blocknr;
> + blocknr = be64_to_cpu(ip->i_hash_cache[ip->i_ra_index]);
> + ip->i_ra_index++;
> + if (blocknr == last)
> + continue;
> +
> + bh = gfs2_getbuf(gl, blocknr, 1);
> + if (trylock_buffer(bh)) {
> + if (buffer_uptodate(bh)) {
> + unlock_buffer(bh);
> + brelse(bh);
> + continue;
> + }
> + bh->b_end_io = end_buffer_read_sync;
> + submit_bh(READA | REQ_META, bh);
> + continue;
> + }
> + brelse(bh);
> + }
> +}
> +
> /**
> * dir_e_read - Reads the entries from a directory into a filldir buffer
> * @dip: dinode pointer
> @@ -1406,6 +1454,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
> if (IS_ERR(lp))
> return PTR_ERR(lp);
>
> + gfs2_dir_readahead(inode, hsize, index);
> +
> while (index < hsize) {
> error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
> &copied, &depth,
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 892ac37..50c3bcb 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -286,6 +286,7 @@ struct gfs2_inode {
> struct rw_semaphore i_rw_mutex;
> struct list_head i_trunc_list;
> __be64 *i_hash_cache;
> + u32 i_ra_index; /* read-ahead index */
> u32 i_entries;
> u32 i_diskflags;
> u8 i_height;
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Steven Whitehouse @ 2011-10-10 8:49 UTC
To: cluster-devel.redhat.com
Hi,
On Fri, 2011-10-07 at 12:01 -0400, Bob Peterson wrote:
> Hi,
>
> Thanks for the comments, Steve. Here is another version
> that shows the same performance benefit (better, actually).
> It's much simpler than the previous one. Instead of keeping
> a bitmap, it simply uses a u32 in the gfs2_inode to keep
> track of where its last read-ahead left off. That avoids a lot of
> the issues you wrote about.
>
> I couldn't use the file struct because of the way function
> gfs2_dir_read is called from NFS.
>
Well it is not called directly from NFS though, only from our getname
function. I'd suggest we do something along the following lines:
Create a new structure, something like this:
struct gfs2_dir_ra_state {
/* Readahead state variables... */
};
and then pass that to gfs2_dir_read() in addition to passing the offset.
The getname call is trivial, since it always starts at the beginning and
reads through sequentially, so it just needs a gfs2_dir_ra_state to be
set up to take account of that. The struct gfs2_dir_ra_state can be
embedded into the struct gfs2_file, so that gfs2_readdir() just passes a
pointer to that.
That will allow us to reset the readahead state upon lseek(SEEK_SET, 0);
and also flag whether the accesses become non-sequential.
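Purely as an illustration (the field and member names below are
hypothetical, not a finished design):

	struct gfs2_dir_ra_state {
		u32 ra_index;	/* e.g. next hash table slot to read ahead */
	};

	/* embedded in struct gfs2_file, so gfs2_readdir() could then do: */
	struct gfs2_file *fp = file->private_data;	/* set up in gfs2_open() */
	error = gfs2_dir_read(dir, &offset, dirent, filldir, &fp->f_dir_ra);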
So I think keeping the state in the struct file and passing it to our
gfs2_file_read() function should not be too tricky,
Steve.
> Regards,
>
> Bob Peterson
> Red Hat File Systems
>
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> --
> fs/gfs2/dir.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/gfs2/incore.h | 1 +
> 2 files changed, 51 insertions(+), 0 deletions(-)
>
> diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
> index 2045d70..31888bd 100644
> --- a/fs/gfs2/dir.c
> +++ b/fs/gfs2/dir.c
> @@ -76,6 +76,8 @@
> #define IS_LEAF 1 /* Hashed (leaf) directory */
> #define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
>
> +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
> +
> #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
> #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
>
> @@ -345,6 +347,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
> if (hc)
> return hc;
>
> + ip->i_ra_index = 0;
> hsize = 1 << ip->i_depth;
> hsize *= sizeof(__be64);
> if (hsize != i_size_read(&ip->i_inode)) {
> @@ -382,6 +385,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
> void gfs2_dir_hash_inval(struct gfs2_inode *ip)
> {
> __be64 *hc = ip->i_hash_cache;
> + ip->i_ra_index = 0;
> ip->i_hash_cache = NULL;
> kfree(hc);
> }
> @@ -1377,6 +1381,50 @@ out:
> }
>
>
> +/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
> + *
> + * Note: we can't calculate each index like dir_e_read can because we don't
> + * have the leaf, and therefore we don't have the depth, and therefore we
> + * don't have the length. So we have to just read enough ahead to make up
> + * for the loss of information. */
> +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index)
> +{
> + struct gfs2_inode *ip = GFS2_I(inode);
> + struct gfs2_glock *gl = ip->i_gl;
> + struct buffer_head *bh;
> + u64 blocknr = 0, last;
> + unsigned count;
> +
> + /* First check if we've already read-ahead for the whole range. */
> + if (index + MAX_RA_BLOCKS < ip->i_ra_index)
> + return;
> +
> + ip->i_ra_index = max(index, ip->i_ra_index);
> + for (count = 0; count < MAX_RA_BLOCKS; count++) {
> + if (ip->i_ra_index >= hsize) /* if exceeded the hash table */
> + break;
> +
> + last = blocknr;
> + blocknr = be64_to_cpu(ip->i_hash_cache[ip->i_ra_index]);
> + ip->i_ra_index++;
> + if (blocknr == last)
> + continue;
> +
> + bh = gfs2_getbuf(gl, blocknr, 1);
> + if (trylock_buffer(bh)) {
> + if (buffer_uptodate(bh)) {
> + unlock_buffer(bh);
> + brelse(bh);
> + continue;
> + }
> + bh->b_end_io = end_buffer_read_sync;
> + submit_bh(READA | REQ_META, bh);
> + continue;
> + }
> + brelse(bh);
> + }
> +}
> +
> /**
> * dir_e_read - Reads the entries from a directory into a filldir buffer
> * @dip: dinode pointer
> @@ -1406,6 +1454,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
> if (IS_ERR(lp))
> return PTR_ERR(lp);
>
> + gfs2_dir_readahead(inode, hsize, index);
> +
> while (index < hsize) {
> error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
> &copied, &depth,
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 892ac37..50c3bcb 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -286,6 +286,7 @@ struct gfs2_inode {
> struct rw_semaphore i_rw_mutex;
> struct list_head i_trunc_list;
> __be64 *i_hash_cache;
> + u32 i_ra_index; /* read-ahead index */
> u32 i_entries;
> u32 i_diskflags;
> u8 i_height;
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Bob Peterson @ 2011-10-21 16:53 UTC
To: cluster-devel.redhat.com
----- Original Message -----
| Well it is not called directly from NFS though, only from our getname
| function. I'd suggest we do something along the following lines:
|
| Create a new structure, something like this:
|
| struct gfs2_dir_ra_state {
| /* Readahead state variables... */
| };
|
| and then pass that to gfs2_dir_read() in addition to passing the
| offset.
| The getname call is trivial, since it always starts at the beginning
| and
| reads through sequentially, so it just needs a gfs2_dir_ra_state to
| be
| set up to take account of that. The struct gfs2_dir_ra_state can be
| embedded into the struct gfs2_file, so that gfs2_readdir() just
| passes a
| pointer to that.
|
| That will allow us to reset the readahead state upon lseek(SEEK_SET,
| 0);
| and also flag whether the accesses become non-sequential.
|
| So I think keeping the state in the struct file and passing it to our
| gfs2_file_read() function should not be too tricky,
|
| Steve.
Hi Steve,
Thanks for the feedback. How about something like this (follows):
Instead of creating a new read-ahead structure gfs2_dir_ra_state
I'm making (non-standard) use of vfs's read-ahead struct in the
file.
Regards,
Bob Peterson
Red Hat File System
--
commit 14e64999c89c7e2801d69b9e877f2fa0bf10ad70
Author: Bob Peterson <rpeterso@redhat.com>
Date: Fri Oct 7 11:55:31 2011 -0500
GFS2: Add readahead to sequential directory traversal
This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..30405d4 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -76,6 +76,8 @@
#define IS_LEAF 1 /* Hashed (leaf) directory */
#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
+#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
+
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
@@ -1376,6 +1378,50 @@ out:
return error;
}
+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
+ struct file_ra_state *f_ra)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count;
+
+ /* First check if we've already read-ahead for the whole range. */
+ if (!f_ra || index + MAX_RA_BLOCKS < f_ra->start)
+ return;
+
+ f_ra->start = max((pgoff_t)index, f_ra->start);
+ for (count = 0; count < MAX_RA_BLOCKS; count++) {
+ if (f_ra->start >= hsize) /* if exceeded the hash table */
+ break;
+
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]);
+ f_ra->start++;
+ if (blocknr == last)
+ continue;
+
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}
/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1388,7 +1434,7 @@ out:
*/
static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
u32 hsize, len = 0;
@@ -1402,10 +1448,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
hash = gfs2_dir_offset2hash(*offset);
index = hash >> (32 - dip->i_depth);
+ if (f_ra && dip->i_hash_cache == NULL)
+ f_ra->start = 0;
lp = gfs2_dir_get_hash_table(dip);
if (IS_ERR(lp))
return PTR_ERR(lp);
+ gfs2_dir_readahead(inode, hsize, index, f_ra);
+
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
&copied, &depth,
@@ -1423,7 +1473,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
}
int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1437,7 +1487,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
return 0;
if (dip->i_diskflags & GFS2_DIF_EXHASH)
- return dir_e_read(inode, offset, opaque, filldir);
+ return dir_e_read(inode, offset, opaque, filldir, f_ra);
if (!gfs2_is_stuffed(dip)) {
gfs2_consist_inode(dip);
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index ff5772f..98c960b 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir);
+ filldir_t filldir, struct file_ra_state *f_ra);
extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
const struct gfs2_inode *nip, unsigned int new_type);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index fe9945f..a581cd2 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -118,7 +118,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
if (error)
return error;
- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
+ error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, NULL);
gfs2_glock_dq_uninit(&gh);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 92c3db4..c9cbb5f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -96,7 +96,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
return error;
}
- error = gfs2_dir_read(dir, &offset, dirent, filldir);
+ error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
gfs2_glock_dq_uninit(&d_gh);
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Steven Whitehouse @ 2011-10-24 8:17 UTC
To: cluster-devel.redhat.com
Hi,
That looks much better, but why not create & pass a file_ra_state in the
NFS getname function so that you don't have to check for it being NULL?
Since that function always reads everything up until the name it is
looking for, it is an ideal place to do readahead as the access will
always be sequential.
Otherwise, it looks good to me,
Steve.
On Fri, 2011-10-21 at 12:53 -0400, Bob Peterson wrote:
> ----- Original Message -----
> | Well it is not called directly from NFS though, only from our getname
> | function. I'd suggest we do something along the following lines:
> |
> | Create a new structure, something like this:
> |
> | struct gfs2_dir_ra_state {
> | /* Readahead state variables... */
> | };
> |
> | and then pass that to gfs2_dir_read() in addition to passing the
> | offset.
> | The getname call is trivial, since it always starts at the beginning
> | and
> | reads through sequentially, so it just needs a gfs2_dir_ra_state to
> | be
> | set up to take account of that. The struct gfs2_dir_ra_state can be
> | embedded into the struct gfs2_file, so that gfs2_readdir() just
> | passes a
> | pointer to that.
> |
> | That will allow us to reset the readahead state upon lseek(SEEK_SET,
> | 0);
> | and also flag whether the accesses become non-sequential.
> |
> | So I think keeping the state in the struct file and passing it to our
> | gfs2_file_read() function should not be too tricky,
> |
> | Steve.
>
> Hi Steve,
>
> Thanks for the feedback. How about something like this (follows):
> Instead of creating a new read-ahead structure gfs2_dir_ra_state
> I'm making (non-standard) use of vfs's read-ahead struct in the
> file.
>
> Regards,
>
> Bob Peterson
> Red Hat File System
> --
> commit 14e64999c89c7e2801d69b9e877f2fa0bf10ad70
> Author: Bob Peterson <rpeterso@redhat.com>
> Date: Fri Oct 7 11:55:31 2011 -0500
>
> GFS2: Add readahead to sequential directory traversal
>
> This patch adds read-ahead capability to GFS2's
> directory hash table management. It greatly improves
> performance for some directory operations. For example:
> In one of my file systems that has 1000 directories, each
> of which has 1000 files, time to execute a recursive
> ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
> from 2m2.814s on a stock kernel to 0m45.938s.
>
> diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
> index 2045d70..30405d4 100644
> --- a/fs/gfs2/dir.c
> +++ b/fs/gfs2/dir.c
> @@ -76,6 +76,8 @@
> #define IS_LEAF 1 /* Hashed (leaf) directory */
> #define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
>
> +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
> +
> #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
> #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
>
> @@ -1376,6 +1378,50 @@ out:
> return error;
> }
>
> +/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
> + *
> + * Note: we can't calculate each index like dir_e_read can because we don't
> + * have the leaf, and therefore we don't have the depth, and therefore we
> + * don't have the length. So we have to just read enough ahead to make up
> + * for the loss of information. */
> +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
> + struct file_ra_state *f_ra)
> +{
> + struct gfs2_inode *ip = GFS2_I(inode);
> + struct gfs2_glock *gl = ip->i_gl;
> + struct buffer_head *bh;
> + u64 blocknr = 0, last;
> + unsigned count;
> +
> + /* First check if we've already read-ahead for the whole range. */
> + if (!f_ra || index + MAX_RA_BLOCKS < f_ra->start)
> + return;
> +
> + f_ra->start = max((pgoff_t)index, f_ra->start);
> + for (count = 0; count < MAX_RA_BLOCKS; count++) {
> + if (f_ra->start >= hsize) /* if exceeded the hash table */
> + break;
> +
> + last = blocknr;
> + blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]);
> + f_ra->start++;
> + if (blocknr == last)
> + continue;
> +
> + bh = gfs2_getbuf(gl, blocknr, 1);
> + if (trylock_buffer(bh)) {
> + if (buffer_uptodate(bh)) {
> + unlock_buffer(bh);
> + brelse(bh);
> + continue;
> + }
> + bh->b_end_io = end_buffer_read_sync;
> + submit_bh(READA | REQ_META, bh);
> + continue;
> + }
> + brelse(bh);
> + }
> +}
>
> /**
> * dir_e_read - Reads the entries from a directory into a filldir buffer
> @@ -1388,7 +1434,7 @@ out:
> */
>
> static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
> - filldir_t filldir)
> + filldir_t filldir, struct file_ra_state *f_ra)
> {
> struct gfs2_inode *dip = GFS2_I(inode);
> u32 hsize, len = 0;
> @@ -1402,10 +1448,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
> hash = gfs2_dir_offset2hash(*offset);
> index = hash >> (32 - dip->i_depth);
>
> + if (f_ra && dip->i_hash_cache == NULL)
> + f_ra->start = 0;
> lp = gfs2_dir_get_hash_table(dip);
> if (IS_ERR(lp))
> return PTR_ERR(lp);
>
> + gfs2_dir_readahead(inode, hsize, index, f_ra);
> +
> while (index < hsize) {
> error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
> &copied, &depth,
> @@ -1423,7 +1473,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
> }
>
> int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
> - filldir_t filldir)
> + filldir_t filldir, struct file_ra_state *f_ra)
> {
> struct gfs2_inode *dip = GFS2_I(inode);
> struct gfs2_sbd *sdp = GFS2_SB(inode);
> @@ -1437,7 +1487,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
> return 0;
>
> if (dip->i_diskflags & GFS2_DIF_EXHASH)
> - return dir_e_read(inode, offset, opaque, filldir);
> + return dir_e_read(inode, offset, opaque, filldir, f_ra);
>
> if (!gfs2_is_stuffed(dip)) {
> gfs2_consist_inode(dip);
> diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
> index ff5772f..98c960b 100644
> --- a/fs/gfs2/dir.h
> +++ b/fs/gfs2/dir.h
> @@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
> const struct gfs2_inode *ip);
> extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
> extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
> - filldir_t filldir);
> + filldir_t filldir, struct file_ra_state *f_ra);
> extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
> const struct gfs2_inode *nip, unsigned int new_type);
>
> diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
> index fe9945f..a581cd2 100644
> --- a/fs/gfs2/export.c
> +++ b/fs/gfs2/export.c
> @@ -118,7 +118,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
> if (error)
> return error;
>
> - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
> + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, NULL);
>
> gfs2_glock_dq_uninit(&gh);
>
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 92c3db4..c9cbb5f 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -96,7 +96,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
> return error;
> }
>
> - error = gfs2_dir_read(dir, &offset, dirent, filldir);
> + error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
>
> gfs2_glock_dq_uninit(&d_gh);
>
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Bob Peterson @ 2011-10-24 19:56 UTC
To: cluster-devel.redhat.com
----- Original Message -----
| Hi,
|
| That looks much better, but why not create & pass a file_ra_state in
| the
| NFS getname function so that you don't have to check for it being
| NULL?
| Since that function always reads everything up until the name it is
| looking for, it is an ideal place to do readahead as the access will
| always be sequential.
|
| Otherwise, it looks good to me,
|
| Steve.
Hi,
There isn't a file struct passed in to the nfs get_name function, but
I can do something like the following. Is this what you mean?
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index a581cd2..70ba891 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
struct gfs2_holder gh;
u64 offset = 0;
int error;
+ struct file_ra_state f_ra = { .start = 0 };
if (!dir)
return -EINVAL;
@@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
if (error)
return error;
- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, NULL);
+ error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);
gfs2_glock_dq_uninit(&gh);
Regards,
Bob Peterson
Red Hat File Systems
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
From: Steven Whitehouse @ 2011-10-25 7:48 UTC
To: cluster-devel.redhat.com
Hi,
On Mon, 2011-10-24 at 15:56 -0400, Bob Peterson wrote:
> ----- Original Message -----
> | Hi,
> |
> | That looks much better, but why not create & pass a file_ra_state in
> | the
> | NFS getname function so that you don't have to check for it being
> | NULL?
> | Since that function always reads everything up until the name it is
> | looking for, it is an ideal place to do readahead as the access will
> | always be sequential.
> |
> | Otherwise, it looks good to me,
> |
> | Steve.
>
> Hi,
>
> There isn't a file struct passed in to the nfs get_name function, but
> I can do something like the following. Is this what you mean?
>
> diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
> index a581cd2..70ba891 100644
> --- a/fs/gfs2/export.c
> +++ b/fs/gfs2/export.c
> @@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
> struct gfs2_holder gh;
> u64 offset = 0;
> int error;
> + struct file_ra_state f_ra = { .start = 0 };
>
> if (!dir)
> return -EINVAL;
> @@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
> if (error)
> return error;
>
> - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, NULL);
> + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);
>
> gfs2_glock_dq_uninit(&gh);
>
> Regards,
>
> Bob Peterson
> Red Hat File Systems
Yes, that's the kind of thing. Looks like we can use
file_ra_state_init() since it is exported too,
Steve.
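
For reference, Steve's suggestion amounts to initializing the on-stack readahead
state with the exported helper rather than relying only on a designated
initializer. A minimal sketch of how that could look inside gfs2_get_name(),
assuming the two-argument form of file_ra_state_init() (readahead state plus an
address_space); the use of dir->i_mapping here is an assumption made purely for
illustration, not something taken from the thread:

    struct file_ra_state f_ra = { .start = 0 };

    /* Hypothetical variant: keep ->start zeroed (it is reused as the hash
     * table cursor by gfs2_dir_readahead()) and let the exported helper
     * fill in the rest of the readahead state. dir->i_mapping is an
     * illustrative choice of mapping, not taken from the patch. */
    file_ra_state_init(&f_ra, dir->i_mapping);

    error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);

The rest of gfs2_get_name() would be unchanged; the only difference from the
posted hunk is the extra helper call.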
^ permalink raw reply [flat|nested] 12+ messages in thread
* [Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
2011-10-25 7:48 ` Steven Whitehouse
@ 2011-10-27 16:16 ` Bob Peterson
0 siblings, 0 replies; 12+ messages in thread
From: Bob Peterson @ 2011-10-27 16:16 UTC (permalink / raw)
To: cluster-devel.redhat.com
----- Original Message -----
| Hi,
|
| > Hi,
| >
| > There isn't a file struct passed in to the nfs get_name function,
| > but
| > I can do something like the following. Is this what you mean?
| >
| > diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
| > index a581cd2..70ba891 100644
| > --- a/fs/gfs2/export.c
| > +++ b/fs/gfs2/export.c
| > @@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent,
| > char *name,
| > struct gfs2_holder gh;
| > u64 offset = 0;
| > int error;
| > + struct file_ra_state f_ra = { .start = 0 };
| >
| > if (!dir)
| > return -EINVAL;
| > @@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent,
| > char *name,
| > if (error)
| > return error;
| >
| > - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir,
| > NULL);
| > + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir,
| > &f_ra);
| >
| > gfs2_glock_dq_uninit(&gh);
| >
| > Regards,
| >
| > Bob Peterson
| > Red Hat File Systems
|
| Yes, that's the kind of thing. Looks like we can use
| file_ra_state_init() since it is exported too,
|
| Steve.
Hi,
I'd rather not call file_ra_state_init in this case.
Here is my latest patch for upstream GFS2 then.
Regards,
Bob Peterson
Red Hat File Systems
--
commit 17c4434902790347b4dd6abfdc158aa5a443f91b
Author: Bob Peterson <rpeterso@redhat.com>
Date: Fri Oct 7 11:55:31 2011 -0500
GFS2: Add readahead to sequential directory traversal
This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..30405d4 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -76,6 +76,8 @@
#define IS_LEAF 1 /* Hashed (leaf) directory */
#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
+#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
+
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
@@ -1376,6 +1378,50 @@ out:
return error;
}
+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
+ struct file_ra_state *f_ra)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count;
+
+ /* First check if we've already read-ahead for the whole range. */
+ if (!f_ra || index + MAX_RA_BLOCKS < f_ra->start)
+ return;
+
+ f_ra->start = max((pgoff_t)index, f_ra->start);
+ for (count = 0; count < MAX_RA_BLOCKS; count++) {
+ if (f_ra->start >= hsize) /* if exceeded the hash table */
+ break;
+
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]);
+ f_ra->start++;
+ if (blocknr == last)
+ continue;
+
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}
/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1388,7 +1434,7 @@ out:
*/
static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
u32 hsize, len = 0;
@@ -1402,10 +1448,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
hash = gfs2_dir_offset2hash(*offset);
index = hash >> (32 - dip->i_depth);
+ if (f_ra && dip->i_hash_cache == NULL)
+ f_ra->start = 0;
lp = gfs2_dir_get_hash_table(dip);
if (IS_ERR(lp))
return PTR_ERR(lp);
+ gfs2_dir_readahead(inode, hsize, index, f_ra);
+
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
&copied, &depth,
@@ -1423,7 +1473,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
}
int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir)
+ filldir_t filldir, struct file_ra_state *f_ra)
{
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1437,7 +1487,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
return 0;
if (dip->i_diskflags & GFS2_DIF_EXHASH)
- return dir_e_read(inode, offset, opaque, filldir);
+ return dir_e_read(inode, offset, opaque, filldir, f_ra);
if (!gfs2_is_stuffed(dip)) {
gfs2_consist_inode(dip);
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index ff5772f..98c960b 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir);
+ filldir_t filldir, struct file_ra_state *f_ra);
extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
const struct gfs2_inode *nip, unsigned int new_type);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index fe9945f..70ba891 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
struct gfs2_holder gh;
u64 offset = 0;
int error;
+ struct file_ra_state f_ra = { .start = 0 };
if (!dir)
return -EINVAL;
@@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
if (error)
return error;
- error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
+ error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);
gfs2_glock_dq_uninit(&gh);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 92c3db4..c9cbb5f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -96,7 +96,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
return error;
}
- error = gfs2_dir_read(dir, &offset, dirent, filldir);
+ error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
gfs2_glock_dq_uninit(&d_gh);
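
Taken together, the f_ra handling above turns f_ra->start into a cursor over the
directory hash table: dir_e_read() resets it only when the hash table is not yet
cached, and gfs2_dir_readahead() returns early when the cursor is already well
ahead of the current index, otherwise advancing it by up to MAX_RA_BLOCKS slots
per call, so repeated readdir calls never re-issue read-ahead for the same
slots. A small, self-contained userspace sketch of just that cursor arithmetic
(names are illustrative, plain unsigned stands in for pgoff_t, and no I/O is
issued):

#include <stdio.h>

#define MAX_RA_BLOCKS 32	/* same window size as the patch */

/* Mimics the cursor handling in gfs2_dir_readahead(): 'start' plays the
 * role of f_ra->start and is carried across calls. Returns how many
 * hash-table slots this call would cover with read-ahead. The real code
 * additionally skips duplicate leaf blocks and already-uptodate buffers. */
static unsigned readahead_window(unsigned *start, unsigned index, unsigned hsize)
{
	unsigned count, covered = 0;

	/* Already read ahead well past this index: nothing to do. */
	if (index + MAX_RA_BLOCKS < *start)
		return 0;

	if (*start < index)		/* max(index, f_ra->start) */
		*start = index;

	for (count = 0; count < MAX_RA_BLOCKS; count++) {
		if (*start >= hsize)	/* exceeded the hash table */
			break;
		(*start)++;
		covered++;
	}
	return covered;
}

int main(void)
{
	unsigned start = 0, index;
	const unsigned hsize = 100;	/* pretend hash table size */

	/* Simulate dir_e_read() being re-entered at increasing indices. */
	for (index = 0; index < hsize; index += 10) {
		unsigned covered = readahead_window(&start, index, hsize);
		printf("index=%3u -> covers %2u slots (cursor now %u)\n",
		       index, covered, start);
	}
	return 0;
}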
^ permalink raw reply related [flat|nested] 12+ messages in thread