cluster-devel.redhat.com archive mirror
* [Cluster-devel] [GFS2] Faster gfs2_bitfit algorithm
@ 2008-03-08  5:12 Bob Peterson
  2008-03-10  8:38 ` Steven Whitehouse
  0 siblings, 1 reply; 4+ messages in thread
From: Bob Peterson @ 2008-03-08  5:12 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

This version of the gfs2_bitfit algorithm is up to four
times faster than its predecessor.

Regards,

Bob Peterson

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
--
 fs/gfs2/rgrp.c |   79 +++++++++++++++++++++++++++++++++----------------------
 1 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 4291375..4dee88b 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -33,6 +33,16 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
 
+#if BITS_PER_LONG == 32
+#define LBITMASK   (unsigned long)(0x55555555)
+#define LBITSKIP55 (unsigned long)(0x55555555)
+#define LBITSKIP00 (unsigned long)(0x00000000)
+#else
+#define LBITMASK   (unsigned long)(0x5555555555555555)
+#define LBITSKIP55 (unsigned long)(0x5555555555555555)
+#define LBITSKIP00 (unsigned long)(0x0000000000000000)
+#endif
+
 /*
  * These routines are used by the resource group routines (rgrp.c)
  * to keep track of block allocation.  Each block is represented by two
@@ -138,43 +148,48 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
 static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
 		       u8 old_state)
 {
-	const u8 *byte;
-	u32 blk = goal;
-	unsigned int bit, bitlong;
-	const unsigned long *plong;
-#if BITS_PER_LONG == 32
-	const unsigned long plong55 = 0x55555555;
-#else
-	const unsigned long plong55 = 0x5555555555555555;
-#endif
-
-	byte = buffer + (goal / GFS2_NBBY);
-	plong = (const unsigned long *)(buffer + (goal / GFS2_NBBY));
-	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-	bitlong = bit;
-
-	while (byte < buffer + buflen) {
-
-		if (bitlong == 0 && old_state == 0 && *plong == plong55) {
-			plong++;
-			byte += sizeof(unsigned long);
-			blk += sizeof(unsigned long) * GFS2_NBBY;
-			continue;
+	const u8 *byte, *start, *end;
+	int bit, startbit;
+	u32 g1, g2, misaligned;
+	unsigned long *plong, lskipval;
+	unsigned long lskipvals[4] = {LBITSKIP55, LBITSKIP00,
+				      LBITSKIP55, LBITSKIP00};
+
+	lskipval = lskipvals[old_state];
+	g1 = (goal / GFS2_NBBY);
+	start = buffer + g1;
+	byte = start;
+        end = buffer + buflen;
+	g2 = ALIGN(g1, sizeof(unsigned long));
+	plong = (unsigned long *)(buffer + g2);
+	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+	misaligned = g2 - g1;
+	while (byte < end) {
+
+		if (bit == 0 && !misaligned) {
+			if (((*plong) & LBITMASK) == lskipval) {
+				plong++;
+				byte += sizeof(unsigned long);
+				continue;
+			}
+		}
+		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
+			return goal +
+				(((byte - start) * GFS2_NBBY) +
+				 ((bit - startbit) >> 1));
 		}
-		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
-			return blk;
 		bit += GFS2_BIT_SIZE;
-		if (bit >= 8) {
+		if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
 			bit = 0;
 			byte++;
+			/* If we were misaligned, adjust the counter */
+			if (misaligned)
+				misaligned--;
+			else { /* If we were aligned, we're not anymore. */
+				misaligned += sizeof(unsigned long) - 1;
+				plong++;
+			}
 		}
-		bitlong += GFS2_BIT_SIZE;
-		if (bitlong >= sizeof(unsigned long) * 8) {
-			bitlong = 0;
-			plong++;
-		}
-
-		blk++;
 	}
 
 	return BFITNOENT;





* [Cluster-devel] [GFS2] Faster gfs2_bitfit algorithm
  2008-03-08  5:12 [Cluster-devel] [GFS2] Faster gfs2_bitfit algorithm Bob Peterson
@ 2008-03-10  8:38 ` Steven Whitehouse
  0 siblings, 0 replies; 4+ messages in thread
From: Steven Whitehouse @ 2008-03-10  8:38 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On Fri, 2008-03-07 at 23:12 -0600, Bob Peterson wrote:
> Hi,
> 
> This version of the gfs2_bitfit algorithm is up to four
> times faster than its predecessor.
> 
> Regards,
> 
> Bob Peterson
> 
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> --
>  fs/gfs2/rgrp.c |   79 +++++++++++++++++++++++++++++++++----------------------
>  1 files changed, 47 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index 4291375..4dee88b 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -33,6 +33,16 @@
>  #define BFITNOENT ((u32)~0)
>  #define NO_BLOCK ((u64)~0)
>  
> +#if BITS_PER_LONG == 32
> +#define LBITMASK   (unsigned long)(0x55555555)
> +#define LBITSKIP55 (unsigned long)(0x55555555)
> +#define LBITSKIP00 (unsigned long)(0x00000000)
> +#else
> +#define LBITMASK   (unsigned long)(0x5555555555555555)
> +#define LBITSKIP55 (unsigned long)(0x5555555555555555)
> +#define LBITSKIP00 (unsigned long)(0x0000000000000000)
> +#endif
> +
You can just use the UL suffix rather than a cast. Also, if you'd keyed
this off sizeof(unsigned long) == 4 rather than the kernel-only
BITS_PER_LONG, it would have worked for your userspace tests as well.
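
(i.e. something like this, as a sketch -- the first form is the plain
UL-suffix version; the second is just one possible way to pick the
constant without BITS_PER_LONG so a userspace test harness can build
the same code:)

  #if BITS_PER_LONG == 32
  #define LBITSKIP55 0x55555555UL
  #else
  #define LBITSKIP55 0x5555555555555555UL
  #endif

  /* or, avoiding the kernel-only BITS_PER_LONG altogether: */
  static const unsigned long lbitskip55 =
          (sizeof(unsigned long) == 4) ? 0x55555555UL
                                       : (unsigned long)0x5555555555555555ULL;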

>  /*
>   * These routines are used by the resource group routines (rgrp.c)
>   * to keep track of block allocation.  Each block is represented by two
> @@ -138,43 +148,48 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
>  static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
>  		       u8 old_state)
>  {
> -	const u8 *byte;
> -	u32 blk = goal;
> -	unsigned int bit, bitlong;
> -	const unsigned long *plong;
> -#if BITS_PER_LONG == 32
> -	const unsigned long plong55 = 0x55555555;
> -#else
> -	const unsigned long plong55 = 0x5555555555555555;
> -#endif
> -
> -	byte = buffer + (goal / GFS2_NBBY);
> -	plong = (const unsigned long *)(buffer + (goal / GFS2_NBBY));
> -	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
> -	bitlong = bit;
> -
> -	while (byte < buffer + buflen) {
> -
> -		if (bitlong == 0 && old_state == 0 && *plong == plong55) {
> -			plong++;
> -			byte += sizeof(unsigned long);
> -			blk += sizeof(unsigned long) * GFS2_NBBY;
> -			continue;
> +	const u8 *byte, *start, *end;
> +	int bit, startbit;
> +	u32 g1, g2, misaligned;
> +	unsigned long *plong, lskipval;
> +	unsigned long lskipvals[4] = {LBITSKIP55, LBITSKIP00,
> +				      LBITSKIP55, LBITSKIP00};
> +
It looks like you don't actually need an array here, just a
conditional based upon one of the bits of "old_state"? Also I think
lskipval can be marked const.
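
(Something like this, as a sketch -- essentially what the follow-up
patch below ends up doing, minus the const:)

	const unsigned long lskipval = (old_state & GFS2_BLKST_USED) ?
				       LBITSKIP00 : LBITSKIP55;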

> +	lskipval = lskipvals[old_state];
> +	g1 = (goal / GFS2_NBBY);
> +	start = buffer + g1;
> +	byte = start;
> +        end = buffer + buflen;
> +	g2 = ALIGN(g1, sizeof(unsigned long));
> +	plong = (unsigned long *)(buffer + g2);
> +	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
> +	misaligned = g2 - g1;
> +	while (byte < end) {
> +
> +		if (bit == 0 && !misaligned) {
> +			if (((*plong) & LBITMASK) == lskipval) {
> +				plong++;
> +				byte += sizeof(unsigned long);
> +				continue;
> +			}
> +		}
Now this is the bit I'm most worried about. We ought to be able to get
rid of the test for misaligned simply by testing for that once at the
start of the loop and jumping either to the "unsigned long at a time" or
"byte at a time" part of the loop. It should also be possible to get rid
of the per-iteration increment of the byte counter by recomputing it
from the plong pointer once the loop terminates, for whatever reason.

As per the note in the bz, it ought to be possible to only have two
tests in this loop: have we hit the end of the buffer, and have we found
what we are looking for.
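
(Roughly this shape, as a sketch: scan aligned words with only the
end-of-buffer test and the mismatch test, then derive "byte" from
"plong" when that loop exits and fall back to the byte-at-a-time scan:)

	while ((const u8 *)plong < end) {
		if ((*plong & LBITMASK) != lskipval)
			break;		/* a candidate bit lives in this word */
		plong++;
	}
	byte = (const u8 *)plong;	/* no per-iteration byte counter needed */
	/* ...continue byte at a time from here */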

Did you try out using prefetch(), and/or unrolling the loop by testing
two (or more) unsigned longs at once? See <linux/prefetch.h>
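
(For instance, unrolled by two with a prefetch() one pair ahead --
purely illustrative, untested:)

	/* stop short of the end so the prefetch address stays inside
	   the buffer; the remainder is handled by the byte scan */
	while ((const u8 *)(plong + 2) < end) {
		prefetch(plong + 2);
		if ((plong[0] & LBITMASK) != lskipval ||
		    (plong[1] & LBITMASK) != lskipval)
			break;
		plong += 2;
	}
	/* finish the last couple of words byte by byte */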

Having said that, the results you are seeing are a significant
improvement on what was there before, and I think this should make a
worthwhile difference to the speed of our block allocation,

Steve.





* [Cluster-devel] [GFS2] Faster gfs2_bitfit algorithm
@ 2008-03-10 23:17 Bob Peterson
  2008-03-11 10:33 ` Steven Whitehouse
  0 siblings, 1 reply; 4+ messages in thread
From: Bob Peterson @ 2008-03-10 23:17 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

This version of the gfs2_bitfit algorithm includes the latest
suggestions from Steve Whitehouse.  It is typically eight to
ten times faster than the version we're using today.  If there
is a lot of metadata mixed in (lots of small files) the
algorithm is often 15 times faster, and given the right
conditions, I've seen peaks of 20 times faster.

Regards,

Bob Peterson

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
--
 fs/gfs2/rgrp.c |   93 ++++++++++++++++++++++++++++++++++++-------------------
 1 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 4291375..7e8f0b1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/prefetch.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -33,6 +34,16 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
 
+#if BITS_PER_LONG == 32
+#define LBITMASK   (0x55555555UL)
+#define LBITSKIP55 (0x55555555UL)
+#define LBITSKIP00 (0x00000000UL)
+#else
+#define LBITMASK   (0x5555555555555555UL)
+#define LBITSKIP55 (0x5555555555555555UL)
+#define LBITSKIP00 (0x0000000000000000UL)
+#endif
+
 /*
  * These routines are used by the resource group routines (rgrp.c)
  * to keep track of block allocation.  Each block is represented by two
@@ -138,45 +149,63 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
 static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
 		       u8 old_state)
 {
-	const u8 *byte;
-	u32 blk = goal;
-	unsigned int bit, bitlong;
-	const unsigned long *plong;
-#if BITS_PER_LONG == 32
-	const unsigned long plong55 = 0x55555555;
-#else
-	const unsigned long plong55 = 0x5555555555555555;
-#endif
-
-	byte = buffer + (goal / GFS2_NBBY);
-	plong = (const unsigned long *)(buffer + (goal / GFS2_NBBY));
-	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-	bitlong = bit;
-
-	while (byte < buffer + buflen) {
-
-		if (bitlong == 0 && old_state == 0 && *plong == plong55) {
-			plong++;
-			byte += sizeof(unsigned long);
-			blk += sizeof(unsigned long) * GFS2_NBBY;
-			continue;
+	const u8 *byte, *start, *end;
+	int bit, startbit;
+	u32 g1, g2, misaligned;
+	unsigned long *plong;
+	unsigned long lskipval;
+
+	lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
+	g1 = (goal / GFS2_NBBY);
+	start = buffer + g1;
+	byte = start;
+        end = buffer + buflen;
+	g2 = ALIGN(g1, sizeof(unsigned long));
+	plong = (unsigned long *)(buffer + g2);
+	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+	misaligned = g2 - g1;
+	if (!misaligned)
+		goto ulong_aligned;
+/* parse the bitmap a byte at a time */
+misaligned:
+	while (byte < end) {
+		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
+			return goal +
+				(((byte - start) * GFS2_NBBY) +
+				 ((bit - startbit) >> 1));
 		}
-		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
-			return blk;
 		bit += GFS2_BIT_SIZE;
-		if (bit >= 8) {
+		if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
 			bit = 0;
 			byte++;
+			misaligned--;
+			if (!misaligned) {
+				plong = (unsigned long *)byte;
+				goto ulong_aligned;
+			}
 		}
-		bitlong += GFS2_BIT_SIZE;
-		if (bitlong >= sizeof(unsigned long) * 8) {
-			bitlong = 0;
-			plong++;
-		}
-
-		blk++;
 	}
+	return BFITNOENT;
 
+/* parse the bitmap a unsigned long at a time */
+ulong_aligned:
+	/* Stop at "end - 1" or else prefetch can go past the end and segfault.
+	   We could "if" it but we'd lose some of the performance gained.
+	   This way will only slow down searching the very last 4/8 bytes
+	   depending on architecture.  I've experimented with several ways
+	   of writing this section such as using an else before the goto
+	   but this one seems to be the fastest. */
+	while ((unsigned char *)plong < end - 1) {
+		prefetch(plong + 1);
+		if (((*plong) & LBITMASK) != lskipval)
+			break;
+		plong++;
+	}
+	if ((unsigned char *)plong < end) {
+		byte = (const u8 *)plong;
+		misaligned += sizeof(unsigned long) - 1;
+		goto misaligned;
+	}
 	return BFITNOENT;
 }
 





* [Cluster-devel] [GFS2] Faster gfs2_bitfit algorithm
  2008-03-10 23:17 Bob Peterson
@ 2008-03-11 10:33 ` Steven Whitehouse
  0 siblings, 0 replies; 4+ messages in thread
From: Steven Whitehouse @ 2008-03-11 10:33 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

That's a worthwhile improvement. Now in the -nmw git tree. Thanks,

Steve.

On Mon, 2008-03-10 at 18:17 -0500, Bob Peterson wrote:
> Hi,
> 
> This version of the gfs2_bitfit algorithm includes the latest
> suggestions from Steve Whitehouse.  It is typically eight to
> ten times faster than the version we're using today.  If there
> is a lot of metadata mixed in (lots of small files) the
> algorithm is often 15 times faster, and given the right
> conditions, I've seen peaks of 20 times faster.
> 
> Regards,
> 
> Bob Peterson
> 
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> --
>  fs/gfs2/rgrp.c |   93 ++++++++++++++++++++++++++++++++++++-------------------
>  1 files changed, 61 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index 4291375..7e8f0b1 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -14,6 +14,7 @@
>  #include <linux/fs.h>
>  #include <linux/gfs2_ondisk.h>
>  #include <linux/lm_interface.h>
> +#include <linux/prefetch.h>
>  
>  #include "gfs2.h"
>  #include "incore.h"
> @@ -33,6 +34,16 @@
>  #define BFITNOENT ((u32)~0)
>  #define NO_BLOCK ((u64)~0)
>  
> +#if BITS_PER_LONG == 32
> +#define LBITMASK   (0x55555555UL)
> +#define LBITSKIP55 (0x55555555UL)
> +#define LBITSKIP00 (0x00000000UL)
> +#else
> +#define LBITMASK   (0x5555555555555555UL)
> +#define LBITSKIP55 (0x5555555555555555UL)
> +#define LBITSKIP00 (0x0000000000000000UL)
> +#endif
> +
>  /*
>   * These routines are used by the resource group routines (rgrp.c)
>   * to keep track of block allocation.  Each block is represented by two
> @@ -138,45 +149,63 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
>  static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
>  		       u8 old_state)
>  {
> -	const u8 *byte;
> -	u32 blk = goal;
> -	unsigned int bit, bitlong;
> -	const unsigned long *plong;
> -#if BITS_PER_LONG == 32
> -	const unsigned long plong55 = 0x55555555;
> -#else
> -	const unsigned long plong55 = 0x5555555555555555;
> -#endif
> -
> -	byte = buffer + (goal / GFS2_NBBY);
> -	plong = (const unsigned long *)(buffer + (goal / GFS2_NBBY));
> -	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
> -	bitlong = bit;
> -
> -	while (byte < buffer + buflen) {
> -
> -		if (bitlong == 0 && old_state == 0 && *plong == plong55) {
> -			plong++;
> -			byte += sizeof(unsigned long);
> -			blk += sizeof(unsigned long) * GFS2_NBBY;
> -			continue;
> +	const u8 *byte, *start, *end;
> +	int bit, startbit;
> +	u32 g1, g2, misaligned;
> +	unsigned long *plong;
> +	unsigned long lskipval;
> +
> +	lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
> +	g1 = (goal / GFS2_NBBY);
> +	start = buffer + g1;
> +	byte = start;
> +        end = buffer + buflen;
> +	g2 = ALIGN(g1, sizeof(unsigned long));
> +	plong = (unsigned long *)(buffer + g2);
> +	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
> +	misaligned = g2 - g1;
> +	if (!misaligned)
> +		goto ulong_aligned;
> +/* parse the bitmap a byte at a time */
> +misaligned:
> +	while (byte < end) {
> +		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
> +			return goal +
> +				(((byte - start) * GFS2_NBBY) +
> +				 ((bit - startbit) >> 1));
>  		}
> -		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
> -			return blk;
>  		bit += GFS2_BIT_SIZE;
> -		if (bit >= 8) {
> +		if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
>  			bit = 0;
>  			byte++;
> +			misaligned--;
> +			if (!misaligned) {
> +				plong = (unsigned long *)byte;
> +				goto ulong_aligned;
> +			}
>  		}
> -		bitlong += GFS2_BIT_SIZE;
> -		if (bitlong >= sizeof(unsigned long) * 8) {
> -			bitlong = 0;
> -			plong++;
> -		}
> -
> -		blk++;
>  	}
> +	return BFITNOENT;
>  
> +/* parse the bitmap a unsigned long at a time */
> +ulong_aligned:
> +	/* Stop at "end - 1" or else prefetch can go past the end and segfault.
> +	   We could "if" it but we'd lose some of the performance gained.
> +	   This way will only slow down searching the very last 4/8 bytes
> +	   depending on architecture.  I've experimented with several ways
> +	   of writing this section such as using an else before the goto
> +	   but this one seems to be the fastest. */
> +	while ((unsigned char *)plong < end - 1) {
> +		prefetch(plong + 1);
> +		if (((*plong) & LBITMASK) != lskipval)
> +			break;
> +		plong++;
> +	}
> +	if ((unsigned char *)plong < end) {
> +		byte = (const u8 *)plong;
> +		misaligned += sizeof(unsigned long) - 1;
> +		goto misaligned;
> +	}
>  	return BFITNOENT;
>  }
>  
> 
> 



