All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] zlib: Optimize inffast when copying direct from output
@ 2009-11-12  9:04 Joakim Tjernlund
  0 siblings, 0 replies; 8+ messages in thread
From: Joakim Tjernlund @ 2009-11-12  9:04 UTC (permalink / raw)
  To: roel kluin, akpm, Richard Purdie, linux-kernel; +Cc: Joakim Tjernlund

JFFS2 uses a lower compression ratio, and inflate always
ends up in the "copy direct from output" case.
This patch tries to optimize the direct copy procedure.
It uses get_unaligned(), but only in one place.
The copy loop just above this one can also use this
optimization, but I haven't done so as I have not tested whether
it is a win there too.
On my MPC8321 this is about 17% faster on my JFFS2 root FS
than the original.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
---

 This version replaces all previous versions.
 Changes:
 - Fix alignment check (Roel Kluin)
 - Fix problem for LE targets.

 arch/powerpc/boot/Makefile |    4 ++-
 lib/zlib_inflate/inffast.c |   55 +++++++++++++++++++++++++++++++++++--------
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 9ae7b7e..98e4c4f 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -20,7 +20,7 @@
 all: $(obj)/zImage
 
 BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-		 -fno-strict-aliasing -Os -msoft-float -pipe \
+		 -fno-strict-aliasing -Os -msoft-float -pipe -D__KERNEL__\
 		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
 		 -isystem $(shell $(CROSS32CC) -print-file-name=include)
 BOOTAFLAGS	:= -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc
@@ -34,6 +34,8 @@ BOOTCFLAGS	+= -fno-stack-protector
 endif
 
 BOOTCFLAGS	+= -I$(obj) -I$(srctree)/$(obj)
+BOOTCFLAGS	+= -include include/linux/autoconf.h -Iarch/powerpc/include
+BOOTCFLAGS	+= -Iinclude
 
 DTS_FLAGS	?= -p 1024
 
diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 8550b0c..c6740ae 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -4,6 +4,8 @@
  */
 
 #include <linux/zutil.h>
+#include <asm/unaligned.h>
+#include <asm/byteorder.h>
 #include "inftrees.h"
 #include "inflate.h"
 #include "inffast.h"
@@ -24,9 +26,11 @@
 #ifdef POSTINC
 #  define OFF 0
 #  define PUP(a) *(a)++
+#  define UP_UNALIGNED(a) get_unaligned((a)++)
 #else
 #  define OFF 1
 #  define PUP(a) *++(a)
+#  define UP_UNALIGNED(a) get_unaligned(++(a))
 #endif
 
 /*
@@ -239,18 +243,47 @@ void inflate_fast(z_streamp strm, unsigned start)
                     }
                 }
                 else {
+		    unsigned short *sout;
+		    unsigned long loops;
+
                     from = out - dist;          /* copy direct from output */
-                    do {                        /* minimum length is three */
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        len -= 3;
-                    } while (len > 2);
-                    if (len) {
-                        PUP(out) = PUP(from);
-                        if (len > 1)
-                            PUP(out) = PUP(from);
-                    }
+                    /* minimum length is three */
+		    /* Align out addr */
+		    if (!((long)(out - 1 + OFF) & 1)) {
+			PUP(out) = PUP(from);
+			len--;
+		    }
+		    sout = (unsigned short *)(out - OFF);
+		    if (dist > 2 ) {
+			unsigned short *sfrom;
+
+			sfrom = (unsigned short *)(from - OFF);
+			loops = len >> 1;
+			do
+			    PUP(sout) = UP_UNALIGNED(sfrom);
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+			from = (unsigned char *)sfrom + OFF;
+		    } else { /* dist == 1 or dist == 2 */
+			unsigned short pat16;
+
+			pat16 = *(sout-2+2*OFF);
+			if (dist == 1)
+#if defined(__BIG_ENDIAN)
+			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
+#elif defined(__LITTLE_ENDIAN)
+			    pat16 = (pat16 & 0xff00) | ((pat16 & 0xff00 ) >> 8);
+#else
+#error __BIG_ENDIAN nor __LITTLE_ENDIAN is defined
+#endif
+			loops = len >> 1;
+			do
+			    PUP(sout) = pat16;
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+		    }
+		    if (len & 1)
+			PUP(out) = PUP(from);
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */
-- 
1.6.4.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread
* [PATCH] zlib: Optimize inffast when copying direct from output
@ 2009-11-10  9:03 Joakim Tjernlund
  0 siblings, 0 replies; 8+ messages in thread
From: Joakim Tjernlund @ 2009-11-10  9:03 UTC (permalink / raw)
  To: linux-mtd; +Cc: Joakim Tjernlund

JFFS2 uses a lower compression ratio, and inflate always
ends up in the "copy direct from output" case.
This patch tries to optimize the direct copy procedure.
It uses get_unaligned(), but only in one place.
The copy loop just above this one can also use this
optimization, but I haven't done so as I have not tested whether
it is a win there too.
On my MPC8321 this is about 17% faster on my JFFS2 root FS
than the original.
---

 Would like some testing of the PowerPC boot wrapper and
 a LE target before sending it upstream.

 arch/powerpc/boot/Makefile |    4 ++-
 lib/zlib_inflate/inffast.c |   48 +++++++++++++++++++++++++++++++++----------
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 9ae7b7e..98e4c4f 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -20,7 +20,7 @@
 all: $(obj)/zImage
 
 BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-		 -fno-strict-aliasing -Os -msoft-float -pipe \
+		 -fno-strict-aliasing -Os -msoft-float -pipe -D__KERNEL__\
 		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
 		 -isystem $(shell $(CROSS32CC) -print-file-name=include)
 BOOTAFLAGS	:= -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc
@@ -34,6 +34,8 @@ BOOTCFLAGS	+= -fno-stack-protector
 endif
 
 BOOTCFLAGS	+= -I$(obj) -I$(srctree)/$(obj)
+BOOTCFLAGS	+= -include include/linux/autoconf.h -Iarch/powerpc/include
+BOOTCFLAGS	+= -Iinclude
 
 DTS_FLAGS	?= -p 1024
 
diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 8550b0c..0c7fa3d 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/zutil.h>
+#include <asm/unaligned.h>
 #include "inftrees.h"
 #include "inflate.h"
 #include "inffast.h"
@@ -24,9 +25,11 @@
 #ifdef POSTINC
 #  define OFF 0
 #  define PUP(a) *(a)++
+#  define UP_UNALIGNED(a) get_unaligned((a)++)
 #else
 #  define OFF 1
 #  define PUP(a) *++(a)
+#  define UP_UNALIGNED(a) get_unaligned(++(a))
 #endif
 
 /*
@@ -239,18 +242,41 @@ void inflate_fast(z_streamp strm, unsigned start)
                     }
                 }
                 else {
+		    unsigned short *sout;
+		    unsigned long loops;
+
                     from = out - dist;          /* copy direct from output */
-                    do {                        /* minimum length is three */
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        len -= 3;
-                    } while (len > 2);
-                    if (len) {
-                        PUP(out) = PUP(from);
-                        if (len > 1)
-                            PUP(out) = PUP(from);
-                    }
+                    /* minimum length is three */
+		    /* Align out addr */
+		    if (!((long)(out - 1 + OFF)) & 1) {
+			PUP(out) = PUP(from);
+			len--;
+		    }
+		    sout = (unsigned short *)(out - OFF);
+		    if (dist > 2 ) {
+			unsigned short *sfrom;
+
+			sfrom = (unsigned short *)(from - OFF);
+			loops = len >> 1;
+			do
+			    PUP(sout) = UP_UNALIGNED(sfrom);
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+			from = (unsigned char *)sfrom + OFF;
+		    } else { /* dist == 1 or dist == 2 */
+			unsigned short pat16;
+
+			pat16 = *(sout-2+2*OFF);
+			if (dist == 1)
+			    pat16 = (pat16 & 0xff) | ((pat16 & 0xff ) << 8);
+			loops = len >> 1;
+			do
+			    PUP(sout) = pat16;
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+		    }
+		    if (len & 1)
+			PUP(out) = PUP(from);
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */
-- 
1.6.4.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread
* [PATCH] zlib: Optimize inffast when copying direct from output
@ 2009-11-08  9:53 Joakim Tjernlund
  0 siblings, 0 replies; 8+ messages in thread
From: Joakim Tjernlund @ 2009-11-08  9:53 UTC (permalink / raw)
  To: akpm, Richard Purdie, linux-kernel; +Cc: Joakim Tjernlund

JFFS2 uses a lower compression ratio, and inflate always
ends up in the "copy direct from output" case.
This patch tries to optimize the copy procedure for
architectures that have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined.
On my MPC8321 this is about 14% faster on my JFFS2 root FS
than the original.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
---
 lib/zlib_inflate/inffast.c |   35 +++++++++++++++++++++++++++++++++++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 8550b0c..0588fbf 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -240,6 +240,40 @@ void inflate_fast(z_streamp strm, unsigned start)
                 }
                 else {
                     from = out - dist;          /* copy direct from output */
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+                    /* minimum length is three */
+		    if (dist > 2 ) {
+			unsigned short *sout = (unsigned short *)(out - OFF);
+			unsigned short *sfrom = (unsigned short *)(from - OFF);
+			unsigned long loops = len >> 1;
+
+			do
+			    PUP(sout) = PUP(sfrom);
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+			from = (unsigned char *)sfrom + OFF;
+			if (len & 1)
+			    PUP(out) = PUP(from);
+		    } else if (dist == 2) {
+			unsigned short *sout = (unsigned short *)(out - OFF);
+			unsigned short pat16;
+			unsigned long loops = len >> 1;
+
+			pat16 = *(sout-2+2*OFF);
+			do
+			    PUP(sout) = pat16;
+			while (--loops);
+			out = (unsigned char *)sout + OFF;
+			if (len & 1)
+			    PUP(out) = PUP(from);
+		    } else {
+			unsigned char pat8 = *(out - 1 + OFF);
+
+			do {
+			    PUP(out) = pat8;
+			} while (--len);
+		    }
+#else
                     do {                        /* minimum length is three */
                         PUP(out) = PUP(from);
                         PUP(out) = PUP(from);
@@ -251,6 +285,7 @@ void inflate_fast(z_streamp strm, unsigned start)
                         if (len > 1)
                             PUP(out) = PUP(from);
                     }
+#endif
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */
-- 
1.6.4.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2009-11-26  9:06 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <Received>
2009-11-10  9:00 ` [PATCH] zlib: Optimize inffast when copying direct from output Joakim Tjernlund
2009-11-24  3:12   ` Benjamin Herrenschmidt
2009-11-26  8:30     ` Joakim Tjernlund
2009-11-26  8:46       ` Benjamin Herrenschmidt
2009-11-26  9:02         ` Joakim Tjernlund
2009-11-12  9:04 Joakim Tjernlund
  -- strict thread matches above, loose matches on Subject: below --
2009-11-10  9:03 Joakim Tjernlund
2009-11-08  9:53 Joakim Tjernlund

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.