From mboxrd@z Thu Jan 1 00:00:00 1970 From: Knut Petersen Subject: Re: [PATCH 1/1 2.6.13] framebuffer: bit_putcs() optimization for 8x* fonts Date: Wed, 31 Aug 2005 14:46:35 +0200 Message-ID: <4315A6AB.5090108@t-online.de> References: <43148610.70406@t-online.de> <43149E5B.7040006@t-online.de> <4314DD2E.7060901@t-online.de> Reply-To: linux-fbdev-devel@lists.sourceforge.net Mime-Version: 1.0 Content-Transfer-Encoding: quoted-printable Return-path: Received: from sc8-sf-mx1-b.sourceforge.net ([10.3.1.91] helo=mail.sourceforge.net) by sc8-sf-list1.sourceforge.net with esmtp (Exim 4.30) id 1EARwK-0008Lo-P6 for linux-fbdev-devel@lists.sourceforge.net; Wed, 31 Aug 2005 05:43:40 -0700 Received: from mailout08.sul.t-online.com ([194.25.134.20]) by mail.sourceforge.net with esmtp (Exim 4.44) id 1EARwK-0002nn-1H for linux-fbdev-devel@lists.sourceforge.net; Wed, 31 Aug 2005 05:43:40 -0700 In-Reply-To: Sender: linux-fbdev-devel-admin@lists.sourceforge.net Errors-To: linux-fbdev-devel-admin@lists.sourceforge.net List-Unsubscribe: , List-Id: List-Post: List-Help: List-Subscribe: , List-Archive: Content-Type: text/plain; charset="iso-8859-1"; format="flowed" To: Andrew Morton Cc: Roman Zippel , linux-fbdev-devel@lists.sourceforge.net, "Antonino A. Daplas" , Linux Kernel Development , Jochen Hein , Geert Uytterhoeven >Something like below, which has the advantange that there is still only=20 >one implementation of the function > True, that=B4s a great advantage. > and if it's still slower, we really need to check the compiler > =20 > Please have a look at the following patch. It takes your idea of=20 inlining but moves the special cases into the macro, speeding things up for the very likely=20 case of s_pitch =3D=3D 1 and the less likely case of s_pitch of 2. Treating s_pit= ch=20 =3D=3D 2 special gives a still significant performance improvement of more than 10 % for=20 16x30 fonts. This way also bit_putcs looks better again ... 
Andrew, as this way is better than and still as fast as my first=20 approach I think framebuffer-bit_putcs-optimization-for-8x.patch should be reverted and th= e following patch should be applied instead. Antonino, Roman, Geert, do you agree? cu, knut diff -uprN -X linux/Documentation/dontdiff -x '*.bak' -x '*.ctx' linuxori= g/drivers/video/console/bitblit.c linux/drivers/video/console/bitblit.c --- linuxorig/drivers/video/console/bitblit.c 2005-08-29 01:41:01.0000000= 00 +0200 +++ linux/drivers/video/console/bitblit.c 2005-08-31 10:06:22.000000000 += 0200 @@ -175,7 +175,7 @@ static void bit_putcs(struct vc_data *vc src =3D buf; } =20 - fb_pad_aligned_buffer(dst, pitch, src, idx, image.height); + __fb_pad_aligned_buffer(dst, pitch, src, idx, image.height); dst +=3D width; } } diff -uprN -X linux/Documentation/dontdiff -x '*.bak' -x '*.ctx' linuxori= g/drivers/video/fbmem.c linux/drivers/video/fbmem.c --- linuxorig/drivers/video/fbmem.c 2005-08-29 01:41:01.000000000 +0200 +++ linux/drivers/video/fbmem.c 2005-08-31 13:36:16.000000000 +0200 @@ -80,15 +80,7 @@ EXPORT_SYMBOL(fb_get_color_depth); */ void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u= 32 height) { - int i, j; - - for (i =3D height; i--; ) { - /* s_pitch is a few bytes at the most, memcpy is suboptimal */ - for (j =3D 0; j < s_pitch; j++) - dst[j] =3D src[j]; - src +=3D s_pitch; - dst +=3D d_pitch; - } + __fb_pad_aligned_buffer(dst, d_pitch, src, s_pitch, height); } EXPORT_SYMBOL(fb_pad_aligned_buffer); =20 diff -uprN -X linux/Documentation/dontdiff -x '*.bak' -x '*.ctx' linuxori= g/include/linux/fb.h linux/include/linux/fb.h --- linuxorig/include/linux/fb.h 2005-08-29 01:41:01.000000000 +0200 +++ linux/include/linux/fb.h 2005-08-31 12:45:08.000000000 +0200 @@ -824,6 +824,38 @@ extern int fb_get_color_depth(struct fb_ extern int fb_get_options(char *name, char **option); extern int fb_new_modelist(struct fb_info *info); =20 + +/* + * Don't change without testing performance of 
framebuffer + * bitblitting. Inlining is necessary for performance reasons. + * Although the code might not _look_ fast because of some + * multiplications, it really _is_ fast as it is easier for gcc + * to optimize well. + */ + +static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src= ,=20 + u32 s_pitch, u32 height) +{ + int i, j; + + if (likely(s_pitch=3D=3D1)) + for(i=3D0; i < height; i++) + dst[d_pitch*i] =3D src[i]; + else if (s_pitch=3D=3D2) + for(i=3D0; i < height; i++) { + *(u16 *)dst =3D ((u16 *)src)[i]; + dst +=3D d_pitch; + } + else { + d_pitch -=3D s_pitch; + for (i =3D height; i--; ) { + for (j =3D 0; j < s_pitch; j++) + *dst++ =3D *src++; + dst +=3D d_pitch; + } + } +} + extern struct fb_info *registered_fb[FB_MAX]; extern int num_registered_fb; =20 ------------------------------------------------------- SF.Net email is Sponsored by the Better Software Conference & EXPO September 19-22, 2005 * San Francisco, CA * Development Lifecycle Practic= es Agile & Plan-Driven Development * Managing Projects & Teams * Testing & Q= A Security * Process Improvement & Measurement * http://www.sqe.com/bsce5sf From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S964779AbVHaMnp (ORCPT ); Wed, 31 Aug 2005 08:43:45 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S964781AbVHaMnp (ORCPT ); Wed, 31 Aug 2005 08:43:45 -0400 Received: from mailout08.sul.t-online.com ([194.25.134.20]:40873 "EHLO mailout08.sul.t-online.com") by vger.kernel.org with ESMTP id S964779AbVHaMno (ORCPT ); Wed, 31 Aug 2005 08:43:44 -0400 Message-ID: <4315A6AB.5090108@t-online.de> Date: Wed, 31 Aug 2005 14:46:35 +0200 From: Knut Petersen User-Agent: Mozilla/5.0 (X11; U; Linux i686; de-AT; rv:1.7.7) Gecko/20050414 X-Accept-Language: de, en MIME-Version: 1.0 To: Andrew Morton CC: Roman Zippel , linux-fbdev-devel@lists.sourceforge.net, "Antonino A. 
Daplas" , Linux Kernel Development , Jochen Hein , Geert Uytterhoeven Subject: Re: [Linux-fbdev-devel] [PATCH 1/1 2.6.13] framebuffer: bit_putcs() optimization for 8x* fonts References: <43148610.70406@t-online.de> <43149E5B.7040006@t-online.de> <4314DD2E.7060901@t-online.de> In-Reply-To: X-Enigmail-Version: 0.86.0.0 X-Enigmail-Supports: pgp-inline, pgp-mime Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 8bit X-ID: r1nJcwZLQefk4mlst25LyUti4CZcuSF16fi3jkRKsDl7p7I41e1R4n@t-dialin.net X-TOI-MSGID: 45818708-13f4-42c1-b0ec-d20e6da22fda Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org >Something like below, which has the advantage that there is still only >one implementation of the function > True, that's a great advantage. > and if it's still slower, we really need to check the compiler > > Please have a look at the following patch. It takes your idea of inlining but moves the special cases into the macro, speeding things up for the very likely case of s_pitch == 1 and the less likely case of s_pitch of 2. Treating s_pitch == 2 special gives a still significant performance improvement of more than 10 % for 16x30 fonts. This way also bit_putcs looks better again ... Andrew, as this way is better than and still as fast as my first approach I think framebuffer-bit_putcs-optimization-for-8x.patch should be reverted and the following patch should be applied instead. Antonino, Roman, Geert, do you agree? 
cu, knut diff -uprN -X linux/Documentation/dontdiff -x '*.bak' -x '*.ctx' linuxorig/drivers/video/console/bitblit.c linux/drivers/video/console/bitblit.c --- linuxorig/drivers/video/console/bitblit.c 2005-08-29 01:41:01.000000000 +0200 +++ linux/drivers/video/console/bitblit.c 2005-08-31 10:06:22.000000000 +0200 @@ -175,7 +175,7 @@ static void bit_putcs(struct vc_data *vc src = buf; } - fb_pad_aligned_buffer(dst, pitch, src, idx, image.height); + __fb_pad_aligned_buffer(dst, pitch, src, idx, image.height); dst += width; } } diff -uprN -X linux/Documentation/dontdiff -x '*.bak' -x '*.ctx' linuxorig/drivers/video/fbmem.c linux/drivers/video/fbmem.c --- linuxorig/drivers/video/fbmem.c 2005-08-29 01:41:01.000000000 +0200 +++ linux/drivers/video/fbmem.c 2005-08-31 13:36:16.000000000 +0200 @@ -80,15 +80,7 @@ EXPORT_SYMBOL(fb_get_color_depth); */ void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height) { - int i, j; - - for (i = height; i--; ) { - /* s_pitch is a few bytes at the most, memcpy is suboptimal */ - for (j = 0; j < s_pitch; j++) - dst[j] = src[j]; - src += s_pitch; - dst += d_pitch; - } + __fb_pad_aligned_buffer(dst, d_pitch, src, s_pitch, height); } EXPORT_SYMBOL(fb_pad_aligned_buffer); diff -uprN -X linux/Documentation/dontdiff -x '*.bak' -x '*.ctx' linuxorig/include/linux/fb.h linux/include/linux/fb.h --- linuxorig/include/linux/fb.h 2005-08-29 01:41:01.000000000 +0200 +++ linux/include/linux/fb.h 2005-08-31 12:45:08.000000000 +0200 @@ -824,6 +824,38 @@ extern int fb_get_color_depth(struct fb_ extern int fb_get_options(char *name, char **option); extern int fb_new_modelist(struct fb_info *info); + +/* + * Don't change without testing performance of framebuffer + * bitblitting. Inlining is necessary for performance reasons. + * Although the code might not _look_ fast because of some + * multiplications, it really _is_ fast as it is easier for gcc + * to optimize well. 
+ */ + +static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, + u32 s_pitch, u32 height) +{ + int i, j; + + if (likely(s_pitch==1)) + for(i=0; i < height; i++) + dst[d_pitch*i] = src[i]; + else if (s_pitch==2) + for(i=0; i < height; i++) { + *(u16 *)dst = ((u16 *)src)[i]; + dst += d_pitch; + } + else { + d_pitch -= s_pitch; + for (i = height; i--; ) { + for (j = 0; j < s_pitch; j++) + *dst++ = *src++; + dst += d_pitch; + } + } +} + extern struct fb_info *registered_fb[FB_MAX]; extern int num_registered_fb;