* [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
@ 2022-02-17 10:34 ` Thomas Zimmermann
0 siblings, 0 replies; 20+ messages in thread
From: Thomas Zimmermann @ 2022-02-17 10:34 UTC (permalink / raw)
To: daniel, deller, javierm, geert; +Cc: linux-fbdev, Thomas Zimmermann, dri-devel
Improve the performance of sys_imageblit() by manually unrolling
the inner blitting loop and moving some invariants out. The compiler
failed to do this automatically. The resulting binary code was even
slower than the cfb_imageblit() helper, which uses the same algorithm,
but operates on I/O memory.
A microbenchmark measures the average number of CPU cycles
for sys_imageblit() after a stabilizing period of a few minutes
(i7-4790, FullHD, simpledrm, kernel with debugging). The value
for CFB is given as a reference.
sys_imageblit(), new: 25934 cycles
sys_imageblit(), old: 35944 cycles
cfb_imageblit(): 30566 cycles
In the optimized case, sys_imageblit() is now ~30% faster than before
and ~20% faster than cfb_imageblit().
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
---
drivers/video/fbdev/core/sysimgblt.c | 51 +++++++++++++++++++++-------
1 file changed, 39 insertions(+), 12 deletions(-)
diff --git a/drivers/video/fbdev/core/sysimgblt.c b/drivers/video/fbdev/core/sysimgblt.c
index a4d05b1b17d7..d70d65af6fcb 100644
--- a/drivers/video/fbdev/core/sysimgblt.c
+++ b/drivers/video/fbdev/core/sysimgblt.c
@@ -188,23 +188,32 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
{
u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
- u32 bit_mask, end_mask, eorx, shift;
+ u32 bit_mask, eorx;
const char *s = image->data, *src;
u32 *dst;
- const u32 *tab = NULL;
- int i, j, k;
+ const u32 *tab;
+ size_t tablen;
+ u32 colortab[16];
+ int i, j, k, jdecr;
+
+ if ((uintptr_t)dst1 % 8)
+ return;
switch (bpp) {
case 8:
tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le;
+ tablen = 16;
break;
case 16:
tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
+ tablen = 4;
break;
case 32:
- default:
tab = cfb_tab32;
+ tablen = 2;
break;
+ default:
+ return;
}
for (i = ppw-1; i--; ) {
@@ -217,19 +226,37 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
bit_mask = (1 << ppw) - 1;
eorx = fgx ^ bgx;
k = image->width/ppw;
+ jdecr = 8 / ppw;
+
+ for (i = 0; i < tablen; ++i)
+ colortab[i] = (tab[i] & eorx) ^ bgx;
for (i = image->height; i--; ) {
dst = dst1;
- shift = 8;
src = s;
- for (j = k; j--; ) {
- shift -= ppw;
- end_mask = tab[(*src >> shift) & bit_mask];
- *dst++ = (end_mask & eorx) ^ bgx;
- if (!shift) {
- shift = 8;
- src++;
+ for (j = k; j; j -= jdecr, ++src) {
+ switch (ppw) {
+ case 4: /* 8 bpp */
+ *dst++ = colortab[(*src >> 4) & bit_mask];
+ *dst++ = colortab[(*src >> 0) & bit_mask];
+ break;
+ case 2: /* 16 bpp */
+ *dst++ = colortab[(*src >> 6) & bit_mask];
+ *dst++ = colortab[(*src >> 4) & bit_mask];
+ *dst++ = colortab[(*src >> 2) & bit_mask];
+ *dst++ = colortab[(*src >> 0) & bit_mask];
+ break;
+ case 1: /* 32 bpp */
+ *dst++ = colortab[(*src >> 7) & bit_mask];
+ *dst++ = colortab[(*src >> 6) & bit_mask];
+ *dst++ = colortab[(*src >> 5) & bit_mask];
+ *dst++ = colortab[(*src >> 4) & bit_mask];
+ *dst++ = colortab[(*src >> 3) & bit_mask];
+ *dst++ = colortab[(*src >> 2) & bit_mask];
+ *dst++ = colortab[(*src >> 1) & bit_mask];
+ *dst++ = colortab[(*src >> 0) & bit_mask];
+ break;
}
}
dst1 += p->fix.line_length;
--
2.34.1
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
2022-02-17 10:34 ` Thomas Zimmermann
@ 2022-02-17 11:05 ` Gerd Hoffmann
-1 siblings, 0 replies; 20+ messages in thread
From: Gerd Hoffmann @ 2022-02-17 11:05 UTC (permalink / raw)
To: Thomas Zimmermann; +Cc: daniel, deller, javierm, geert, linux-fbdev, dri-devel
> - for (j = k; j--; ) {
> - shift -= ppw;
> - end_mask = tab[(*src >> shift) & bit_mask];
> - *dst++ = (end_mask & eorx) ^ bgx;
> - if (!shift) {
> - shift = 8;
> - src++;
> + for (j = k; j; j -= jdecr, ++src) {
> + switch (ppw) {
> + case 4: /* 8 bpp */
> + *dst++ = colortab[(*src >> 4) & bit_mask];
> + *dst++ = colortab[(*src >> 0) & bit_mask];
> + break;
> + case 2: /* 16 bpp */
> + *dst++ = colortab[(*src >> 6) & bit_mask];
> + *dst++ = colortab[(*src >> 4) & bit_mask];
> + *dst++ = colortab[(*src >> 2) & bit_mask];
> + *dst++ = colortab[(*src >> 0) & bit_mask];
> + break;
> + case 1: /* 32 bpp */
> + *dst++ = colortab[(*src >> 7) & bit_mask];
> + *dst++ = colortab[(*src >> 6) & bit_mask];
> + *dst++ = colortab[(*src >> 5) & bit_mask];
> + *dst++ = colortab[(*src >> 4) & bit_mask];
> + *dst++ = colortab[(*src >> 3) & bit_mask];
> + *dst++ = colortab[(*src >> 2) & bit_mask];
> + *dst++ = colortab[(*src >> 1) & bit_mask];
> + *dst++ = colortab[(*src >> 0) & bit_mask];
> + break;
> }
How about moving the switch out of the loop, i.e.
switch (ppw) {
case 4:
for (j = ...) {
*dst++ = colortab[(*src >> 4) & bit_mask];
*dst++ = colortab[(*src >> 0) & bit_mask];
}
[ ... ]
}
?
take care,
Gerd
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
@ 2022-02-17 11:05 ` Gerd Hoffmann
0 siblings, 0 replies; 20+ messages in thread
From: Gerd Hoffmann @ 2022-02-17 11:05 UTC (permalink / raw)
To: Thomas Zimmermann; +Cc: linux-fbdev, deller, javierm, dri-devel, geert
> - for (j = k; j--; ) {
> - shift -= ppw;
> - end_mask = tab[(*src >> shift) & bit_mask];
> - *dst++ = (end_mask & eorx) ^ bgx;
> - if (!shift) {
> - shift = 8;
> - src++;
> + for (j = k; j; j -= jdecr, ++src) {
> + switch (ppw) {
> + case 4: /* 8 bpp */
> + *dst++ = colortab[(*src >> 4) & bit_mask];
> + *dst++ = colortab[(*src >> 0) & bit_mask];
> + break;
> + case 2: /* 16 bpp */
> + *dst++ = colortab[(*src >> 6) & bit_mask];
> + *dst++ = colortab[(*src >> 4) & bit_mask];
> + *dst++ = colortab[(*src >> 2) & bit_mask];
> + *dst++ = colortab[(*src >> 0) & bit_mask];
> + break;
> + case 1: /* 32 bpp */
> + *dst++ = colortab[(*src >> 7) & bit_mask];
> + *dst++ = colortab[(*src >> 6) & bit_mask];
> + *dst++ = colortab[(*src >> 5) & bit_mask];
> + *dst++ = colortab[(*src >> 4) & bit_mask];
> + *dst++ = colortab[(*src >> 3) & bit_mask];
> + *dst++ = colortab[(*src >> 2) & bit_mask];
> + *dst++ = colortab[(*src >> 1) & bit_mask];
> + *dst++ = colortab[(*src >> 0) & bit_mask];
> + break;
> }
How about moving the switch out of the loop, i.e.
switch (ppw) {
case 4:
for (j = ...) {
*dst++ = colortab[(*src >> 4) & bit_mask];
*dst++ = colortab[(*src >> 0) & bit_mask];
}
[ ... ]
}
?
take care,
Gerd
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
2022-02-17 11:05 ` Gerd Hoffmann
@ 2022-02-17 12:08 ` Thomas Zimmermann
-1 siblings, 0 replies; 20+ messages in thread
From: Thomas Zimmermann @ 2022-02-17 12:08 UTC (permalink / raw)
To: Gerd Hoffmann; +Cc: daniel, deller, javierm, geert, linux-fbdev, dri-devel
[-- Attachment #1.1: Type: text/plain, Size: 1912 bytes --]
Hi
Am 17.02.22 um 12:05 schrieb Gerd Hoffmann:
>> - for (j = k; j--; ) {
>> - shift -= ppw;
>> - end_mask = tab[(*src >> shift) & bit_mask];
>> - *dst++ = (end_mask & eorx) ^ bgx;
>> - if (!shift) {
>> - shift = 8;
>> - src++;
>> + for (j = k; j; j -= jdecr, ++src) {
>> + switch (ppw) {
>> + case 4: /* 8 bpp */
>> + *dst++ = colortab[(*src >> 4) & bit_mask];
>> + *dst++ = colortab[(*src >> 0) & bit_mask];
>> + break;
>> + case 2: /* 16 bpp */
>> + *dst++ = colortab[(*src >> 6) & bit_mask];
>> + *dst++ = colortab[(*src >> 4) & bit_mask];
>> + *dst++ = colortab[(*src >> 2) & bit_mask];
>> + *dst++ = colortab[(*src >> 0) & bit_mask];
>> + break;
>> + case 1: /* 32 bpp */
>> + *dst++ = colortab[(*src >> 7) & bit_mask];
>> + *dst++ = colortab[(*src >> 6) & bit_mask];
>> + *dst++ = colortab[(*src >> 5) & bit_mask];
>> + *dst++ = colortab[(*src >> 4) & bit_mask];
>> + *dst++ = colortab[(*src >> 3) & bit_mask];
>> + *dst++ = colortab[(*src >> 2) & bit_mask];
>> + *dst++ = colortab[(*src >> 1) & bit_mask];
>> + *dst++ = colortab[(*src >> 0) & bit_mask];
>> + break;
>> }
>
> How about moving the switch out of the loop, i.e.
>
> switch (ppw) {
> case 4:
> for (j = ...) {
> *dst++ = colortab[(*src >> 4) & bit_mask];
> *dst++ = colortab[(*src >> 0) & bit_mask];
> }
> [ ... ]
> }
>
> ?
No difference. Values for the microbenchmark (rdtsc around
sys_imageblit()) and the directory listing stabilize at the same
numbers. I'll still go with your suggestion, because the code is more
readable.
Best regards
Thomas
>
> take care,
> Gerd
>
--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 840 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
@ 2022-02-17 12:08 ` Thomas Zimmermann
0 siblings, 0 replies; 20+ messages in thread
From: Thomas Zimmermann @ 2022-02-17 12:08 UTC (permalink / raw)
To: Gerd Hoffmann; +Cc: linux-fbdev, deller, javierm, dri-devel, geert
[-- Attachment #1.1: Type: text/plain, Size: 1912 bytes --]
Hi
Am 17.02.22 um 12:05 schrieb Gerd Hoffmann:
>> - for (j = k; j--; ) {
>> - shift -= ppw;
>> - end_mask = tab[(*src >> shift) & bit_mask];
>> - *dst++ = (end_mask & eorx) ^ bgx;
>> - if (!shift) {
>> - shift = 8;
>> - src++;
>> + for (j = k; j; j -= jdecr, ++src) {
>> + switch (ppw) {
>> + case 4: /* 8 bpp */
>> + *dst++ = colortab[(*src >> 4) & bit_mask];
>> + *dst++ = colortab[(*src >> 0) & bit_mask];
>> + break;
>> + case 2: /* 16 bpp */
>> + *dst++ = colortab[(*src >> 6) & bit_mask];
>> + *dst++ = colortab[(*src >> 4) & bit_mask];
>> + *dst++ = colortab[(*src >> 2) & bit_mask];
>> + *dst++ = colortab[(*src >> 0) & bit_mask];
>> + break;
>> + case 1: /* 32 bpp */
>> + *dst++ = colortab[(*src >> 7) & bit_mask];
>> + *dst++ = colortab[(*src >> 6) & bit_mask];
>> + *dst++ = colortab[(*src >> 5) & bit_mask];
>> + *dst++ = colortab[(*src >> 4) & bit_mask];
>> + *dst++ = colortab[(*src >> 3) & bit_mask];
>> + *dst++ = colortab[(*src >> 2) & bit_mask];
>> + *dst++ = colortab[(*src >> 1) & bit_mask];
>> + *dst++ = colortab[(*src >> 0) & bit_mask];
>> + break;
>> }
>
> How about moving the switch out of the loop, i.e.
>
> switch (ppw) {
> case 4:
> for (j = ...) {
> *dst++ = colortab[(*src >> 4) & bit_mask];
> *dst++ = colortab[(*src >> 0) & bit_mask];
> }
> [ ... ]
> }
>
> ?
No difference. Values for the microbenchmark (rdtsc around
sys_imageblit()) and the directory listing stabilize at the same
numbers. I'll still go with your suggestion, because the code is more
readable.
Best regards
Thomas
>
> take care,
> Gerd
>
--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 840 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
2022-02-17 10:34 ` Thomas Zimmermann
@ 2022-02-18 9:24 ` Javier Martinez Canillas
-1 siblings, 0 replies; 20+ messages in thread
From: Javier Martinez Canillas @ 2022-02-18 9:24 UTC (permalink / raw)
To: Thomas Zimmermann, daniel, deller, geert; +Cc: dri-devel, linux-fbdev
Hello Thomas,
On 2/17/22 11:34, Thomas Zimmermann wrote:
> Improve the performance of sys_imageblit() by manually unrolling
> the inner blitting loop and moving some invariants out. The compiler
> failed to do this automatically. The resulting binary code was even
> slower than the cfb_imageblit() helper, which uses the same algorithm,
> but operates on I/O memory.
>
> A microbenchmark measures the average number of CPU cycles
> for sys_imageblit() after a stabilizing period of a few minutes
> (i7-4790, FullHD, simpledrm, kernel with debugging). The value
> for CFB is given as a reference.
>
> sys_imageblit(), new: 25934 cycles
> sys_imageblit(), old: 35944 cycles
> cfb_imageblit(): 30566 cycles
>
> In the optimized case, sys_imageblit() is now ~30% faster than before
> and ~20% faster than cfb_imageblit().
>
> Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
> ---
This patch looks good to me as well.
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Best regards,
--
Javier Martinez Canillas
Linux Engineering
Red Hat
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
@ 2022-02-18 9:24 ` Javier Martinez Canillas
0 siblings, 0 replies; 20+ messages in thread
From: Javier Martinez Canillas @ 2022-02-18 9:24 UTC (permalink / raw)
To: Thomas Zimmermann, daniel, deller, geert; +Cc: linux-fbdev, dri-devel
Hello Thomas,
On 2/17/22 11:34, Thomas Zimmermann wrote:
> Improve the performance of sys_imageblit() by manually unrolling
> the inner blitting loop and moving some invariants out. The compiler
> failed to do this automatically. The resulting binary code was even
> slower than the cfb_imageblit() helper, which uses the same algorithm,
> but operates on I/O memory.
>
> A microbenchmark measures the average number of CPU cycles
> for sys_imageblit() after a stabilizing period of a few minutes
> (i7-4790, FullHD, simpledrm, kernel with debugging). The value
> for CFB is given as a reference.
>
> sys_imageblit(), new: 25934 cycles
> sys_imageblit(), old: 35944 cycles
> cfb_imageblit(): 30566 cycles
>
> In the optimized case, sys_imageblit() is now ~30% faster than before
> and ~20% faster than cfb_imageblit().
>
> Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
> ---
This patch looks good to me as well.
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Best regards,
--
Javier Martinez Canillas
Linux Engineering
Red Hat
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
2022-02-17 10:34 ` Thomas Zimmermann
@ 2022-02-18 10:14 ` Sam Ravnborg
-1 siblings, 0 replies; 20+ messages in thread
From: Sam Ravnborg @ 2022-02-18 10:14 UTC (permalink / raw)
To: Thomas Zimmermann; +Cc: daniel, deller, javierm, geert, linux-fbdev, dri-devel
Hi Thomas,
On Thu, Feb 17, 2022 at 11:34:05AM +0100, Thomas Zimmermann wrote:
> Improve the performance of sys_imageblit() by manually unrolling
> the inner blitting loop and moving some invariants out. The compiler
> failed to do this automatically. The resulting binary code was even
> slower than the cfb_imageblit() helper, which uses the same algorithm,
> but operates on I/O memory.
It would be super to have the same optimization done to cfb_imageblit(),
to prevent the two codebases from diverging more than necessary.
Also I think cfb_ version would also see a performance gain from this.
The actual implementation looks good.
So with or without the extra un-rolling the patch is:
Acked-by: Sam Ravnborg <sam@ravnborg.org>
One small nit below.
Sam
>
> A microbenchmark measures the average number of CPU cycles
> for sys_imageblit() after a stabilizing period of a few minutes
> (i7-4790, FullHD, simpledrm, kernel with debugging). The value
> for CFB is given as a reference.
>
> sys_imageblit(), new: 25934 cycles
> sys_imageblit(), old: 35944 cycles
> cfb_imageblit(): 30566 cycles
>
> In the optimized case, sys_imageblit() is now ~30% faster than before
> and ~20% faster than cfb_imageblit().
>
> Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
> ---
> drivers/video/fbdev/core/sysimgblt.c | 51 +++++++++++++++++++++-------
> 1 file changed, 39 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/video/fbdev/core/sysimgblt.c b/drivers/video/fbdev/core/sysimgblt.c
> index a4d05b1b17d7..d70d65af6fcb 100644
> --- a/drivers/video/fbdev/core/sysimgblt.c
> +++ b/drivers/video/fbdev/core/sysimgblt.c
> @@ -188,23 +188,32 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
> {
> u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
> u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
> - u32 bit_mask, end_mask, eorx, shift;
> + u32 bit_mask, eorx;
> const char *s = image->data, *src;
> u32 *dst;
> - const u32 *tab = NULL;
> - int i, j, k;
> + const u32 *tab;
> + size_t tablen;
> + u32 colortab[16];
> + int i, j, k, jdecr;
> +
> + if ((uintptr_t)dst1 % 8)
> + return;
This check is new - and should not trigger ever. Maybe add an unlikely
and a WARN_ON_ONCE()?
>
> switch (bpp) {
> case 8:
> tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le;
> + tablen = 16;
> break;
> case 16:
> tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
> + tablen = 4;
> break;
> case 32:
> - default:
> tab = cfb_tab32;
> + tablen = 2;
> break;
> + default:
> + return;
> }
>
> for (i = ppw-1; i--; ) {
> @@ -217,19 +226,37 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
> bit_mask = (1 << ppw) - 1;
> eorx = fgx ^ bgx;
> k = image->width/ppw;
> + jdecr = 8 / ppw;
> +
> + for (i = 0; i < tablen; ++i)
> + colortab[i] = (tab[i] & eorx) ^ bgx;
This code could have been embedded with the switch (bpp) {
That would have made some sense I think.
But both ways work, so this was just a small observation.
Sam
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
@ 2022-02-18 10:14 ` Sam Ravnborg
0 siblings, 0 replies; 20+ messages in thread
From: Sam Ravnborg @ 2022-02-18 10:14 UTC (permalink / raw)
To: Thomas Zimmermann; +Cc: linux-fbdev, deller, javierm, dri-devel, geert
Hi Thomas,
On Thu, Feb 17, 2022 at 11:34:05AM +0100, Thomas Zimmermann wrote:
> Improve the performance of sys_imageblit() by manually unrolling
> the inner blitting loop and moving some invariants out. The compiler
> failed to do this automatically. The resulting binary code was even
> slower than the cfb_imageblit() helper, which uses the same algorithm,
> but operates on I/O memory.
It would be super to have the same optimization done to cfb_imageblit(),
to prevent the two codebases from diverging more than necessary.
Also I think cfb_ version would also see a performance gain from this.
The actual implementation looks good.
So with or without the extra un-rolling the patch is:
Acked-by: Sam Ravnborg <sam@ravnborg.org>
One small nit below.
Sam
>
> A microbenchmark measures the average number of CPU cycles
> for sys_imageblit() after a stabilizing period of a few minutes
> (i7-4790, FullHD, simpledrm, kernel with debugging). The value
> for CFB is given as a reference.
>
> sys_imageblit(), new: 25934 cycles
> sys_imageblit(), old: 35944 cycles
> cfb_imageblit(): 30566 cycles
>
> In the optimized case, sys_imageblit() is now ~30% faster than before
> and ~20% faster than cfb_imageblit().
>
> Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
> ---
> drivers/video/fbdev/core/sysimgblt.c | 51 +++++++++++++++++++++-------
> 1 file changed, 39 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/video/fbdev/core/sysimgblt.c b/drivers/video/fbdev/core/sysimgblt.c
> index a4d05b1b17d7..d70d65af6fcb 100644
> --- a/drivers/video/fbdev/core/sysimgblt.c
> +++ b/drivers/video/fbdev/core/sysimgblt.c
> @@ -188,23 +188,32 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
> {
> u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
> u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
> - u32 bit_mask, end_mask, eorx, shift;
> + u32 bit_mask, eorx;
> const char *s = image->data, *src;
> u32 *dst;
> - const u32 *tab = NULL;
> - int i, j, k;
> + const u32 *tab;
> + size_t tablen;
> + u32 colortab[16];
> + int i, j, k, jdecr;
> +
> + if ((uintptr_t)dst1 % 8)
> + return;
This check is new - and should not trigger ever. Maybe add an unlikely
and a WARN_ON_ONCE()?
>
> switch (bpp) {
> case 8:
> tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le;
> + tablen = 16;
> break;
> case 16:
> tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
> + tablen = 4;
> break;
> case 32:
> - default:
> tab = cfb_tab32;
> + tablen = 2;
> break;
> + default:
> + return;
> }
>
> for (i = ppw-1; i--; ) {
> @@ -217,19 +226,37 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
> bit_mask = (1 << ppw) - 1;
> eorx = fgx ^ bgx;
> k = image->width/ppw;
> + jdecr = 8 / ppw;
> +
> + for (i = 0; i < tablen; ++i)
> + colortab[i] = (tab[i] & eorx) ^ bgx;
This code could have been embedded with the switch (bpp) {
That would have made some sense I think.
But both ways work, so this was just a small observation.
Sam
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
2022-02-18 10:14 ` Sam Ravnborg
@ 2022-02-18 14:09 ` Thomas Zimmermann
-1 siblings, 0 replies; 20+ messages in thread
From: Thomas Zimmermann @ 2022-02-18 14:09 UTC (permalink / raw)
To: Sam Ravnborg; +Cc: daniel, deller, javierm, geert, linux-fbdev, dri-devel
[-- Attachment #1.1: Type: text/plain, Size: 3724 bytes --]
Hi Sam
Am 18.02.22 um 11:14 schrieb Sam Ravnborg:
> Hi Thomas,
>
> On Thu, Feb 17, 2022 at 11:34:05AM +0100, Thomas Zimmermann wrote:
>> Improve the performance of sys_imageblit() by manually unrolling
>> the inner blitting loop and moving some invariants out. The compiler
>> failed to do this automatically. The resulting binary code was even
>> slower than the cfb_imageblit() helper, which uses the same algorithm,
>> but operates on I/O memory.
>
> It would be super to have the same optimization done to cfb_imageblit(),
> to prevent the two codebases from diverging more than necessary.
> Also I think cfb_ version would also see a performance gain from this.
Yes, I can do that.
>
> The actual implementation looks good.
> So with or without the extra un-rolling the patch is:
> Acked-by: Sam Ravnborg <sam@ravnborg.org>
>
> One small nit below.
>
> Sam
>
>>
>> A microbenchmark measures the average number of CPU cycles
>> for sys_imageblit() after a stabilizing period of a few minutes
>> (i7-4790, FullHD, simpledrm, kernel with debugging). The value
>> for CFB is given as a reference.
>>
>> sys_imageblit(), new: 25934 cycles
>> sys_imageblit(), old: 35944 cycles
>> cfb_imageblit(): 30566 cycles
>>
>> In the optimized case, sys_imageblit() is now ~30% faster than before
>> and ~20% faster than cfb_imageblit().
>>
>> Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
>> ---
>> drivers/video/fbdev/core/sysimgblt.c | 51 +++++++++++++++++++++-------
>> 1 file changed, 39 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/video/fbdev/core/sysimgblt.c b/drivers/video/fbdev/core/sysimgblt.c
>> index a4d05b1b17d7..d70d65af6fcb 100644
>> --- a/drivers/video/fbdev/core/sysimgblt.c
>> +++ b/drivers/video/fbdev/core/sysimgblt.c
>> @@ -188,23 +188,32 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
>> {
>> u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
>> u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
>> - u32 bit_mask, end_mask, eorx, shift;
>> + u32 bit_mask, eorx;
>> const char *s = image->data, *src;
>> u32 *dst;
>> - const u32 *tab = NULL;
>> - int i, j, k;
>> + const u32 *tab;
>> + size_t tablen;
>> + u32 colortab[16];
>> + int i, j, k, jdecr;
>> +
>> + if ((uintptr_t)dst1 % 8)
>> + return;
> This check is new - and should not trigger ever. Maybe add an unlikely
> and a WARN_ON_ONCE()?
I think I can remove this test. It was supposed to tell the compiler
that dst1 is 8-aligned, but I don't think it worked.
Best regards
Thomas
>
>
>>
>> switch (bpp) {
>> case 8:
>> tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le;
>> + tablen = 16;
>> break;
>> case 16:
>> tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
>> + tablen = 4;
>> break;
>> case 32:
>> - default:
>> tab = cfb_tab32;
>> + tablen = 2;
>> break;
>> + default:
>> + return;
>> }
>>
>> for (i = ppw-1; i--; ) {
>> @@ -217,19 +226,37 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
>> bit_mask = (1 << ppw) - 1;
>> eorx = fgx ^ bgx;
>> k = image->width/ppw;
>> + jdecr = 8 / ppw;
>> +
>> + for (i = 0; i < tablen; ++i)
>> + colortab[i] = (tab[i] & eorx) ^ bgx;
> This code could have been embedded with the switch (bpp) {
> That would have made some sense I think.
> But both ways work, so this was just a small observation.
>
> Sam
--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 840 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 2/2] fbdev: Improve performance of sys_imageblit()
@ 2022-02-18 14:09 ` Thomas Zimmermann
0 siblings, 0 replies; 20+ messages in thread
From: Thomas Zimmermann @ 2022-02-18 14:09 UTC (permalink / raw)
To: Sam Ravnborg; +Cc: linux-fbdev, deller, javierm, dri-devel, geert
[-- Attachment #1.1: Type: text/plain, Size: 3724 bytes --]
Hi Sam
Am 18.02.22 um 11:14 schrieb Sam Ravnborg:
> Hi Thomas,
>
> On Thu, Feb 17, 2022 at 11:34:05AM +0100, Thomas Zimmermann wrote:
>> Improve the performance of sys_imageblit() by manually unrolling
>> the inner blitting loop and moving some invariants out. The compiler
>> failed to do this automatically. The resulting binary code was even
>> slower than the cfb_imageblit() helper, which uses the same algorithm,
>> but operates on I/O memory.
>
> It would be super to have the same optimization done to cfb_imageblit(),
> to prevent the two codebases from diverging more than necessary.
> Also I think cfb_ version would also see a performance gain from this.
Yes, I can do that.
>
> The actual implementation looks good.
> So with or without the extra un-rolling the patch is:
> Acked-by: Sam Ravnborg <sam@ravnborg.org>
>
> One small nit below.
>
> Sam
>
>>
>> A microbenchmark measures the average number of CPU cycles
>> for sys_imageblit() after a stabilizing period of a few minutes
>> (i7-4790, FullHD, simpledrm, kernel with debugging). The value
>> for CFB is given as a reference.
>>
>> sys_imageblit(), new: 25934 cycles
>> sys_imageblit(), old: 35944 cycles
>> cfb_imageblit(): 30566 cycles
>>
>> In the optimized case, sys_imageblit() is now ~30% faster than before
>> and ~20% faster than cfb_imageblit().
>>
>> Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
>> ---
>> drivers/video/fbdev/core/sysimgblt.c | 51 +++++++++++++++++++++-------
>> 1 file changed, 39 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/video/fbdev/core/sysimgblt.c b/drivers/video/fbdev/core/sysimgblt.c
>> index a4d05b1b17d7..d70d65af6fcb 100644
>> --- a/drivers/video/fbdev/core/sysimgblt.c
>> +++ b/drivers/video/fbdev/core/sysimgblt.c
>> @@ -188,23 +188,32 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
>> {
>> u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel;
>> u32 ppw = 32/bpp, spitch = (image->width + 7)/8;
>> - u32 bit_mask, end_mask, eorx, shift;
>> + u32 bit_mask, eorx;
>> const char *s = image->data, *src;
>> u32 *dst;
>> - const u32 *tab = NULL;
>> - int i, j, k;
>> + const u32 *tab;
>> + size_t tablen;
>> + u32 colortab[16];
>> + int i, j, k, jdecr;
>> +
>> + if ((uintptr_t)dst1 % 8)
>> + return;
> This check is new - and should not trigger ever. Maybe add an unlikely
> and a WARN_ON_ONCE()?
I think I can remove this test. It was supposed to tell the compiler
that dst1 is 8-aligned, but I don't think it worked.
Best regards
Thomas
>
>
>>
>> switch (bpp) {
>> case 8:
>> tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le;
>> + tablen = 16;
>> break;
>> case 16:
>> tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le;
>> + tablen = 4;
>> break;
>> case 32:
>> - default:
>> tab = cfb_tab32;
>> + tablen = 2;
>> break;
>> + default:
>> + return;
>> }
>>
>> for (i = ppw-1; i--; ) {
>> @@ -217,19 +226,37 @@ static void fast_imageblit(const struct fb_image *image, struct fb_info *p,
>> bit_mask = (1 << ppw) - 1;
>> eorx = fgx ^ bgx;
>> k = image->width/ppw;
>> + jdecr = 8 / ppw;
>> +
>> + for (i = 0; i < tablen; ++i)
>> + colortab[i] = (tab[i] & eorx) ^ bgx;
> This code could have been embedded with the switch (bpp) {
> That would have made some sense I think.
> But both ways work, so this was just a small observation.
>
> Sam
--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Ivo Totev
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 840 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread