[RFC 2.6.28 1/2] fbdev: add ability to set damage

linux-fbdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC 2.6.28 1/2] fbdev: add ability to set damage
@ 2009-01-15  0:06 Jaya Kumar
  2009-01-15  0:06 ` [RFC 2.6.28 2/2] broadsheetfb: add damage handling Jaya Kumar
  2009-01-15  9:25 ` [RFC 2.6.28 1/2] fbdev: add ability to set damage Tomi Valkeinen
  0 siblings, 2 replies; 18+ messages in thread
From: Jaya Kumar @ 2009-01-15  0:06 UTC (permalink / raw)
  Cc: linux-fbdev-devel, adaplas, Magnus Damm, armbru, lethal,
	Geert Uytterhoeven, Jaya Kumar

Hi Geert, Krzysztof, Magnus, fbdev friends,

I would like to propose this idea about allowing userspace to provide damage
information to drivers. This is just a first pass implementation. Please let
me know your thoughts.

Thanks,
jaya

This patch adds the ability for userspace applications to provide damage
information to the underlying driver. This is useful in scenarios where the
underlying driver can perform transfer optimizations based on knowing
exactly which framebuffer areas that were updated. This functionality is
exposed by using a simple x,y,w,h bounding box structure. Userspace is
expected to perform its damage and then perform the ioctl. The underlying
driver is free to use this information as it sees fit including ignoring it
if it chooses to. An example use case will be provided in the case of
broadsheetfb.c where the damage information is aggregated for deferred use.

Signed-off-by: Jaya Kumar <jayakumar.lkml@gmail.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Cc: Magnus Damm <magnus.damm@gmail.com>
Cc: armbru@redhat.com
Cc: lethal@linux-sh.org
Cc: adaplas@gmail.com
Cc: linux-fbdev-devel@lists.sourceforge.net
---
 drivers/video/fbmem.c |   61 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fb.h    |   24 +++++++++++++++++++
 2 files changed, 85 insertions(+), 0 deletions(-)

diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 756efeb..f95ec45 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1006,6 +1006,60 @@ fb_blank(struct fb_info *info, int blank)
  	return ret;
 }
 
+static int fb_alloc_damage(struct fb_damage **damagep, int len)
+{
+	struct fb_damage *damage;
+
+	damage = kzalloc(sizeof(struct fb_damage), GFP_KERNEL);
+	if (!damage)
+		return -ENOMEM;
+
+	damage->len = len;
+	damage->rects = kzalloc(sizeof(struct fb_damage_rect) * len,
+				GFP_KERNEL);
+	if (!damage->rects) {
+		kfree(damage);
+		return -ENOMEM;
+	}
+
+	*damagep = damage;
+	return 0;
+}
+
+static void fb_free_damage(struct fb_damage *damage)
+{
+	if (damage)
+		kfree(damage->rects);
+	kfree(damage);
+}
+
+static int fb_set_damage(struct fb_info *info, struct fb_damage_user *udamage)
+{
+	int ret = -EINVAL;
+	int size = udamage->len;
+	struct fb_damage *damage;
+
+	if (size > FB_DAMAGE_COUNT_MAX)
+		goto fail;
+
+	ret = fb_alloc_damage(&damage, size);
+	if (ret)
+		goto fail;
+
+	if (copy_from_user(damage->rects, udamage->rects,
+				sizeof(struct fb_damage_rect)*size)) {
+		ret = -EFAULT;
+		goto fail2;
+	}
+
+	if (info->fbops->fb_set_damage)
+		return info->fbops->fb_set_damage(info, damage);
+fail2:
+	fb_free_damage(damage);
+fail:
+	return ret;
+}
+
 static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 			unsigned long arg)
 {
@@ -1015,6 +1069,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 	struct fb_con2fbmap con2fb;
 	struct fb_cmap_user cmap;
 	struct fb_event event;
+	struct fb_damage_user udamage;
 	void __user *argp = (void __user *)arg;
 	long ret = 0;
 
@@ -1116,6 +1171,12 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 		info->flags &= ~FBINFO_MISC_USEREVENT;
 		release_console_sem();
 		break;;
+	case FBIOPUT_DAMAGE:
+		if (copy_from_user(&udamage, argp, sizeof(udamage)))
+			ret = -EFAULT;
+		else
+			ret = fb_set_damage(info, &udamage);
+		break;
 	default:
 		if (fb->fb_ioctl == NULL)
 			ret = -ENOTTY;
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 1ee63df..8ee5f6d 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -37,7 +37,10 @@ struct dentry;
 #define FBIOGET_HWCINFO         0x4616
 #define FBIOPUT_MODEINFO        0x4617
 #define FBIOGET_DISPINFO        0x4618
+/* to allow userspace to provide screen damage information to drivers */
+#define FBIOPUT_DAMAGE          0x4619
 
+#define FB_DAMAGE_COUNT_MAX	32	/* max number of damage rects */
 
 #define FB_TYPE_PACKED_PIXELS		0	/* Packed Pixels	*/
 #define FB_TYPE_PLANES			1	/* Non interleaved planes */
@@ -357,6 +360,24 @@ struct fb_image {
 	struct fb_cmap cmap;	/* color map info */
 };
 
+struct fb_damage_rect {
+	__u16 x;
+	__u16 y;
+	__u16 w;
+	__u16 h;
+};
+
+struct fb_damage {
+	struct list_head list;
+	__u32 len;		/* Number of entries */
+	struct fb_damage_rect *rects; /* array of damage rectangles */
+};
+
+struct fb_damage_user {
+	__u32 len;		/* Number of entries */
+	struct fb_damage_rect __user *rects; /* array of damage rectangles */
+};
+
 /*
  * hardware cursor control
  */
@@ -672,6 +693,9 @@ struct fb_ops {
 	/* get capability given var */
 	void (*fb_get_caps)(struct fb_info *info, struct fb_blit_caps *caps,
 			    struct fb_var_screeninfo *var);
+
+	/* provide damage information */
+	int (*fb_set_damage)(struct fb_info *info, struct fb_damage *damage);
 };
 
 #ifdef CONFIG_FB_TILEBLITTING
-- 
1.5.2.3


------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC 2.6.28 2/2] broadsheetfb: add damage handling
  2009-01-15  0:06 [RFC 2.6.28 1/2] fbdev: add ability to set damage Jaya Kumar
@ 2009-01-15  0:06 ` Jaya Kumar
  2009-01-15  9:25 ` [RFC 2.6.28 1/2] fbdev: add ability to set damage Tomi Valkeinen
  1 sibling, 0 replies; 18+ messages in thread
From: Jaya Kumar @ 2009-01-15  0:06 UTC (permalink / raw)
  Cc: linux-fbdev-devel, adaplas, Magnus Damm, armbru, lethal,
	Geert Uytterhoeven, Jaya Kumar

This patch adds support within broadsheetfb to process damage information
provided by userspace in order to perform more accurate partial updates.

Signed-off-by: Jaya Kumar <jayakumar.lkml@gmail.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Cc: Magnus Damm <magnus.damm@gmail.com>
Cc: armbru@redhat.com
Cc: lethal@linux-sh.org
Cc: adaplas@gmail.com
Cc: linux-fbdev-devel@lists.sourceforge.net
---
 drivers/video/broadsheetfb.c |  106 +++++++++++++++++++++++++++++++++++++++++-
 include/video/broadsheetfb.h |    2 +
 2 files changed, 107 insertions(+), 1 deletions(-)

diff --git a/drivers/video/broadsheetfb.c b/drivers/video/broadsheetfb.c
index 69788b9..98b02ef 100644
--- a/drivers/video/broadsheetfb.c
+++ b/drivers/video/broadsheetfb.c
@@ -293,6 +293,83 @@ static void broadsheetfb_dpy_update(struct broadsheetfb_par *par)
 
 }
 
+static void broadsheetfb_upd_full(struct broadsheetfb_par *par)
+{
+	u16 args[5];
+	args[0] = 0x4300;
+	broadsheet_send_cmdargs(par, BS_CMD_UPD_FULL, 1, args);
+
+	broadsheet_send_command(par, BS_CMD_WAIT_DSPE_TRG);
+
+	broadsheet_send_command(par, BS_CMD_WAIT_DSPE_FREND);
+
+	par->board->wait_for_rdy(par);
+}
+
+static void broadsheetfb_load_image_area(struct broadsheetfb_par *par, u16 x,
+						u16 y, u16 w, u16 h)
+{
+	u16 args[5];
+	unsigned char *sbuf = (unsigned char *)par->info->screen_base;
+	unsigned char *buf;
+	int j;
+
+	/* x must be a multiple of 4 so drop the lower bits */
+	x &= 0xFFFC;
+
+	/* y must be a multiple of 4 so drop the lower bits */
+	y &= 0xFFFC;
+
+	args[0] = 0x3 << 4;
+	args[1] = x;
+	args[2] = y;
+	args[3] = w;
+	args[4] = h;
+	broadsheet_send_cmdargs(par, BS_CMD_LD_IMG_AREA, 5, args);
+
+	args[0] = 0x154;
+	broadsheet_send_cmdargs(par, BS_CMD_WR_REG, 1, args);
+
+	for (j = y; j < y + h; j++) {
+		buf = sbuf + x + (j * par->info->var.xres);
+		broadsheet_burst_write(par, (w+1)/2, (u16 *) buf);
+	}
+	broadsheet_send_command(par, BS_CMD_LD_IMG_END);
+}
+
+static int broadsheetfb_process_damage(struct fb_info *info)
+{
+	struct broadsheetfb_par *par = info->par;
+	struct fb_damage *cur, *next;
+	int ret = -EINVAL;
+	int i;
+	struct fb_damage_rect *rect;
+
+	mutex_lock(&par->damage_lock);
+	/* if there is no damage, then caller has to do work to figure out
+	 * the changes on its own */
+	if (list_empty(&par->damagelist))
+		goto finish;
+
+	list_for_each_entry_safe(cur, next, &par->damagelist, list) {
+		for (i = 0; i < cur->len; i++) {
+			rect = &(cur->rects[i]);
+			broadsheetfb_load_image_area(par, rect->x, rect->y,
+							rect->w, rect->h);
+		}
+		list_del(&cur->list);
+		kfree(cur->rects);
+		kfree(cur);
+	}
+
+	broadsheetfb_upd_full(par);
+	ret = 0;
+finish:
+	mutex_unlock(&par->damage_lock);
+	return ret;
+}
+
+
 /* this is called back from the deferred io workqueue */
 static void broadsheetfb_dpy_deferred_io(struct fb_info *info,
 				struct list_head *pagelist)
@@ -304,6 +381,14 @@ static void broadsheetfb_dpy_deferred_io(struct fb_info *info,
 	int h_inc;
 	u16 yres = info->var.yres;
 	u16 xres = info->var.xres;
+	int ret;
+
+	/* if we have damage data then use it exclusively */
+	ret = broadsheetfb_process_damage(info);
+	if (!ret)
+		return;
+
+	/* if no damage then rely on page information */
 
 	/* height increment is fixed per page */
 	h_inc = DIV_ROUND_UP(PAGE_SIZE , xres);
@@ -414,6 +499,18 @@ static ssize_t broadsheetfb_write(struct fb_info *info, const char __user *buf,
 	return (err) ? err : count;
 }
 
+static int broadsheetfb_set_damage(struct fb_info *info,
+					struct fb_damage *damage)
+{
+	struct broadsheetfb_par *par = info->par;
+
+	mutex_lock(&par->damage_lock);
+	list_add_tail(&damage->list, &par->damagelist);
+	mutex_unlock(&par->damage_lock);
+
+	return 0;
+}
+
 static struct fb_ops broadsheetfb_ops = {
 	.owner		= THIS_MODULE,
 	.fb_read        = fb_sys_read,
@@ -421,6 +518,7 @@ static struct fb_ops broadsheetfb_ops = {
 	.fb_fillrect	= broadsheetfb_fillrect,
 	.fb_copyarea	= broadsheetfb_copyarea,
 	.fb_imageblit	= broadsheetfb_imageblit,
+	.fb_set_damage	= broadsheetfb_set_damage,
 };
 
 static struct fb_deferred_io broadsheetfb_defio = {
@@ -499,6 +597,9 @@ static int __devinit broadsheetfb_probe(struct platform_device *dev)
 
 	broadsheet_init(par);
 
+	INIT_LIST_HEAD(&par->damagelist);
+	mutex_init(&par->damage_lock);
+
 	retval = register_framebuffer(info);
 	if (retval < 0)
 		goto err_free_irq;
@@ -513,8 +614,10 @@ static int __devinit broadsheetfb_probe(struct platform_device *dev)
 
 err_free_irq:
 	board->cleanup(par);
+	mutex_destroy(&par->damage_lock);
 err_cmap:
 	fb_dealloc_cmap(&info->cmap);
+	fb_deferred_io_cleanup(info);
 err_vfree:
 	vfree(videomemory);
 err_fb_rel:
@@ -532,9 +635,10 @@ static int __devexit broadsheetfb_remove(struct platform_device *dev)
 	if (info) {
 		struct broadsheetfb_par *par = info->par;
 		unregister_framebuffer(info);
-		fb_deferred_io_cleanup(info);
 		par->board->cleanup(par);
+		mutex_destroy(&par->damage_lock);
 		fb_dealloc_cmap(&info->cmap);
+		fb_deferred_io_cleanup(info);
 		vfree((void *)info->screen_base);
 		module_put(par->board->owner);
 		framebuffer_release(info);
diff --git a/include/video/broadsheetfb.h b/include/video/broadsheetfb.h
index a758534..5320a04 100644
--- a/include/video/broadsheetfb.h
+++ b/include/video/broadsheetfb.h
@@ -41,6 +41,8 @@ struct broadsheetfb_par {
 	void (*write_reg)(struct broadsheetfb_par *, u16 reg, u16 val);
 	u16 (*read_reg)(struct broadsheetfb_par *, u16 reg);
 	wait_queue_head_t waitq;
+	struct mutex damage_lock;
+	struct list_head damagelist;
 };
 
 /* board specific routines */
-- 
1.5.2.3


------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-15  0:06 [RFC 2.6.28 1/2] fbdev: add ability to set damage Jaya Kumar
  2009-01-15  0:06 ` [RFC 2.6.28 2/2] broadsheetfb: add damage handling Jaya Kumar
@ 2009-01-15  9:25 ` Tomi Valkeinen
  2009-01-15  9:53   ` Jaya Kumar
  1 sibling, 1 reply; 18+ messages in thread
From: Tomi Valkeinen @ 2009-01-15  9:25 UTC (permalink / raw)
  To: ext Jaya Kumar
  Cc: linux-fbdev-devel, adaplas, Magnus Damm, armbru, lethal,
	Geert Uytterhoeven

Hi,

On Thu, 2009-01-15 at 08:06 +0800, ext Jaya Kumar wrote:
> Hi Geert, Krzysztof, Magnus, fbdev friends,
> 
> I would like to propose this idea about allowing userspace to provide damage
> information to drivers. This is just a first pass implementation. Please let
> me know your thoughts.

omapfb does actually something similar with a custom IOCTL,
OMAPFB_UPDATE_WINDOW. If other fbs need similar functionality, then this
sounds good to me.

However, those kallocs give me some shivers. I don't know how fast
kallocs are, so perhaps I'm worrying about nothing. But is such a
dynamic way to pass damaged area needed? omapfb is on the other end, you
can just give one rectangle with it.

I have often been wondering about this, is it better to update one
bigger area in one pass, or multiple smaller areas. I guess there's no
real answer to it, though =).


> Thanks,
> jaya

Tomi

> 
> This patch adds the ability for userspace applications to provide damage
> information to the underlying driver. This is useful in scenarios where the
> underlying driver can perform transfer optimizations based on knowing
> exactly which framebuffer areas that were updated. This functionality is
> exposed by using a simple x,y,w,h bounding box structure. Userspace is
> expected to perform its damage and then perform the ioctl. The underlying
> driver is free to use this information as it sees fit including ignoring it
> if it chooses to. An example use case will be provided in the case of
> broadsheetfb.c where the damage information is aggregated for deferred use.
> 
> Signed-off-by: Jaya Kumar <jayakumar.lkml@gmail.com>
> Cc: Geert Uytterhoeven <geert@linux-m68k.org>
> Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
> Cc: Magnus Damm <magnus.damm@gmail.com>
> Cc: armbru@redhat.com
> Cc: lethal@linux-sh.org
> Cc: adaplas@gmail.com
> Cc: linux-fbdev-devel@lists.sourceforge.net
> ---
>  drivers/video/fbmem.c |   61 +++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/fb.h    |   24 +++++++++++++++++++
>  2 files changed, 85 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
> index 756efeb..f95ec45 100644
> --- a/drivers/video/fbmem.c
> +++ b/drivers/video/fbmem.c
> @@ -1006,6 +1006,60 @@ fb_blank(struct fb_info *info, int blank)
>   	return ret;
>  }
>  
> +static int fb_alloc_damage(struct fb_damage **damagep, int len)
> +{
> +	struct fb_damage *damage;
> +
> +	damage = kzalloc(sizeof(struct fb_damage), GFP_KERNEL);
> +	if (!damage)
> +		return -ENOMEM;
> +
> +	damage->len = len;
> +	damage->rects = kzalloc(sizeof(struct fb_damage_rect) * len,
> +				GFP_KERNEL);
> +	if (!damage->rects) {
> +		kfree(damage);
> +		return -ENOMEM;
> +	}
> +
> +	*damagep = damage;
> +	return 0;
> +}
> +
> +static void fb_free_damage(struct fb_damage *damage)
> +{
> +	if (damage)
> +		kfree(damage->rects);
> +	kfree(damage);
> +}
> +
> +static int fb_set_damage(struct fb_info *info, struct fb_damage_user *udamage)
> +{
> +	int ret = -EINVAL;
> +	int size = udamage->len;
> +	struct fb_damage *damage;
> +
> +	if (size > FB_DAMAGE_COUNT_MAX)
> +		goto fail;
> +
> +	ret = fb_alloc_damage(&damage, size);
> +	if (ret)
> +		goto fail;
> +
> +	if (copy_from_user(damage->rects, udamage->rects,
> +				sizeof(struct fb_damage_rect)*size)) {
> +		ret = -EFAULT;
> +		goto fail2;
> +	}
> +
> +	if (info->fbops->fb_set_damage)
> +		return info->fbops->fb_set_damage(info, damage);
> +fail2:
> +	fb_free_damage(damage);
> +fail:
> +	return ret;
> +}
> +
>  static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
>  			unsigned long arg)
>  {
> @@ -1015,6 +1069,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
>  	struct fb_con2fbmap con2fb;
>  	struct fb_cmap_user cmap;
>  	struct fb_event event;
> +	struct fb_damage_user udamage;
>  	void __user *argp = (void __user *)arg;
>  	long ret = 0;
>  
> @@ -1116,6 +1171,12 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
>  		info->flags &= ~FBINFO_MISC_USEREVENT;
>  		release_console_sem();
>  		break;;
> +	case FBIOPUT_DAMAGE:
> +		if (copy_from_user(&udamage, argp, sizeof(udamage)))
> +			ret = -EFAULT;
> +		else
> +			ret = fb_set_damage(info, &udamage);
> +		break;
>  	default:
>  		if (fb->fb_ioctl == NULL)
>  			ret = -ENOTTY;
> diff --git a/include/linux/fb.h b/include/linux/fb.h
> index 1ee63df..8ee5f6d 100644
> --- a/include/linux/fb.h
> +++ b/include/linux/fb.h
> @@ -37,7 +37,10 @@ struct dentry;
>  #define FBIOGET_HWCINFO         0x4616
>  #define FBIOPUT_MODEINFO        0x4617
>  #define FBIOGET_DISPINFO        0x4618
> +/* to allow userspace to provide screen damage information to drivers */
> +#define FBIOPUT_DAMAGE          0x4619
>  
> +#define FB_DAMAGE_COUNT_MAX	32	/* max number of damage rects */
>  
>  #define FB_TYPE_PACKED_PIXELS		0	/* Packed Pixels	*/
>  #define FB_TYPE_PLANES			1	/* Non interleaved planes */
> @@ -357,6 +360,24 @@ struct fb_image {
>  	struct fb_cmap cmap;	/* color map info */
>  };
>  
> +struct fb_damage_rect {
> +	__u16 x;
> +	__u16 y;
> +	__u16 w;
> +	__u16 h;
> +};
> +
> +struct fb_damage {
> +	struct list_head list;
> +	__u32 len;		/* Number of entries */
> +	struct fb_damage_rect *rects; /* array of damage rectangles */
> +};
> +
> +struct fb_damage_user {
> +	__u32 len;		/* Number of entries */
> +	struct fb_damage_rect __user *rects; /* array of damage rectangles */
> +};
> +
>  /*
>   * hardware cursor control
>   */
> @@ -672,6 +693,9 @@ struct fb_ops {
>  	/* get capability given var */
>  	void (*fb_get_caps)(struct fb_info *info, struct fb_blit_caps *caps,
>  			    struct fb_var_screeninfo *var);
> +
> +	/* provide damage information */
> +	int (*fb_set_damage)(struct fb_info *info, struct fb_damage *damage);
>  };
>  
>  #ifdef CONFIG_FB_TILEBLITTING


------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-15  9:25 ` [RFC 2.6.28 1/2] fbdev: add ability to set damage Tomi Valkeinen
@ 2009-01-15  9:53   ` Jaya Kumar
  2009-01-15 10:29     ` Magnus Damm
  0 siblings, 1 reply; 18+ messages in thread
From: Jaya Kumar @ 2009-01-15  9:53 UTC (permalink / raw)
  To: tomi.valkeinen
  Cc: linux-fbdev-devel, adaplas, Magnus Damm, armbru, lethal,
	Geert Uytterhoeven

On Thu, Jan 15, 2009 at 4:25 AM, Tomi Valkeinen
<tomi.valkeinen@nokia.com> wrote:
> Hi,
>
> On Thu, 2009-01-15 at 08:06 +0800, ext Jaya Kumar wrote:
>> Hi Geert, Krzysztof, Magnus, fbdev friends,
>>
>> I would like to propose this idea about allowing userspace to provide damage
>> information to drivers. This is just a first pass implementation. Please let
>> me know your thoughts.
>
> omapfb does actually something similar with a custom IOCTL,
> OMAPFB_UPDATE_WINDOW. If other fbs need similar functionality, then this
> sounds good to me.
>
> However, those kallocs give me some shivers. I don't know how fast
> kallocs are, so perhaps I'm worrying about nothing. But is such a
> dynamic way to pass damaged area needed? omapfb is on the other end, you
> can just give one rectangle with it.

Hi Tomi,

Thanks, currently, I believe that hecubafb, metronomefb and
broadsheetfb would benefit although I've only implemented use of this
in broadsheetfb. I think there is possibility that sh and xen_pvfb can
benefit too. I think we can work on this together to make the API be
sufficiently generic.

Acknowledging that kzalloc is definitely not appropriate for all
drivers, I propose the following changes to the implementation.

a) allow userspace to determine optimal number of rectangles
FBIO_GETDAMAGE
which would allow the driver to report back (in the same fb_damage
structure) the optimal number of rectangles that it can support.

b) allow drivers to handle memory allocation as desired themselves
Instead of doing the copy_from_user and kzalloc in the higher level
fb_set_damage, we pass that user pointer directly to the driver. Thus,
it would be:
int (*fb_set_damage)(struct fb_info *info, struct fb_damage_user *damage);

and the driver can handle according to its needs.

>
> I have often been wondering about this, is it better to update one
> bigger area in one pass, or multiple smaller areas. I guess there's no
> real answer to it, though =).

I agree with you that it becomes dependent on the display controller
and display material that it is updating. In the case of broadsheetfb,
being able to isolate exactly which pixels need to be updated has
significant performance impact due to the fact that each pixel has to
be gpio-ed to the hardware, and the e-paper latency per waveform
update has to be expended as well, which is why rather than a global
bounding box, I had selected support of multiple bounding boxes.

Thanks,
jaya

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-15  9:53   ` Jaya Kumar
@ 2009-01-15 10:29     ` Magnus Damm
  2009-01-15 11:08       ` Jaya Kumar
  0 siblings, 1 reply; 18+ messages in thread
From: Magnus Damm @ 2009-01-15 10:29 UTC (permalink / raw)
  To: Jaya Kumar; +Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

Hi Jaya,

I agree with Tomi about the memory allocation.

On Thu, Jan 15, 2009 at 6:53 PM, Jaya Kumar <jayakumar.lkml@gmail.com> wrote:
> Acknowledging that kzalloc is definitely not appropriate for all
> drivers, I propose the following changes to the implementation.
>
> a) allow userspace to determine optimal number of rectangles
> FBIO_GETDAMAGE
> which would allow the driver to report back (in the same fb_damage
> structure) the optimal number of rectangles that it can support.
>
> b) allow drivers to handle memory allocation as desired themselves
> Instead of doing the copy_from_user and kzalloc in the higher level
> fb_set_damage, we pass that user pointer directly to the driver. Thus,
> it would be:
> int (*fb_set_damage)(struct fb_info *info, struct fb_damage_user *damage);
>
> and the driver can handle according to its needs.

I wonder how fine grained control that is needed. It's not an exact
science, right? If a slightly larger area is updated than what is
needed then we will take a performance hit, but things should work as
expected apart from that right?

I'm a big fan of simple things like bitmaps. I wonder if it's a good
idea to divide the entire frame buffer into equally sized X*Y tiles
and have a bitmap of dirty bits. A "1" in the bitmap means tile is
dirty and needs update and a "0" means no need to update. The best
tile size is application specific. The size of the bitmap varies of
course with the tile size.

For a 1024x768 display using 32x32 tiles we need 24 32-bit words.
That's pretty small and simple, no?

Cheers,

/ magnus

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-15 10:29     ` Magnus Damm
@ 2009-01-15 11:08       ` Jaya Kumar
  2009-01-16  3:09         ` Magnus Damm
  0 siblings, 1 reply; 18+ messages in thread
From: Jaya Kumar @ 2009-01-15 11:08 UTC (permalink / raw)
  To: Magnus Damm
  Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

On Thu, Jan 15, 2009 at 5:29 AM, Magnus Damm <magnus.damm@gmail.com> wrote:
> Hi Jaya,
>
> I agree with Tomi about the memory allocation.

Yes, I agree with that too. :-) I proposed pushing that decision into
the driver so that it could decide for itself whether it wants to
allocate a buffer or retain a fixed structure.

>
> On Thu, Jan 15, 2009 at 6:53 PM, Jaya Kumar <jayakumar.lkml@gmail.com> wrote:
>> Acknowledging that kzalloc is definitely not appropriate for all
>> drivers, I propose the following changes to the implementation.
>>
>> a) allow userspace to determine optimal number of rectangles
>> FBIO_GETDAMAGE
>> which would allow the driver to report back (in the same fb_damage
>> structure) the optimal number of rectangles that it can support.
>>
>> b) allow drivers to handle memory allocation as desired themselves
>> Instead of doing the copy_from_user and kzalloc in the higher level
>> fb_set_damage, we pass that user pointer directly to the driver. Thus,
>> it would be:
>> int (*fb_set_damage)(struct fb_info *info, struct fb_damage_user *damage);
>>
>> and the driver can handle according to its needs.
>
> I wonder how fine grained control that is needed. It's not an exact
> science, right? If a slightly larger area is updated than what is

Agreed that it is not a one-approach fits all scenario.

> needed then we will take a performance hit, but things should work as
> expected apart from that right?

I'm not sure I understood this. Why do you say "If a large area is
updated, then we will take a performance hit."? I think that statement
depends on the device, right? I agree that if a lot of pixels are
updated, then there is a lot of data to transfer, but beyond that it
is very much dependent on the device, whether it uses DMA, what kind
of update latency it has, what kind of partial update capability it
has, all of which affect how much of a performance hit is taken and
what the optimal case would be.

>
> I'm a big fan of simple things like bitmaps. I wonder if it's a good
> idea to divide the entire frame buffer into equally sized X*Y tiles
> and have a bitmap of dirty bits. A "1" in the bitmap means tile is
> dirty and needs update and a "0" means no need to update. The best
> tile size is application specific. The size of the bitmap varies of
> course with the tile size.
>
> For a 1024x768 display using 32x32 tiles we need 24 32-bit words.
> That's pretty small and simple, no?

Okay, I just realized that I neglected to mention the XDamage
extension which had a big influence on me. I think the following page:
http://www.freedesktop.org/wiki/Software/XDamage
and:
http://www.opensource.apple.com/darwinsource/Current/X11proto-15.1/damageproto/damageproto-1.1.0/damageproto.txt
explain a lot of thinking that has gone into solving similar issues.

I think the fact that Xfbdev and Xorg utilize that rectangle and
rectangle count based infrastructure would push us towards retaining
the same concepts. In my mind, Xfbdev/Xorg would be the prime
candidate for this API.

Thanks,
jaya

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-15 11:08       ` Jaya Kumar
@ 2009-01-16  3:09         ` Magnus Damm
  2009-01-16  9:24           ` Jaya Kumar
  0 siblings, 1 reply; 18+ messages in thread
From: Magnus Damm @ 2009-01-16  3:09 UTC (permalink / raw)
  To: Jaya Kumar; +Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

On Thu, Jan 15, 2009 at 8:08 PM, Jaya Kumar <jayakumar.lkml@gmail.com> wrote:
> On Thu, Jan 15, 2009 at 5:29 AM, Magnus Damm <magnus.damm@gmail.com> wrote:
>> I agree with Tomi about the memory allocation.
>
> Yes, I agree with that too. :-) I proposed pushing that decision into
> the driver so that it could decide for itself whether it wants to
> allocate a buffer or retain a fixed structure.

Sure, letting the driver decide things depending on the type of
hardware sounds like a good plan.

>> I wonder how fine grained control that is needed. It's not an exact
>> science, right? If a slightly larger area is updated than what is
>
> Agreed that it is not a one-approach fits all scenario.
>
>> needed then we will take a performance hit, but things should work as
>> expected apart from that right?
>
> I'm not sure I understood this. Why do you say "If a large area is
> updated, then we will take a performance hit."? I think that statement
> depends on the device, right? I agree that if a lot of pixels are
> updated, then there is a lot of data to transfer, but beyond that it
> is very much dependent on the device, whether it uses DMA, what kind
> of update latency it has, what kind of partial update capability it
> has, all of which affect how much of a performance hit is taken and
> what the optimal case would be.

Sorry for my poor selection of words. I agree that it's device
dependent, but what I was trying to say is that a lossy conversion to
a larger area is ok if i've understood things correctly.

We will have correct behavior but performance degradation if the user
space program asks to update a small rectangle in the middle of the
screen but the driver or some layer in between decides to update say
the entire screen instead. Do you agree with me?

>> I'm a big fan of simple things like bitmaps. I wonder if it's a good
>> idea to divide the entire frame buffer into equally sized X*Y tiles
>> and have a bitmap of dirty bits. A "1" in the bitmap means tile is
>> dirty and needs update and a "0" means no need to update. The best
>> tile size is application specific. The size of the bitmap varies of
>> course with the tile size.
>>
>> For a 1024x768 display using 32x32 tiles we need 24 32-bit words.
>> That's pretty small and simple, no?

Just trying to pitch my idea a bit harder: The above example would
need a 96 bytes bitmap which will fit in just a few cache lines. This
arrangement of the data gives you good performance compared to
multiple allocations scattered all over the place.

Also, using a bitmap makes it at least half-easy to do a lossy OR
operation of all damage rectangles. Who is taking care of overlapping
updates otherwise - some user space library?

I'd say we would benefit from managing the OR operation within the
kernel since deferred io may collect a lot of overlapping areas over
time. Actually, we sort of do that already by touching the pages in
the deferred io mmap handling code. If we won't do any OR operation
within the kernel for deferred io, then how are we supposed to handle
long deferred io delays? Just keep on kmallocing rectangles? Or
expanding the rectangles?

Or maybe we are discussing apples and oranges? Is your damage API is
meant to force a screen update so there is no need for in-kernel OR
operation? We have a need for in-kernel OR operation with deferred io
already I think, so there is some overlap in my opinion.
.
> Okay, I just realized that I neglected to mention the XDamage
> extension which had a big influence on me. I think the following page:
> http://www.freedesktop.org/wiki/Software/XDamage
> and:
> http://www.opensource.apple.com/darwinsource/Current/X11proto-15.1/damageproto/damageproto-1.1.0/damageproto.txt
> explain a lot of thinking that has gone into solving similar issues.
>
> I think the fact that Xfbdev and Xorg utilize that rectangle and
> rectangle count based infrastructure would push us towards retaining
> the same concepts. In my mind, Xfbdev/Xorg would be the prime
> candidate for this API.

Thanks for the pointers. I'm not saying that using rectangles is a bad
thing, I just wonder if there are better data structures available for
backing the dirty screen area.

I'd say that a combination of rectangle based user space damage API
_and_ (maybe tile based) in-kernel dirty area OR operation is the best
approach. This because XDamage is rectangle based and the deferred io
delay (ie amount of time to collect dirty areas) is a kernel driver
property.

Cheers,

/ magnus

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-16  3:09         ` Magnus Damm
@ 2009-01-16  9:24           ` Jaya Kumar
  2009-01-16 11:08             ` Magnus Damm
  0 siblings, 1 reply; 18+ messages in thread
From: Jaya Kumar @ 2009-01-16  9:24 UTC (permalink / raw)
  To: Magnus Damm
  Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

On Thu, Jan 15, 2009 at 10:09 PM, Magnus Damm <magnus.damm@gmail.com> wrote:
> On Thu, Jan 15, 2009 at 8:08 PM, Jaya Kumar <jayakumar.lkml@gmail.com> wrote:
>> On Thu, Jan 15, 2009 at 5:29 AM, Magnus Damm <magnus.damm@gmail.com> wrote:
>>> needed then we will take a performance hit, but things should work as
>>> expected apart from that right?
>>
>> I'm not sure I understood this. Why do you say "If a large area is
>> updated, then we will take a performance hit."? I think that statement
>> depends on the device, right? I agree that if a lot of pixels are
>> updated, then there is a lot of data to transfer, but beyond that it
>> is very much dependent on the device, whether it uses DMA, what kind
>> of update latency it has, what kind of partial update capability it
>> has, all of which affect how much of a performance hit is taken and
>> what the optimal case would be.
>
> Sorry for my poor selection of words. I agree that it's device
> dependent, but what I was trying to say is that a lossy conversion to
> a larger area is ok if i've understood things correctly.

I think I understand your meaning. Okay, I think I might have a
relevant example. I ran xeyes (which uses shape) on my test setup on
broadsheetfb (btw, if it is of interest, I've put a demo video clip of
this here: http://www.youtube.com/watch?v=q_mLKQXcsgY ) and if I
remember correctly it generated about 10+ damage rectangles so I
suspect that it must have actually coalesced some of the damage area
in a lossy way. Another case would be drawing a diagonal line across
the screen. How many rectangles should that generate to be optimal? If
the hardware prefers single large transfers, then it would be optimal
to just do a full screen update. If the hardware exhibits a high
penalty per pixel transferred than it would be optimal to split the
transfers in order to reduce the total pixels transferred. So to
summarize, yes, I agree with you that a lossy conversion to a larger
area is okay. I'll go further and say that I think userspace apps like
Xfbdev and vnc must be doing that in order to optimize their pixmaps
and bitcopies.

>
> We will have correct behavior but performance degradation if the user
> space program asks to update a small rectangle in the middle of the
> screen but the driver or some layer in between decides to update say
> the entire screen instead. Do you agree with me?

I agree with you. I think that's the situation that we want to avoid
happening. I think we can avoid that by providing upper layers
(userspace) with sufficient information (but kept as generic as
possible) about the capabilities of the underlying layers in order for
userspace and the kernel to optimize its behavior.

>
>>> I'm a big fan of simple things like bitmaps. I wonder if it's a good
>>> idea to divide the entire frame buffer into equally sized X*Y tiles
>>> and have a bitmap of dirty bits. A "1" in the bitmap means tile is
>>> dirty and needs update and a "0" means no need to update. The best
>>> tile size is application specific. The size of the bitmap varies of
>>> course with the tile size.
>>>
>>> For a 1024x768 display using 32x32 tiles we need 24 32-bit words.
>>> That's pretty small and simple, no?
>
> Just trying to pitch my idea a bit harder: The above example would
> need a 96 bytes bitmap which will fit in just a few cache lines. This
> arrangement of the data gives you good performance compared to
> multiple allocations scattered all over the place.

I didn't follow the implication that there has to be multiple
allocations. If we are comparing the bitmap versus rects approach,
then my comparison would be:
a) where the driver preallocated a bitmap that would be updated by a
copy from userspace (same allocation would be done in userspace)
b) where the driver preallocated a fixed number of rectangles which
would be updated by a copy from userspace (same allocation would be
done in userspace)

>
> Also, using a bitmap makes it at least half-easy to do a lossy OR
> operation of all damage rectangles. Who is taking care of overlapping
> updates otherwise - some user space library?

I may not have fully understood above. I'm not sure that overlapping
updates must be avoided for all devices. Some devices would fail if
overlapping DMAs are done, but others would have no issues there. So
we would benefit from exposing that information to userspace so that
it could ensure overlaps are resolved if the underlying hardware
requires (or benefits from) it.

From our discussion so far, I've realized that we would benefit from
providing 3 things to userspace:
a) can_overlap flag
b) alignment constraint
c) max rectangle count

>
> I'd say we would benefit from managing the OR operation within the
> kernel since deferred io may collect a lot of overlapping areas over

I think there's an assumption there. I think you've associated
deferred IO with this damage API. Although the two can be related,
they don't have to be. I agree that it will very likely be deferred IO
drivers that are likely to benefit the most from this API but they can
also be completely separate.

> time. Actually, we sort of do that already by touching the pages in
> the deferred io mmap handling code. If we won't do any OR operation

Some questions here. Help me understand the "touching the pages in the
mmap handling code" part. I do not do that in deferred IO. fb_defio
does not write a page on its own, only userspace writes a page and
then this gets mkcleaned by defio when the client driver is done. Is
that your meaning, ie: we clean the pages?

> within the kernel for deferred io, then how are we supposed to handle
> long deferred io delays? Just keep on kmallocing rectangles? Or
> expanding the rectangles?

That's a good question. Here's my thoughts. Lets say we have a display
device with 10s latency (a scenario that exists in real life). As you
correctly pointed out, it would be bad if that driver kept aggregating
rectangles, as that would consume a significant amount of resources.
In that scenario, I recommend that the driver should convert the list
of rectangles into a bitmap. It is direct to convert from a rectangle
list to a bitmap as it is a linear mathematical operation. It can then
OR that with its existing bitmap.

I believe it is a more complex operation to convert from a bitmap to a
rectangle list or DMA transfer sequence. I'm trying to sketch the
function that would coalesce a bitmap of written pages into a sequence
of dma transfers. It requires heuristics and policy in order to
coalesce optimally. It would be similar to a Karnaugh map minimization
problem. I think that kind of operation would be a better fit to do in
userspace. That would fit the needs of a userspace framebuffer client
that kept its damage list as a bitmap. (Note, I'm not aware of any
examples of the latter yet.)

>
> Or maybe we are discussing apples and oranges? Is your damage API is

I think we are thinking about the same problems and have different
approaches for the solution. That is a good thing. It makes us think
harder about the API selection and I think we all benefit. I'm open to
the ideas you've raised and they are having an impact on the code I am
writing.

> meant to force a screen update so there is no need for in-kernel OR

No, the damage API is not meant to force the driver to update the
screen. The driver can decide what to do and when.

> operation? We have a need for in-kernel OR operation with deferred io
> already I think, so there is some overlap in my opinion.

I'm not sure I've understood your full meaning when you say "in-kernel
OR operation". Could you elaborate on that?

> .
>> Okay, I just realized that I neglected to mention the XDamage
>> extension which had a big influence on me. I think the following page:
>> http://www.freedesktop.org/wiki/Software/XDamage
>> and:
>> http://www.opensource.apple.com/darwinsource/Current/X11proto-15.1/damageproto/damageproto-1.1.0/damageproto.txt
>> explain a lot of thinking that has gone into solving similar issues.
>>
>> I think the fact that Xfbdev and Xorg utilize that rectangle and
>> rectangle count based infrastructure would push us towards retaining
>> the same concepts. In my mind, Xfbdev/Xorg would be the prime
>> candidate for this API.
>
> Thanks for the pointers. I'm not saying that using rectangles is a bad
> thing, I just wonder if there are better data structures available for
> backing the dirty screen area.
>
> I'd say that a combination of rectangle based user space damage API
> _and_ (maybe tile based) in-kernel dirty area OR operation is the best
> approach. This because XDamage is rectangle based and the deferred io
> delay (ie amount of time to collect dirty areas) is a kernel driver
> property.

I understand your point. I propose this: A driver that prefers a
bitmap can provide a flag in fb_info. Our in-kernel API can then use
that to decide whether to pass the rectangle list or to  generate the
bitmap from the rectangle list and then pass that to the driver. I'm
happy to implement that as I think it is a reasonable idea and
straightforward to achieve.

Thanks,
jaya

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-16  9:24           ` Jaya Kumar
@ 2009-01-16 11:08             ` Magnus Damm
  2009-01-16 22:14               ` Jaya Kumar
  0 siblings, 1 reply; 18+ messages in thread
From: Magnus Damm @ 2009-01-16 11:08 UTC (permalink / raw)
  To: Jaya Kumar; +Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

On Fri, Jan 16, 2009 at 6:24 PM, Jaya Kumar <jayakumar.lkml@gmail.com> wrote:
> I think I understand your meaning. Okay, I think I might have a
> relevant example. I ran xeyes (which uses shape) on my test setup on
> broadsheetfb (btw, if it is of interest, I've put a demo video clip of
> this here: http://www.youtube.com/watch?v=q_mLKQXcsgY ) and if I
> remember correctly it generated about 10+ damage rectangles so I
> suspect that it must have actually coalesced some of the damage area
> in a lossy way. Another case would be drawing a diagonal line across
> the screen. How many rectangles should that generate to be optimal? If
> the hardware prefers single large transfers, then it would be optimal
> to just do a full screen update. If the hardware exhibits a high
> penalty per pixel transferred than it would be optimal to split the
> transfers in order to reduce the total pixels transferred. So to
> summarize, yes, I agree with you that a lossy conversion to a larger
> area is okay. I'll go further and say that I think userspace apps like
> Xfbdev and vnc must be doing that in order to optimize their pixmaps
> and bitcopies.

Nice clip. =)

Right, user space applications may optimize things for us. Optimizing
to not redraw the same area twice sounds good, but if user space is
expanding the area then we may see a performance hit...

>> We will have correct behavior but performance degradation if the user
>> space program asks to update a small rectangle in the middle of the
>> screen but the driver or some layer in between decides to update say
>> the entire screen instead. Do you agree with me?
>
> I agree with you. I think that's the situation that we want to avoid
> happening. I think we can avoid that by providing upper layers
> (userspace) with sufficient information (but kept as generic as
> possible) about the capabilities of the underlying layers in order for
> userspace and the kernel to optimize its behavior.

Sure, good plan.

>>>> I'm a big fan of simple things like bitmaps. I wonder if it's a good
>>>> idea to divide the entire frame buffer into equally sized X*Y tiles
>>>> and have a bitmap of dirty bits. A "1" in the bitmap means tile is
>>>> dirty and needs update and a "0" means no need to update. The best
>>>> tile size is application specific. The size of the bitmap varies of
>>>> course with the tile size.
>>>>
>>>> For a 1024x768 display using 32x32 tiles we need 24 32-bit words.
>>>> That's pretty small and simple, no?
>>
>> Just trying to pitch my idea a bit harder: The above example would
>> need a 96 bytes bitmap which will fit in just a few cache lines. This
>> arrangement of the data gives you good performance compared to
>> multiple allocations scattered all over the place.
>
> I didn't follow the implication that there has to be multiple
> allocations. If we are comparing the bitmap versus rects approach,
> then my comparison would be:
> a) where the driver preallocated a bitmap that would be updated by a
> copy from userspace (same allocation would be done in userspace)
> b) where the driver preallocated a fixed number of rectangles which
> would be updated by a copy from userspace (same allocation would be
> done in userspace)

Sorry for not being clear enough. I meant to compare collecting
kmalloced rectangles on the damagelist vs marking dirty tiles in a
static bitmap. The interface to user space remains unchanged, just the
dirty area backing store is different.

>> Also, using a bitmap makes it at least half-easy to do a lossy OR
>> operation of all damage rectangles. Who is taking care of overlapping
>> updates otherwise - some user space library?
>
> I may not have fully understood above. I'm not sure that overlapping
> updates must be avoided for all devices. Some devices would fail if
> overlapping DMAs are done, but others would have no issues there. So
> we would benefit from exposing that information to userspace so that
> it could ensure overlaps are resolved if the underlying hardware
> requires (or benefits from) it.

I'm not sure if overlapping updates will cause any problems, I merely
thought of it as a performance optimization. If you draw the same
circle 10 times in one update we want to make sure the screen only is
updated once. User space may solve that for us already though, but I
don't think so since the deferred io is a driver property. Or have I
misunderstood?

> From our discussion so far, I've realized that we would benefit from
> providing 3 things to userspace:
> a) can_overlap flag
> b) alignment constraint
> c) max rectangle count

I'm more for letting user space select whatever max rectangle count it
wants and let the kernel code go through all rectangles and do an OR
operation on some dirty backing store data area. That way user space
can be flexible and we make sure we don't update the same area more
than once.

>> I'd say we would benefit from managing the OR operation within the
>> kernel since deferred io may collect a lot of overlapping areas over
>
> I think there's an assumption there. I think you've associated
> deferred IO with this damage API. Although the two can be related,
> they don't have to be. I agree that it will very likely be deferred IO
> drivers that are likely to benefit the most from this API but they can
> also be completely separate.

Any examples of non deferred io use cases? =)

>> time. Actually, we sort of do that already by touching the pages in
>> the deferred io mmap handling code. If we won't do any OR operation
>
> Some questions here. Help me understand the "touching the pages in the
> mmap handling code" part. I do not do that in deferred IO. fb_defio
> does not write a page on its own, only userspace writes a page and
> then this gets mkcleaned by defio when the client driver is done. Is
> that your meaning, ie: we clean the pages?

I meant how fb_deferred_io_mkwrite() + page_mkclean() work. First time
a page is touched it is put on the list, second time and after nothing
happens until the delayed work happens when we clean the page and
start over waiting for a touch again.

Looks like an OR operation to me. =) Instead of putting the page on a
list we may mark a tile dirty in a bitmap instead. The bitmap code
scales O(1) which is pretty nice. I think we could avoid the list
looping in fb_deferred_io_mkwrite() with a bitmap which would make the
code scale much better. I'm not 100% sure though.

>> within the kernel for deferred io, then how are we supposed to handle
>> long deferred io delays? Just keep on kmallocing rectangles? Or
>> expanding the rectangles?
>
> That's a good question. Here's my thoughts. Lets say we have a display
> device with 10s latency (a scenario that exists in real life). As you
> correctly pointed out, it would be bad if that driver kept aggregating
> rectangles, as that would consume a significant amount of resources.
> In that scenario, I recommend that the driver should convert the list
> of rectangles into a bitmap. It is direct to convert from a rectangle
> list to a bitmap as it is a linear mathematical operation. It can then
> OR that with its existing bitmap.

So why not doing that directly instead of keeping your pages / dirty
rectangles on a list? =)

> I believe it is a more complex operation to convert from a bitmap to a
> rectangle list or DMA transfer sequence. I'm trying to sketch the
> function that would coalesce a bitmap of written pages into a sequence
> of dma transfers. It requires heuristics and policy in order to
> coalesce optimally. It would be similar to a Karnaugh map minimization
> problem. I think that kind of operation would be a better fit to do in
> userspace. That would fit the needs of a userspace framebuffer client
> that kept its damage list as a bitmap. (Note, I'm not aware of any
> examples of the latter yet.)

I agree that this is the tricky part, but I'm not sure if it is so
complex that it has to be done in user space. Remember my patch
related to fillrect/copyarea/imageblit and deferred io? They would
benefit from filling in the dirty bitmap as well - but not in user
space. =)

I'm not sure about the best way to convert the bitmap to a sequence of
DMA requests. I propose transferring tile by tile and letting displays
with low bandwidth use a small tile size. Displays with high bandwidth
and high setup cost can use larger tile size.

>> Or maybe we are discussing apples and oranges? Is your damage API is
>
> I think we are thinking about the same problems and have different
> approaches for the solution. That is a good thing. It makes us think
> harder about the API selection and I think we all benefit. I'm open to
> the ideas you've raised and they are having an impact on the code I am
> writing.

I think so too.

>> meant to force a screen update so there is no need for in-kernel OR
>
> No, the damage API is not meant to force the driver to update the
> screen. The driver can decide what to do and when.
>
>> operation? We have a need for in-kernel OR operation with deferred io
>> already I think, so there is some overlap in my opinion.
>
> I'm not sure I've understood your full meaning when you say "in-kernel
> OR operation". Could you elaborate on that?

The fillrect/copyarea/imageblit may want to hook into the dirty area bitmap.

>> I'd say that a combination of rectangle based user space damage API
>> _and_ (maybe tile based) in-kernel dirty area OR operation is the best
>> approach. This because XDamage is rectangle based and the deferred io
>> delay (ie amount of time to collect dirty areas) is a kernel driver
>> property.it
>
> I understand your point. I propose this: A driver that prefers a
> bitmap can provide a flag in fb_info. Our in-kernel API can then use
> that to decide whether to pass the rectangle list or to  generate the
> bitmap from the rectangle list and then pass that to the driver. I'm
> happy to implement that as I think it is a reasonable idea and
> straightforward to achieve.

That's ok, but I'm fine with just a rect user space interface. The
kernel fbdev interface is fine too, but I think it would be
interesting to work on handling the dirty information inside the
kernel more efficiently.

Thanks for your comments. Have a good weekend!

/ magnus

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-16 11:08             ` Magnus Damm
@ 2009-01-16 22:14               ` Jaya Kumar
  2009-01-19  4:44                 ` Magnus Damm
  2009-01-19 12:59                 ` Tomi Valkeinen
  0 siblings, 2 replies; 18+ messages in thread
From: Jaya Kumar @ 2009-01-16 22:14 UTC (permalink / raw)
  To: Magnus Damm
  Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

On Fri, Jan 16, 2009 at 7:08 PM, Magnus Damm <magnus.damm@gmail.com> wrote:
> On Fri, Jan 16, 2009 at 6:24 PM, Jaya Kumar <jayakumar.lkml@gmail.com> wrote:
>> I think I understand your meaning. Okay, I think I might have a
>> relevant example. I ran xeyes (which uses shape) on my test setup on
>> broadsheetfb (btw, if it is of interest, I've put a demo video clip of
>> this here: http://www.youtube.com/watch?v=q_mLKQXcsgY ) and if I
>> remember correctly it generated about 10+ damage rectangles so I
>> suspect that it must have actually coalesced some of the damage area
>> in a lossy way. Another case would be drawing a diagonal line across
>> the screen. How many rectangles should that generate to be optimal? If
>> the hardware prefers single large transfers, then it would be optimal
>> to just do a full screen update. If the hardware exhibits a high
>> penalty per pixel transferred than it would be optimal to split the
>> transfers in order to reduce the total pixels transferred. So to
>> summarize, yes, I agree with you that a lossy conversion to a larger
>> area is okay. I'll go further and say that I think userspace apps like
>> Xfbdev and vnc must be doing that in order to optimize their pixmaps
>> and bitcopies.
>
> Nice clip. =)
>
> Right, user space applications may optimize things for us. Optimizing
> to not redraw the same area twice sounds good, but if user space is
> expanding the area then we may see a performance hit...

In general, I agree. I would expect userspace to ensure that it
doesn't give us duplicate regions, subset regions, or overlapped
regions (as you raised before). If they do, I see that as a problem
similar to filesystems where an application misbehaves by doing
seek/write the same thing repeatedly. Now, you mentioned if userspace
expands the area, we may see a performance hit. Yes, I think I agree.
To be more elaborate about this, I would raise the issue of drawing a
diagonal line across the entire screen. Userspace has a decision to
make whether it sends us one big rectangle to represent the whole
screen or whether it breaks that up into multiple rectangular blocks.
In real life, I think this one is non-optimally but simply handled by
saying hardware supports max 10 rectangles at at time, so just break
up the diagonal write to 10 rectangles.

>
>>> We will have correct behavior but performance degradation if the user
>>> space program asks to update a small rectangle in the middle of the
>>> screen but the driver or some layer in between decides to update say
>>> the entire screen instead. Do you agree with me?
>>
>> I agree with you. I think that's the situation that we want to avoid
>> happening. I think we can avoid that by providing upper layers
>> (userspace) with sufficient information (but kept as generic as
>> possible) about the capabilities of the underlying layers in order for
>> userspace and the kernel to optimize its behavior.
>
> Sure, good plan.
>
>>>>> I'm a big fan of simple things like bitmaps. I wonder if it's a good
>>>>> idea to divide the entire frame buffer into equally sized X*Y tiles
>>>>> and have a bitmap of dirty bits. A "1" in the bitmap means tile is
>>>>> dirty and needs update and a "0" means no need to update. The best
>>>>> tile size is application specific. The size of the bitmap varies of
>>>>> course with the tile size.
>>>>>
>>>>> For a 1024x768 display using 32x32 tiles we need 24 32-bit words.
>>>>> That's pretty small and simple, no?
>>>
>>> Just trying to pitch my idea a bit harder: The above example would
>>> need a 96 bytes bitmap which will fit in just a few cache lines. This
>>> arrangement of the data gives you good performance compared to
>>> multiple allocations scattered all over the place.
>>
>> I didn't follow the implication that there has to be multiple
>> allocations. If we are comparing the bitmap versus rects approach,
>> then my comparison would be:
>> a) where the driver preallocated a bitmap that would be updated by a
>> copy from userspace (same allocation would be done in userspace)
>> b) where the driver preallocated a fixed number of rectangles which
>> would be updated by a copy from userspace (same allocation would be
>> done in userspace)
>
> Sorry for not being clear enough. I meant to compare collecting
> kmalloced rectangles on the damagelist vs marking dirty tiles in a
> static bitmap. The interface to user space remains unchanged, just the
> dirty area backing store is different.

Okay, understood. No disagreement here. If driver aggregates rects via
kmalloc and has the discussed characteristics, its resource
utilization versus bitmap approach will be poor.

>
>>> Also, using a bitmap makes it at least half-easy to do a lossy OR
>>> operation of all damage rectangles. Who is taking care of overlapping
>>> updates otherwise - some user space library?
>>
>> I may not have fully understood above. I'm not sure that overlapping
>> updates must be avoided for all devices. Some devices would fail if
>> overlapping DMAs are done, but others would have no issues there. So
>> we would benefit from exposing that information to userspace so that
>> it could ensure overlaps are resolved if the underlying hardware
>> requires (or benefits from) it.
>
> I'm not sure if overlapping updates will cause any problems, I merely
> thought of it as a performance optimization. If you draw the same
> circle 10 times in one update we want to make sure the screen only is
> updated once. User space may solve that for us already though, but I
> don't think so since the deferred io is a driver property. Or have I
> misunderstood?

I now see your point about overlaps. You are right that userspace does
not necessarily solve the problem for us. If they give us duplicate
rects or subset rects or overlapping rects, then these are all
immediately negative for performance. Further, if we are aggregating
rects and duplicates/subset/overlaps occur due to the aggregation,
then this is also negative for performance. I think we'll need to add
basic support functions to do checks and corrections for these
scenarios.

About the deferred IO part, okay, let me come back to that below.

>
>> From our discussion so far, I've realized that we would benefit from
>> providing 3 things to userspace:
>> a) can_overlap flag
>> b) alignment constraint
>> c) max rectangle count
>
> I'm more for letting user space select whatever max rectangle count it
> wants and let the kernel code go through all rectangles and do an OR
> operation on some dirty backing store data area. That way user space
> can be flexible and we make sure we don't update the same area more
> than once.

Okay, lets discuss that a bit more. I mean that the driver reports
back to userspace via GETDAMAGE a value for its preferred rectangle
count (call that max rectangle count). Userspace may choose to ignore
the max (it may not even if picked up that data via GETDAMAGE) and
send 100 rects. The driver can choose whether to -EINVAL or it can
choose to go through the rects and perform optimization based on its
preferred structure as you suggested.

>
>>> I'd say we would benefit from managing the OR operation within the
>>> kernel since deferred io may collect a lot of overlapping areas over
>>
>> I think there's an assumption there. I think you've associated
>> deferred IO with this damage API. Although the two can be related,
>> they don't have to be. I agree that it will very likely be deferred IO
>> drivers that are likely to benefit the most from this API but they can
>> also be completely separate.
>
> Any examples of non deferred io use cases? =)

Yes, I'm glad you asked. The first one that came to mind is the NO-MMU
case. As you know, defio is MMU only today and I have no hopes of
removing that. I had damage in mind especially for these NO-MMU cases
(btw, if any vendor of such devices/cpus/boards is reading, please
drop me a mail, i would like to help support this ).

Okay, so the above was the easy answer. There are also others I have
in mind but it is debatable whether they should use damage API or
whether they should use deferred IO. I would like to discuss the range
of scenarios here:

a) Tomi raised omapfb at the start of this thread. He or she mentioned:
OMAPFB_UPDATE_WINDOW
I looked thru the code and saw:

+static int omapfb_update_window(struct fb_info *fbi,
+               u32 x, u32 y, u32 w, u32 h)

[ btw, interesting to see use of u32 above, why not just u16? ]

I noticed dsi_update_screen_dispc. After reading this code, I formed
the following conclusion:
- this is to support the use of externally buffered displays. that is,
there is an external sdram being handled by a separate controller,
probably a MIPI-DSI controller
- basically omapfb wants to know exactly what and when stuff is
written from userspace because it has to push that manually through
the MIPI-DSI interface

That driver currently uses a private ioctl to achieve that through the
transfer of a single rectangle from userspace. It could, I believe,
achieve the same effect using deferred IO since it has an MMU but lets
leave that to one side for now. This kind of driver would be able to
use the damage API with little change. They would add a GETDAMAGE
handler that reports back their max rectangles (1) and then a
PUTDAMAGE handler that does what they already do today.

b) non-snooping LCDCs with external ram
I have seen SoCs where the LCD controller is not aware of memory
writes on the host memory bus. As a result, it doesn't actually know
when the framebuffer has been modified and it most cases it can't
benefit from that anyway due to buffering constraints. It just
repetitively DMAs from host memory to its input fifo (line buffer)
that then gets palettized/dithered/etc before hitting the display
output buffer which backs the output pins. I believe pxafb is an
example of this, you'll notice it has code to setup dma period
according to the pixel clock.

Now, if it talks directly to a standard LCD, then there's no benefit
it can gain from damage or deferred IO as it always has to perform
that DMA anyway. But in some scenarios, it is interfaced to an
external controller that has its own sdram (so that the host cpu can
be completely suspended and still have a display showing content ) in
which scenario it would benefit from being able to choose between:
i) reduce or tune its dma rate
ii) issue a more specific dma update
iii) issue dma-s only when needed
This could be achieved using either damage or defio with tradeoffs
between either approach.

>
>>> time. Actually, we sort of do that already by touching the pages in
>>> the deferred io mmap handling code. If we won't do any OR operation
>>
>> Some questions here. Help me understand the "touching the pages in the
>> mmap handling code" part. I do not do that in deferred IO. fb_defio
>> does not write a page on its own, only userspace writes a page and
>> then this gets mkcleaned by defio when the client driver is done. Is
>> that your meaning, ie: we clean the pages?
>
> I meant how fb_deferred_io_mkwrite() + page_mkclean() work. First time
> a page is touched it is put on the list, second time and after nothing
> happens until the delayed work happens when we clean the page and
> start over waiting for a touch again.
>
> Looks like an OR operation to me. =) Instead of putting the page on a
> list we may mark a tile dirty in a bitmap instead. The bitmap code
> scales O(1) which is pretty nice. I think we could avoid the list

Agreed.

> looping in fb_deferred_io_mkwrite() with a bitmap which would make the
> code scale much better. I'm not 100% sure though.

This was raised here too by Tony:
http://marc.info/?l=linux-fbdev-devel&m=117230487100960&w=2
Eventually, I had written a patch to do bitmap instead of pagelist but
never finished changing the drivers. I think it definitely performed
better in the defio case on hecubafb and metronomefb. I would claim
that intuitively it:
- completely removes list overhead and loop overhead on the defio side
- drivers still have to loop through the bitmap of course but it is
faster compared to the list iteration

performance naturally varies based on typical application behavior. If
they're updating lots of pages, then bitmap wins, if they're updating
a few, then not such a big win. But I agree it is a better approach. I
will be happy to resurrect this patch and switch defio to a bitmap as
you suggest. But I will need assistance converting all the drivers. I
can do hecubafb, metronomefb and broadsheetfb but am a bit wary of
updating the others.

>
>>> within the kernel for deferred io, then how are we supposed to handle
>>> long deferred io delays? Just keep on kmallocing rectangles? Or
>>> expanding the rectangles?
>>
>> That's a good question. Here's my thoughts. Lets say we have a display
>> device with 10s latency (a scenario that exists in real life). As you
>> correctly pointed out, it would be bad if that driver kept aggregating
>> rectangles, as that would consume a significant amount of resources.
>> In that scenario, I recommend that the driver should convert the list
>> of rectangles into a bitmap. It is direct to convert from a rectangle
>> list to a bitmap as it is a linear mathematical operation. It can then
>> OR that with its existing bitmap.
>
> So why not doing that directly instead of keeping your pages / dirty
> rectangles on a list? =)

Okay, that's a fair question. In the above case, I would adjust my
previous answer a bit. The driver could use a bitmap to detect
overlaps/subsets and then handle them suitably but retain a fixed
pre-allocated rect list so that it can schedule its dma (or other
mechanism) transfers normally. You are right that it could instead
only keep the bitmap and then generate the dma transfer list from the
bitmap but I worry about the complexity and ability to get good
results there.

>
>> I believe it is a more complex operation to convert from a bitmap to a
>> rectangle list or DMA transfer sequence. I'm trying to sketch the
>> function that would coalesce a bitmap of written pages into a sequence
>> of dma transfers. It requires heuristics and policy in order to
>> coalesce optimally. It would be similar to a Karnaugh map minimization
>> problem. I think that kind of operation would be a better fit to do in
>> userspace. That would fit the needs of a userspace framebuffer client
>> that kept its damage list as a bitmap. (Note, I'm not aware of any
>> examples of the latter yet.)
>
> I agree that this is the tricky part, but I'm not sure if it is so
> complex that it has to be done in user space. Remember my patch
> related to fillrect/copyarea/imageblit and deferred io? They would
> benefit from filling in the dirty bitmap as well - but not in user
> space. =)

Good point. Okay, will think about this some more.

>
> I'm not sure about the best way to convert the bitmap to a sequence of
> DMA requests. I propose transferring tile by tile and letting displays
> with low bandwidth use a small tile size. Displays with high bandwidth
> and high setup cost can use larger tile size.

I'm also not sure. I guess we should look at the use cases and see
what's desirable.

>
>>> Or maybe we are discussing apples and oranges? Is your damage API is
>>
>> I think we are thinking about the same problems and have different
>> approaches for the solution. That is a good thing. It makes us think
>> harder about the API selection and I think we all benefit. I'm open to
>> the ideas you've raised and they are having an impact on the code I am
>> writing.
>
> I think so too.
>
>>> meant to force a screen update so there is no need for in-kernel OR
>>
>> No, the damage API is not meant to force the driver to update the
>> screen. The driver can decide what to do and when.
>>
>>> operation? We have a need for in-kernel OR operation with deferred io
>>> already I think, so there is some overlap in my opinion.
>>
>> I'm not sure I've understood your full meaning when you say "in-kernel
>> OR operation". Could you elaborate on that?
>
> The fillrect/copyarea/imageblit may want to hook into the dirty area bitmap.

Yup, I agree that can be beneficial.

>
>>> I'd say that a combination of rectangle based user space damage API
>>> _and_ (maybe tile based) in-kernel dirty area OR operation is the best
>>> approach. This because XDamage is rectangle based and the deferred io
>>> delay (ie amount of time to collect dirty areas) is a kernel driver
>>> property.it
>>
>> I understand your point. I propose this: A driver that prefers a
>> bitmap can provide a flag in fb_info. Our in-kernel API can then use
>> that to decide whether to pass the rectangle list or to  generate the
>> bitmap from the rectangle list and then pass that to the driver. I'm
>> happy to implement that as I think it is a reasonable idea and
>> straightforward to achieve.
>
> That's ok, but I'm fine with just a rect user space interface. The
> kernel fbdev interface is fine too, but I think it would be
> interesting to work on handling the dirty information inside the
> kernel more efficiently.

Ok, understood. Will think about this some more.

>
> Thanks for your comments. Have a good weekend!
>

Thanks, u2,
jaya

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-16 22:14               ` Jaya Kumar
@ 2009-01-19  4:44                 ` Magnus Damm
  2009-01-19 15:15                   ` Jaya Kumar
  2009-01-19 12:59                 ` Tomi Valkeinen
  1 sibling, 1 reply; 18+ messages in thread
From: Magnus Damm @ 2009-01-19  4:44 UTC (permalink / raw)
  To: Jaya Kumar; +Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

On Sat, Jan 17, 2009 at 7:14 AM, Jaya Kumar <jayakumar.lkml@gmail.com> wrote:
> On Fri, Jan 16, 2009 at 7:08 PM, Magnus Damm <magnus.damm@gmail.com> wrote:
>> Right, user space applications may optimize things for us. Optimizing
>> to not redraw the same area twice sounds good, but if user space is
>> expanding the area then we may see a performance hit...
>
> In general, I agree. I would expect userspace to ensure that it
> doesn't give us duplicate regions, subset regions, or overlapped
> regions (as you raised before). If they do, I see that as a problem
> similar to filesystems where an application misbehaves by doing
> seek/write the same thing repeatedly. Now, you mentioned if userspace
> expands the area, we may see a performance hit. Yes, I think I agree.
> To be more elaborate about this, I would raise the issue of drawing a
> diagonal line across the entire screen. Userspace has a decision to
> make whether it sends us one big rectangle to represent the whole
> screen or whether it breaks that up into multiple rectangular blocks.
> In real life, I think this one is non-optimally but simply handled by
> saying hardware supports max 10 rectangles at at time, so just break
> up the diagonal write to 10 rectangles.

So say that use space plays nice and breaks it up into 10 rectangles.
That sounds easy for the generic case, but if this framebuffer is
using deferred io, then how are dirty pages handled? All of a sudden
you may have 11 rectangles.

Also sorry for being a bit slow, but I don't understand how the damage
call works together with deferrec io and fsync(). Today fsync flushes
dirty pages to the display. With damage, both dirty pages _and_ damage
rectangles are flushed? Or does the damage information replace the
dirty pages?

As for the diagonal line, i like your example. Applying this to the
dirty tile bitmap, having one bit per pixel would be the most accurate
representation, but larger tile size is most likely more efficient. =)

Your damage interface is exporting the maximum rectangle count to user
space and letting it do it's best to work efficiently with the
hardware. I think that sounds straight forward and simple. But is it
enough information?

What if we would let user space describe the dirty data as accurate as
possible instead? Then let the kernel take this information (and
information from other sources) and feed that to the graphics hardware
somehow. Exactly how is a bit tricky - maybe too difficult. I'm not
sure.

I guess the main question is how the user space interface should look
like. Should it export hardware capabilities?

>> I'm not sure if overlapping updates will cause any problems, I merely
>> thought of it as a performance optimization. If you draw the same
>> circle 10 times in one update we want to make sure the screen only is
>> updated once. User space may solve that for us already though, but I
>> don't think so since the deferred io is a driver property. Or have I
>> misunderstood?
>
> I now see your point about overlaps. You are right that userspace does
> not necessarily solve the problem for us. If they give us duplicate
> rects or subset rects or overlapping rects, then these are all
> immediately negative for performance. Further, if we are aggregating
> rects and duplicates/subset/overlaps occur due to the aggregation,
> then this is also negative for performance. I think we'll need to add
> basic support functions to do checks and corrections for these
> scenarios.

We could check and correct, or we could aggregate all rects from
different sources.

> About the deferred IO part, okay, let me come back to that below.

Yeah, this is the tricky part in my opinion. =)

>>
>>> From our discussion so far, I've realized that we would benefit from
>>> providing 3 things to userspace:
>>> a) can_overlap flag
>>> b) alignment constraint
>>> c) max rectangle count
>>
>> I'm more for letting user space select whatever max rectangle count it
>> wants and let the kernel code go through all rectangles and do an OR
>> operation on some dirty backing store data area. That way user space
>> can be flexible and we make sure we don't update the same area more
>> than once.
>
> Okay, lets discuss that a bit more. I mean that the driver reports
> back to userspace via GETDAMAGE a value for its preferred rectangle
> count (call that max rectangle count). Userspace may choose to ignore
> the max (it may not even if picked up that data via GETDAMAGE) and
> send 100 rects. The driver can choose whether to -EINVAL or it can
> choose to go through the rects and perform optimization based on its
> preferred structure as you suggested.

I understand. But how about hardware that only supports a single
rectangle within one DMA operation? I have some here in front of me.
=)

So the user space code can get 1 as rectangle count, but does that
really mean that we want user space to redraw everything if a diagonal
line is drawn across the screen? It may be better to break it up into
two separate DMA operations instead of one single one. And how do we
tell user space about that? By using 2 as rectangle count? =)

Doesn't all this just boil down to max number of rectangles,
throughput and setup cost for a dma transaction?

>>> I think there's an assumption there. I think you've associated
>>> deferred IO with this damage API. Although the two can be related,
>>> they don't have to be. I agree that it will very likely be deferred IO
>>> drivers that are likely to benefit the most from this API but they can
>>> also be completely separate.
>>
>> Any examples of non deferred io use cases? =)
>
> Yes, I'm glad you asked. The first one that came to mind is the NO-MMU
> case. As you know, defio is MMU only today and I have no hopes of
> removing that. I had damage in mind especially for these NO-MMU cases
> (btw, if any vendor of such devices/cpus/boards is reading, please
> drop me a mail, i would like to help support this ).

Yeah, I may actually have such a SuperH dev board in the office. I
think one of our SH2A boards comes with a display.

> Okay, so the above was the easy answer. There are also others I have
> in mind but it is debatable whether they should use damage API or
> whether they should use deferred IO. I would like to discuss the range
> of scenarios here:
>
> a) Tomi raised omapfb at the start of this thread. He or she mentioned:
> OMAPFB_UPDATE_WINDOW
> I looked thru the code and saw:
>
> +static int omapfb_update_window(struct fb_info *fbi,
> +               u32 x, u32 y, u32 w, u32 h)
>
> [ btw, interesting to see use of u32 above, why not just u16? ]
>
> I noticed dsi_update_screen_dispc. After reading this code, I formed
> the following conclusion:
> - this is to support the use of externally buffered displays. that is,
> there is an external sdram being handled by a separate controller,
> probably a MIPI-DSI controller
> - basically omapfb wants to know exactly what and when stuff is
> written from userspace because it has to push that manually through
> the MIPI-DSI interface
>
> That driver currently uses a private ioctl to achieve that through the
> transfer of a single rectangle from userspace. It could, I believe,
> achieve the same effect using deferred IO since it has an MMU but lets
> leave that to one side for now. This kind of driver would be able to
> use the damage API with little change. They would add a GETDAMAGE
> handler that reports back their max rectangles (1) and then a
> PUTDAMAGE handler that does what they already do today.

I understand and agree. I guess the reason for not using deferred io
is that we don't really get any good rectangles out of deferred io
today since one page covers multiple lines. This is the reason why I
think it's good to also have per-tile dirty bits instead of just
relying on the page bits to store dirty damage data.

> b) non-snooping LCDCs with external ram
> I have seen SoCs where the LCD controller is not aware of memory
> writes on the host memory bus. As a result, it doesn't actually know
> when the framebuffer has been modified and it most cases it can't
> benefit from that anyway due to buffering constraints. It just
> repetitively DMAs from host memory to its input fifo (line buffer)
> that then gets palettized/dithered/etc before hitting the display
> output buffer which backs the output pins. I believe pxafb is an
> example of this, you'll notice it has code to setup dma period
> according to the pixel clock.
>
> Now, if it talks directly to a standard LCD, then there's no benefit
> it can gain from damage or deferred IO as it always has to perform
> that DMA anyway. But in some scenarios, it is interfaced to an
> external controller that has its own sdram (so that the host cpu can
> be completely suspended and still have a display showing content ) in
> which scenario it would benefit from being able to choose between:
> i) reduce or tune its dma rate
> ii) issue a more specific dma update
> iii) issue dma-s only when needed
> This could be achieved using either damage or defio with tradeoffs
> between either approach.

This is exactly why I implemented deferred io for the SuperH LCDC
hardware in SYS mode. It's partially implemented now though - we feed
full frame data to the external controller only when needed. Before we
fed full frames regardless of if the screen had been modified or not.
Future work includes partial screen update, but it may be difficult to
implement that and still have flicker free video playback...

There is also vidix code in mplayer (sh_veu vidix driver) that does
dma straight to the framebuffer. It bypasses the deferred io handling
and it needs to do fsync after updating each frame to make sure the
screen gets updated. Using the damage api instead would be better if
only part of the screen is modified.

>> So why not doing that directly instead of keeping your pages / dirty
>> rectangles on a list? =)
>
> Okay, that's a fair question. In the above case, I would adjust my
> previous answer a bit. The driver could use a bitmap to detect
> overlaps/subsets and then handle them suitably but retain a fixed
> pre-allocated rect list so that it can schedule its dma (or other
> mechanism) transfers normally. You are right that it could instead
> only keep the bitmap and then generate the dma transfer list from the
> bitmap but I worry about the complexity and ability to get good
> results there.

Yeah, I understand. I'm not sure which is the best solution when it
comes to this. Exporting maximum rectangle count to user space seems
easy, but I wonder if it is enough information to let user space make
intelligent decisions.

Cheers,

/ magnus

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-19  4:44                 ` Magnus Damm
@ 2009-01-19 15:15                   ` Jaya Kumar
  2009-01-20  4:17                     ` Magnus Damm
  0 siblings, 1 reply; 18+ messages in thread
From: Jaya Kumar @ 2009-01-19 15:15 UTC (permalink / raw)
  To: Magnus Damm
  Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

On Mon, Jan 19, 2009 at 12:44 PM, Magnus Damm <magnus.damm@gmail.com> wrote:
>

Wow, I think there's a lot to discuss and its late so I'll focus on
one question for now.

>
> I guess the main question is how the user space interface should look
> like. Should it export hardware capabilities?
>

I agree that above is the key question for now. What is the best way
for userspace to expose this information to the kernel? Two approaches
have been proposed which are that userspace does:

a) provide a tile based bitmap with bits set for modified tiles.
driver will provide information about expected tile size.
b) provide an array of rectangles. driver will provide information
about preferred rectangle count, preferred alignment. i took out
overlap (since i think it is always preferable to not have any
overlapping rectangles)

Okay, since I have been backing approach b up till now, I will try to
switch positions and defend a. The main benefit I see of point a is
that it is always a fixed amount of memory to represent the updated
pages. The other is that it would be fairly easy for this to hook into
the deferred IO pagemap/tilemap approach. Are there other benefits?
What are the weaknesses? Okay, I need to reread your mails and will
try to summarize this tomorrow. Then I will do same for approach b.

Thanks,
jaya

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-19 15:15                   ` Jaya Kumar
@ 2009-01-20  4:17                     ` Magnus Damm
  2009-01-20  4:21                       ` Mikhail Gusarov
  0 siblings, 1 reply; 18+ messages in thread
From: Magnus Damm @ 2009-01-20  4:17 UTC (permalink / raw)
  To: Jaya Kumar; +Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

Hi Jaya!

On Tue, Jan 20, 2009 at 12:15 AM, Jaya Kumar <jayakumar.lkml@gmail.com> wrote:
> On Mon, Jan 19, 2009 at 12:44 PM, Magnus Damm <magnus.damm@gmail.com> wrote:
>>
>
> Wow, I think there's a lot to discuss and its late so I'll focus on
> one question for now.

Yeah. =)

>> I guess the main question is how the user space interface should look
>> like. Should it export hardware capabilities?
>>
>
> I agree that above is the key question for now. What is the best way
> for userspace to expose this information to the kernel? Two approaches
> have been proposed which are that userspace does:
>
> a) provide a tile based bitmap with bits set for modified tiles.
> driver will provide information about expected tile size.

Just to make sure we are on the same page: We could let user space
provide the bitmap for us - that may be interesting - but I think it's
good enough to keep your array of rectangles as interface. It's clean
and simple. The tile bitmap can be handled internally, so each damage
call with N rectangles gets all the rectangles applied to the tile
bitmap. This over and over until the frame gets updated and the tile
bitmap gets cleared. In this case there is no maximum rectangle count
provided to user space.

> b) provide an array of rectangles. driver will provide information
> about preferred rectangle count, preferred alignment. i took out
> overlap (since i think it is always preferable to not have any
> overlapping rectangles)
>
> Okay, since I have been backing approach b up till now, I will try to
> switch positions and defend a. The main benefit I see of point a is
> that it is always a fixed amount of memory to represent the updated
> pages. The other is that it would be fairly easy for this to hook into
> the deferred IO pagemap/tilemap approach. Are there other benefits?
> What are the weaknesses? Okay, I need to reread your mails and will
> try to summarize this tomorrow. Then I will do same for approach b.

The weakness IMO for a is that we're not clear how to transform the
tile bitmap data into DMA requests. Also, if passing the bitmap from
user space then copying an entire bitmap may be heavy on a big screen
if the tile size is small enough.

Regarding b, I'm not sure if maximum rectangle count is enough
information to allow user space to make smart decisions for a wide
range of hardware. And how this will work together with for instance
deferred io is a bit unclear to me.

Cheers,

/ magnus

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-20  4:17                     ` Magnus Damm
@ 2009-01-20  4:21                       ` Mikhail Gusarov
  2009-01-20  4:34                         ` Magnus Damm
  0 siblings, 1 reply; 18+ messages in thread
From: Mikhail Gusarov @ 2009-01-20  4:21 UTC (permalink / raw)
  To: Magnus Damm
  Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven,
	Jaya Kumar


[-- Attachment #1.1: Type: text/plain, Size: 402 bytes --]


Twas brillig at 13:17:53 20.01.2009 UTC+09 when magnus.damm@gmail.com did gyre and gimble:

 MD> but I think it's good enough to keep your array of rectangles as
 MD> interface. It's clean and simple.

It also matches some hardware's interfaces -- there are e-ink
controllers which can update specified rectangles on screen, so damage
bitmap is not needed for such controllers at all.

-- 

[-- Attachment #1.2: Type: application/pgp-signature, Size: 196 bytes --]

[-- Attachment #2: Type: text/plain, Size: 209 bytes --]

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

[-- Attachment #3: Type: text/plain, Size: 182 bytes --]

_______________________________________________
Linux-fbdev-devel mailing list
Linux-fbdev-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-fbdev-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-20  4:21                       ` Mikhail Gusarov
@ 2009-01-20  4:34                         ` Magnus Damm
  2009-01-20 10:22                           ` Michal Suchanek
  2009-01-22 21:51                           ` Jaya Kumar
  0 siblings, 2 replies; 18+ messages in thread
From: Magnus Damm @ 2009-01-20  4:34 UTC (permalink / raw)
  To: Mikhail Gusarov
  Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven,
	Jaya Kumar

On Tue, Jan 20, 2009 at 1:21 PM, Mikhail Gusarov
<dottedmag@dottedmag.net> wrote:
>
> Twas brillig at 13:17:53 20.01.2009 UTC+09 when magnus.damm@gmail.com did gyre and gimble:
>
>  MD> but I think it's good enough to keep your array of rectangles as
>  MD> interface. It's clean and simple.
>
> It also matches some hardware's interfaces -- there are e-ink
> controllers which can update specified rectangles on screen, so damage
> bitmap is not needed for such controllers at all.

Well, needed or not probably depends on what the aim for all this is. =)

I'm not against rectangles or the damage interface at all, I'm just
wondering how we're supposed to tie it all together. In a frame buffer
driver we receive data from multiple interfaces:

1) read/write
2) fillrect/copyarea/imageblit
3) deferred io mmap pages
4) damage api

I'm wondering if we can tie in 1->4 using bitmaps and generate dma
requests from that. I guess everyone else talks about 4 only. =)

Cheers,

/ magnus

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-20  4:34                         ` Magnus Damm
@ 2009-01-20 10:22                           ` Michal Suchanek
  2009-01-22 21:51                           ` Jaya Kumar
  1 sibling, 0 replies; 18+ messages in thread
From: Michal Suchanek @ 2009-01-20 10:22 UTC (permalink / raw)
  To: Magnus Damm
  Cc: linux-fbdev-devel, adaplas, Jaya Kumar, armbru, lethal,
	Geert Uytterhoeven

2009/1/20 Magnus Damm <magnus.damm@gmail.com>:
> On Tue, Jan 20, 2009 at 1:21 PM, Mikhail Gusarov
> <dottedmag@dottedmag.net> wrote:
>>
>> Twas brillig at 13:17:53 20.01.2009 UTC+09 when magnus.damm@gmail.com did gyre and gimble:
>>
>>  MD> but I think it's good enough to keep your array of rectangles as
>>  MD> interface. It's clean and simple.
>>
>> It also matches some hardware's interfaces -- there are e-ink
>> controllers which can update specified rectangles on screen, so damage
>> bitmap is not needed for such controllers at all.
>
> Well, needed or not probably depends on what the aim for all this is. =)
>
> I'm not against rectangles or the damage interface at all, I'm just
> wondering how we're supposed to tie it all together. In a frame buffer
> driver we receive data from multiple interfaces:
>
> 1) read/write
> 2) fillrect/copyarea/imageblit
> 3) deferred io mmap pages
> 4) damage api
>
> I'm wondering if we can tie in 1->4 using bitmaps and generate dma
> requests from that. I guess everyone else talks about 4 only. =)
>

Perhaps it would not be unreasonable to provide an userspace tool such
as a library or a daemon that keeps track of the framebuffer state and
tells each driver exactly what it is supposed to do, exactly in the
way which is suitable for that particular hardware.

Since this is about creating a new interface which would be used only
on a specialized hardware I do not think introducing an userspace
component would be a problem.

Thanks

Michal

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-20  4:34                         ` Magnus Damm
  2009-01-20 10:22                           ` Michal Suchanek
@ 2009-01-22 21:51                           ` Jaya Kumar
  1 sibling, 0 replies; 18+ messages in thread
From: Jaya Kumar @ 2009-01-22 21:51 UTC (permalink / raw)
  To: Magnus Damm
  Cc: linux-fbdev-devel, adaplas, armbru, lethal, Geert Uytterhoeven

On Tue, Jan 20, 2009 at 12:34 PM, Magnus Damm <magnus.damm@gmail.com> wrote:
> I'm not against rectangles or the damage interface at all, I'm just
> wondering how we're supposed to tie it all together. In a frame buffer

Yup, no problems. I understand your position and agree that the issue
of combining everything cleanly is a question we haven't yet answered.

> driver we receive data from multiple interfaces:
>
> 1) read/write
> 2) fillrect/copyarea/imageblit
> 3) deferred io mmap pages
> 4) damage api
>

Ok, this is an interesting one to discuss. In the broadsheetfb damage
patch, currently write() and defio-mmap are treated together as they
both deliver via the pagemap. [ write() could achieve better
granularity than defio-mmap if we delivered via rects or a tilemap. ]
damage is handled separately where it delivers to an accumulating
rectlist. Then at actual transfer time (induced via defio), I check
whether we a) have damage info or b) just pagemap. If damage is
available then I treat it as the more accurate source and discard all
defio pagemap info.

This works fine for broadsheetfb because I haven't tried or really
needed to support multiple simultaneous fbdev clients. That is, I just
run X and X11 apps. I don't run X and another fbdev client, say fbcon
or say mplayer-fbdev, simultaneously or within a single defio
interval. At any one point in time, I only expect there to be one
source of data so picking the most accurate source and discarding the
others seemed reasonable.

I can see that this assumption may not hold true for the general case.
A basic case would be a user switching from X to mplayer-fbdev and
then back within a single defio interval. In that scenario, we would
receive separate sources of data. I believe it would be possible to
address this issue. I'll try to sketch some code to handle this soon.

> I'm wondering if we can tie in 1->4 using bitmaps and generate dma
> requests from that. I guess everyone else talks about 4 only. =)
>

That's what I'm wondering about too and yes, previously I had only
considered 4, but now I'll think about the bigger picture as you
suggested.

Thanks,
jaya

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 2.6.28 1/2] fbdev: add ability to set damage
  2009-01-16 22:14               ` Jaya Kumar
  2009-01-19  4:44                 ` Magnus Damm
@ 2009-01-19 12:59                 ` Tomi Valkeinen
  1 sibling, 0 replies; 18+ messages in thread
From: Tomi Valkeinen @ 2009-01-19 12:59 UTC (permalink / raw)
  To: ext Jaya Kumar
  Cc: linux-fbdev-devel, adaplas, Magnus Damm, armbru, lethal,
	Geert Uytterhoeven

On Sat, 2009-01-17 at 06:14 +0800, ext Jaya Kumar wrote:
> On Fri, Jan 16, 2009 at 7:08 PM, Magnus Damm <magnus.damm@gmail.com> wrote:
> > Any examples of non deferred io use cases? =)
> 
> Yes, I'm glad you asked. The first one that came to mind is the NO-MMU
> case. As you know, defio is MMU only today and I have no hopes of
> removing that. I had damage in mind especially for these NO-MMU cases
> (btw, if any vendor of such devices/cpus/boards is reading, please
> drop me a mail, i would like to help support this ).
> 
> Okay, so the above was the easy answer. There are also others I have
> in mind but it is debatable whether they should use damage API or
> whether they should use deferred IO. I would like to discuss the range
> of scenarios here:
> 
> a) Tomi raised omapfb at the start of this thread. He or she mentioned:

He =).

> OMAPFB_UPDATE_WINDOW
> I looked thru the code and saw:
> 
> +static int omapfb_update_window(struct fb_info *fbi,
> +               u32 x, u32 y, u32 w, u32 h)
> 
> [ btw, interesting to see use of u32 above, why not just u16? ]
> 
> I noticed dsi_update_screen_dispc. After reading this code, I formed
> the following conclusion:
> - this is to support the use of externally buffered displays. that is,
> there is an external sdram being handled by a separate controller,
> probably a MIPI-DSI controller
> - basically omapfb wants to know exactly what and when stuff is
> written from userspace because it has to push that manually through
> the MIPI-DSI interface
> 
> That driver currently uses a private ioctl to achieve that through the
> transfer of a single rectangle from userspace. It could, I believe,
> achieve the same effect using deferred IO since it has an MMU but lets
> leave that to one side for now. This kind of driver would be able to
> use the damage API with little change. They would add a GETDAMAGE
> handler that reports back their max rectangles (1) and then a
> PUTDAMAGE handler that does what they already do today.

You are obviously reading the new, not yet merged, display subsystem
code that I've been writing. Your analysis is correct.

Both MIPI DSI (in command mode) and MIPI DBI (or RFBI) are "manual
update" displays, so, as you said, there's an external framebuffer with
its own RAM which refreshes the LCD independently, and OMAP pushes the
pixels to the ext FB only when needed.

There's one more ioctl related to this, OMAPFB_SYNC_GFX. This ioctl will
wait until the ongoing update has been done.

The DSI implementation does not queue the updates in any way: if there
is an update ongoing when a new update ioctl is issued, omapfb will
return EBUSY. The old implementation for DBI in the current linux-omap
does queue the updates up to n (~5?) updates, but I didn't want to
implement queuing without knowing that it is really needed. I still
don't know =).

And an extra complexity comes from so called Tearing Elimination, in
which the ext FB informs OMAP when the LCD is drawing particular line,
usually the first or last line on screen. OMAP can then synchronize the
start of an update for this TE signal to prevent visible tearing on
screen. And having TE enabled and queuing the updates it could take
multiple frames until the update is on the screen. Probably not
intended.

We don't use deferred io, that is also something I should take time to
study at some point. And generally, I'm quite at loss which would be the
most efficient way of handling the updates. I guess I should just
implement all the possible options and do benchmarking.

Btw, how does the X damage extension work? I suppose you need some extra
code in X for it to be able to report the changes to the actual driver
below? And in this extra code you can choose how to report the damages
to the driver, either giving them one by one or combining them to a
larger area?

> Thanks, u2,
> jaya

 Tomi

------------------------------------------------------------------------------
This SF.net email is sponsored by:
SourcForge Community
SourceForge wants to tell your story.
http://p.sf.net/sfu/sf-spreadtheword

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2009-01-22 22:22 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-01-15  0:06 [RFC 2.6.28 1/2] fbdev: add ability to set damage Jaya Kumar
2009-01-15  0:06 ` [RFC 2.6.28 2/2] broadsheetfb: add damage handling Jaya Kumar
2009-01-15  9:25 ` [RFC 2.6.28 1/2] fbdev: add ability to set damage Tomi Valkeinen
2009-01-15  9:53   ` Jaya Kumar
2009-01-15 10:29     ` Magnus Damm
2009-01-15 11:08       ` Jaya Kumar
2009-01-16  3:09         ` Magnus Damm
2009-01-16  9:24           ` Jaya Kumar
2009-01-16 11:08             ` Magnus Damm
2009-01-16 22:14               ` Jaya Kumar
2009-01-19  4:44                 ` Magnus Damm
2009-01-19 15:15                   ` Jaya Kumar
2009-01-20  4:17                     ` Magnus Damm
2009-01-20  4:21                       ` Mikhail Gusarov
2009-01-20  4:34                         ` Magnus Damm
2009-01-20 10:22                           ` Michal Suchanek
2009-01-22 21:51                           ` Jaya Kumar
2009-01-19 12:59                 ` Tomi Valkeinen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).