[Qemu-devel] [PATCH] Reducing X communication bandwidth, take 2

* [Qemu-devel] [PATCH] Reducing X communication bandwidth, take 2
@ 2007-03-14  1:52 Julian Seward
  2007-03-14  3:39 ` Anthony Liguori
  2007-03-16 18:40 ` Anthony Liguori
  0 siblings, 2 replies; 11+ messages in thread
From: Julian Seward @ 2007-03-14  1:52 UTC (permalink / raw)
  To: qemu-devel


Here is a somewhat revised version of a patch I first made nearly
three years ago.  The original thread is 

http://lists.gnu.org/archive/html/qemu-devel/2004-07/msg00263.html

The patch makes QEMU's graphics emulation much more usable over remote
X connections, by reducing the amount of data sent to the X server.
This is particularly noticeable for small display updates, most
importantly mouse cursor movements, which become faster and so 
generally make the guest's GUI more pleasant to use.

Compared to the original patch, this one:

- is relative to 0.9.0

- handle screen->format->BytesPerPixel values of both 2 and 4,
  so handles most guest depths - I tested 8, 16, 24bpp.

- has unrolled comparison/copy loops for the depth 2 case.  Most
  of the comparisons are short (<= 64 bytes) so I don't see much
  point in taking the overhead of a call to memcmp/memcpy.

- most importantly, is optional and disabled by default, so that
  default performance is unchanged.  To use it you need the new
  -remote-x11 flag (perhaps -low-bandwidth-x11 would be a better 
  name).

It still uses a shadow frame buffer.  Fabrice mentioned this is not
necessary

 http://lists.gnu.org/archive/html/qemu-devel/2004-07/msg00279.html

but I can't see how to get rid of it and still check for redundant
updates in NxN pixel blocks (N=32 by default).  The point of checking
NxN squares is that mouse pointer pixmaps are square and so the most
common display updates - mouse pointer movements - are often reduced
to transmission of a single 32x32 block using this strategy.

The shadow framebuffer is only allocated when -remote-x11 is present,
so the patch has no effect on default memory use.

I measured the bandwidth saving roughly by resuming a vm snapshot
containing a web browser showing a page with a lot of links.  I moved
the pointer slowly over the links (so they change colour) and scrolled
up and down a bit; about 1/2 minute of activity in total.  I tried to do
the same with and without -remote-x11.  Without -remote-x11, 163Mbyte
was transmitted to the X server; with it, 20.6Mbyte was, about an 8:1
reduction.

J


diff -u -r Orig/qemu-0.9.0/sdl.c qemu-0.9.0/sdl.c

--- Orig/qemu-0.9.0/sdl.c	2007-02-05 23:01:54.000000000 +0000
+++ qemu-0.9.0/sdl.c	2007-03-13 22:16:40.000000000 +0000
@@ -29,6 +29,8 @@
 #include <signal.h>
 #endif
 
+#include <assert.h>
+
 static SDL_Surface *screen;
 static int gui_grab; /* if true, all keyboard/mouse events are grabbed */
 static int last_vm_running;
@@ -44,17 +46,232 @@
 static SDL_Cursor *sdl_cursor_hidden;
 static int absolute_enabled = 0;
 
+/* Mechanism to reduce the total amount of data transmitted to the X
+   server, often quite dramatically.  Keep a shadow copy of video
+   memory in alt_pixels, and when asked to update a rectangle, use
+   the shadow copy to establish areas which are the same, and so do
+   not need updating.
+*/
+
+static void* alt_pixels = NULL;
+
+#define THRESH 32
+
+/* Return 1 if the area [x .. x+w-1, y .. y+w-1] is different from
+   the old version and so needs updating. */
+static int cmpArea_16bit ( int x, int y, int w, int h )
+{
+           int    i, j;
+  unsigned int    sll;
+  unsigned short* p1base = (unsigned short*)screen->pixels;
+  unsigned short* p2base = (unsigned short*)alt_pixels;
+  assert(screen->format->BytesPerPixel == 2);
+  if (w == 0 || h == 0)
+     return 0;
+  assert(w > 0 && h > 0);
+  sll = ((unsigned int)screen->pitch) >> 1;
+  for (j = y; j < y+h; j++) {
+    unsigned short* p1p = & p1base[j * sll + x];
+    unsigned short* p2p = & p2base[j * sll + x];
+    for (i = 0; i < w-5; i += 5) {
+      if (p1p[i+0] != p2p[i+0]) return 1;
+      if (p1p[i+1] != p2p[i+1]) return 1;
+      if (p1p[i+2] != p2p[i+2]) return 1;
+      if (p1p[i+3] != p2p[i+3]) return 1;
+      if (p1p[i+4] != p2p[i+4]) return 1;
+    }
+    for (/*fixup*/; i < w; i++) {
+      if (p1p[i+0] != p2p[i+0]) return 1;
+    }
+  }
+  return 0;
+}
+static void copyArea_16bit ( int x, int y, int w, int h )
+{
+           int    i, j;
+  unsigned int    sll;
+  unsigned short* p1base = (unsigned short*)screen->pixels;
+  unsigned short* p2base = (unsigned short*)alt_pixels;
+  assert(screen->format->BytesPerPixel == 2);
+  sll = ((unsigned int)screen->pitch) >> 1;
+  if (w == 0 || h == 0)
+     return;
+  assert(w > 0 && h > 0);
+  for (j = y; j < y+h; j++) {
+    unsigned short* p1p = & p1base[j * sll + x];
+    unsigned short* p2p = & p2base[j * sll + x];
+    for (i = 0; i < w-5; i += 5) {
+      p2p[i+0] = p1p[i+0];
+      p2p[i+1] = p1p[i+1];
+      p2p[i+2] = p1p[i+2];
+      p2p[i+3] = p1p[i+3];
+      p2p[i+4] = p1p[i+4];
+    }
+    for (/*fixup*/; i < w; i++) {
+      p2p[i+0] = p1p[i+0];
+    }
+  }
+}
+
+static int cmpArea_32bit ( int x, int y, int w, int h )
+{
+  int           i, j;
+  unsigned int  sll;
+  unsigned int* p1base = (unsigned int*)screen->pixels;
+  unsigned int* p2base = (unsigned int*)alt_pixels;
+  assert(screen->format->BytesPerPixel == 4);
+  sll = ((unsigned int)screen->pitch) >> 2;
+  for (j = y; j < y+h; j++) {
+    for (i = x; i < x+w; i++) {
+      if (p1base [j * sll +i] != p2base [j * sll +i])
+	return 1;
+    }
+  }
+  return 0;
+}
+static void copyArea_32bit ( int x, int y, int w, int h )
+{
+  int           i, j;
+  unsigned int  sll;
+  unsigned int* p1base = (unsigned int*)screen->pixels;
+  unsigned int* p2base = (unsigned int*)alt_pixels;
+  assert(screen->format->BytesPerPixel == 4);
+  sll = ((unsigned int)screen->pitch) >> 2;
+  for (j = y; j < y+h; j++) {
+    for (i = x; i < x+w; i++) {
+      p2base [j * sll +i] = p1base [j * sll +i];
+    }
+  }
+}
+
+
+static void econoUpdate (DisplayState *ds, int x, int y, 
+                         unsigned int w, unsigned int h)
+{
+   static int tested_total = 0;
+   static int update_total = 0;
+
+   int  (*cmpArea) (int, int, int, int) = NULL;
+   void (*copyArea)(int, int, int, int) = NULL;
+
+   int xi, xj, xlim, yi, yj, ylim, ntest, nupd;
+   if (w == 0 || h == 0) 
+      return;
+
+   if (screen->format->BytesPerPixel == 4) {
+      cmpArea = cmpArea_32bit;
+      copyArea = copyArea_32bit;
+   }
+   else
+   if (screen->format->BytesPerPixel == 2) {
+      cmpArea = cmpArea_16bit;
+      copyArea = copyArea_16bit;
+   }
+   else
+      assert(0); /* should not be reached - sdl_update guards against this */
+
+   xlim = x + w;
+   ylim = y + h;
+
+   ntest = nupd = 0;
+   for (xi = x; xi < xlim; xi += THRESH) {
+      xj = xi + THRESH;
+      if (xj > xlim) xj = xlim;
+      for (yi = y; yi < ylim; yi += THRESH) {
+         yj = yi + THRESH;
+         if (yj > ylim) yj = ylim;
+         if (xj-xi == 0 || yj-yi == 0)
+            continue;
+	 ntest++;
+	 if (cmpArea(xi, yi, xj-xi, yj-yi)) {
+ 	    nupd++;
+            copyArea(xi, yi, xj-xi, yj-yi);
+            SDL_UpdateRect(screen, xi, yi, xj-xi, yj-yi);
+	 }
+      }
+   }
+   tested_total += ntest;
+   update_total += nupd;
+   if (0)
+   printf("(tested, updated):   total (%d, %d),   this time (%d, %d)\n", 
+          tested_total, update_total, ntest, nupd);
+}
+
+
 static void sdl_update(DisplayState *ds, int x, int y, int w, int h)
 {
-    //    printf("updating x=%d y=%d w=%d h=%d\n", x, y, w, h);
-    SDL_UpdateRect(screen, x, y, w, h);
+   int warned = 0;
+
+   //printf("updating x=%d y=%d w=%d h=%d\n", x, y, w, h);
+   //   printf("Total Size    %d %d\n", screen->w, screen->h);
+   //printf("BytesPerPixel %d\n", screen->format->BytesPerPixel);
+   //printf("pitch         %d (%d)\n", screen->pitch, 
+   //                       screen->w * screen->format->BytesPerPixel);
+
+   if (!remote_x11) {
+      SDL_UpdateRect(screen, x, y, w, h);
+      return;
+   }
+
+   if ( (screen->format->BytesPerPixel != 4 
+         && screen->format->BytesPerPixel != 2)
+       || screen->pitch != screen->w * screen->format->BytesPerPixel) {
+       if (!warned) {
+          warned = 1;
+          fprintf(stderr, "qemu: SDL update optimisation disabled\n"
+                          "      (wrong bits-per-pixel, or wrong pitch)\n");
+       }
+       SDL_UpdateRect(screen, x, y, w, h);
+       return;
+    }
+
+    assert(screen->pitch == screen->w * screen->format->BytesPerPixel);
+    assert(sizeof(int) == 4);
+    assert(sizeof(short) == 2);
+    if (alt_pixels == NULL) {
+       /* First time through (at this resolution).  
+          Copy entire screen. */
+       if (screen->format->BytesPerPixel == 4) {
+          int i, word32s = screen->w * screen->h;
+          if (0) printf("copying init screen - 32 bit\n");
+          alt_pixels = malloc(word32s * sizeof(int));
+          assert(alt_pixels);
+          for (i = 0; i < word32s; i++) 
+             ((unsigned int*)alt_pixels)[i] 
+                = ((unsigned int*)(screen->pixels))[i];
+          SDL_UpdateRect(screen, x, y, w, h);
+          if (0) printf("done- 32 bit\n");
+       }
+       else
+       if (screen->format->BytesPerPixel == 2) {
+          int i, word16s = screen->w * screen->h;
+          if (0) printf("copying init screen - 16 bit\n");
+          alt_pixels = malloc(word16s * sizeof(int));
+          assert(alt_pixels);
+          for (i = 0; i < word16s; i++) 
+             ((unsigned short*)alt_pixels)[i] 
+                = ((unsigned short*)(screen->pixels))[i];
+          SDL_UpdateRect(screen, x, y, w, h);
+          if (0) printf("done - 16 bit\n");
+       }
+       else 
+          assert(0); /* guarded above */
+    } else {
+       assert(w >= 0);
+       assert(h >= 0);
+       econoUpdate(ds, x, y, w, h);
+    }
 }
 
 static void sdl_resize(DisplayState *ds, int w, int h)
 {
     int flags;
 
-    //    printf("resizing to %d %d\n", w, h);
+    if (alt_pixels) {
+        assert(remote_x11);
+        free(alt_pixels);
+	alt_pixels = NULL;
+    }
 
     flags = SDL_HWSURFACE|SDL_ASYNCBLIT|SDL_HWACCEL;
     if (gui_fullscreen)
diff -u -r Orig/qemu-0.9.0/vl.c qemu-0.9.0/vl.c
--- Orig/qemu-0.9.0/vl.c	2007-02-05 23:01:54.000000000 +0000
+++ qemu-0.9.0/vl.c	2007-03-13 22:12:39.000000000 +0000
@@ -173,6 +173,7 @@
 int nb_option_roms;
 int semihosting_enabled = 0;
 int autostart = 1;
+int remote_x11 = 0;
 
 /***********************************************************/
 /* x86 ISA bus support */
@@ -6121,6 +6122,7 @@
 	   "-vnc display    start a VNC server on display\n"
 #ifndef _WIN32
 	   "-daemonize      daemonize QEMU after initializing\n"
+           "-remote-x11     improve graphics responsiveness when X11 
client != server\n"
 #endif
 	   "-option-rom rom load a file, rom, into the option ROM space\n"
            "\n"
@@ -6204,6 +6206,7 @@
     QEMU_OPTION_no_acpi,
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_daemonize,
+    QEMU_OPTION_remote_x11,
     QEMU_OPTION_option_rom,
     QEMU_OPTION_semihosting
 };
@@ -6288,6 +6291,7 @@
     { "no-acpi", 0, QEMU_OPTION_no_acpi },
     { "no-reboot", 0, QEMU_OPTION_no_reboot },
     { "daemonize", 0, QEMU_OPTION_daemonize },
+    { "remote-x11", 0, QEMU_OPTION_remote_x11 },
     { "option-rom", HAS_ARG, QEMU_OPTION_option_rom },
 #if defined(TARGET_ARM)
     { "semihosting", 0, QEMU_OPTION_semihosting },
@@ -6947,6 +6951,9 @@
 	    case QEMU_OPTION_daemonize:
 		daemonize = 1;
 		break;
+	    case QEMU_OPTION_remote_x11:
+		remote_x11 = 1;
+		break;
 	    case QEMU_OPTION_option_rom:
 		if (nb_option_roms >= MAX_OPTION_ROMS) {
 		    fprintf(stderr, "Too many option ROMs\n");
diff -u -r Orig/qemu-0.9.0/vl.h qemu-0.9.0/vl.h
--- Orig/qemu-0.9.0/vl.h	2007-02-05 23:01:54.000000000 +0000
+++ qemu-0.9.0/vl.h	2007-03-13 22:11:24.000000000 +0000
@@ -159,6 +159,7 @@
 extern int no_quit;
 extern int semihosting_enabled;
 extern int autostart;
+extern int remote_x11;
 
 #define MAX_OPTION_ROMS 16
 extern const char *option_rom[MAX_OPTION_ROMS];

^ permalink raw reply	[flat|nested] 11+ messages in thread