All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/9] lguest: block device speedup
@ 2007-03-09  3:05 Rusty Russell
  2007-03-09  3:08 ` [PATCH 2/9] lguest: bridging support in example code Rusty Russell
  2007-03-09  7:51 ` [PATCH 1/9] lguest: block device speedup Christoph Hellwig
  0 siblings, 2 replies; 11+ messages in thread
From: Rusty Russell @ 2007-03-09  3:05 UTC (permalink / raw)
  To: Andrew Morton, Andi Kleen; +Cc: lkml - Kernel Mailing List, Jens Axboe

Jens Axboe pointed out that end_request() does not end the entire
request.  Go figure.  On the upside, he wrote the replacement for me!
Now we do far less block traffic, and our performance sucks less.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r fdc8cbc1fd61 drivers/block/lguest_blk.c
--- a/drivers/block/lguest_blk.c	Thu Mar 08 13:35:39 2007 +1100
+++ b/drivers/block/lguest_blk.c	Thu Mar 08 15:51:55 2007 +1100
@@ -45,6 +45,16 @@ struct blockdev
 	struct request *req;
 };
 
+/* Jens gave me this nice helper to end all chunks of a request. */
+static void end_entire_request(struct request *req, int uptodate)
+{
+	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
+		BUG();
+	add_disk_randomness(req->rq_disk);
+	blkdev_dequeue_request(req);
+	end_that_request_last(req, uptodate);
+}
+
 static irqreturn_t lgb_irq(int irq, void *_bd)
 {
 	struct blockdev *bd = _bd;
@@ -61,7 +71,7 @@ static irqreturn_t lgb_irq(int irq, void
 	}
 
 	spin_lock_irqsave(&bd->lock, flags);
-	end_request(bd->req, bd->lb_page->result == 1);
+	end_entire_request(bd->req, bd->lb_page->result == 1);
 	bd->req = NULL;
 	bd->dma.used_len = 0;
 	blk_start_queue(bd->disk->queue);
@@ -149,7 +159,7 @@ again:
 		pr_debug("Got non-command 0x%08x\n", req->cmd_type);
 	error:
 		req->errors++;
-		end_request(req, 0);
+		end_entire_request(req, 0);
 		goto again;
 	} else {
 		if (rq_data_dir(req) == WRITE)



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 2/9] lguest: bridging support in example code
  2007-03-09  3:05 [PATCH 1/9] lguest: block device speedup Rusty Russell
@ 2007-03-09  3:08 ` Rusty Russell
  2007-03-09  3:12   ` [PATCH 3/9] lguest: cleanup: allocate separate pages for switcher code Rusty Russell
  2007-03-09  7:51 ` [PATCH 1/9] lguest: block device speedup Christoph Hellwig
  1 sibling, 1 reply; 11+ messages in thread
From: Rusty Russell @ 2007-03-09  3:08 UTC (permalink / raw)
  To: Andrew Morton, Andi Kleen; +Cc: lkml - Kernel Mailing List

Expand the --tunnet option to take a bridge name as an argument, so that
the tap interface is added to the specified bridge.  This makes it
convenient to use bridging for connecting the guest to external networks.
    
Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r cff3d561d1b0 Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Thu Mar 08 15:52:15 2007 +1100
+++ b/Documentation/lguest/lguest.c	Thu Mar 08 16:08:36 2007 +1100
@@ -23,7 +23,8 @@
 #include <sys/time.h>
 #include <time.h>
 #include <netinet/in.h>
-#include <linux/if.h>
+#include <net/if.h>
+#include <linux/sockios.h>
 #include <linux/if_tun.h>
 #include <sys/uio.h>
 #include <termios.h>
@@ -36,6 +37,7 @@ typedef uint8_t u8;
 
 #define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
 #define NET_PEERNUM 1
+#define BRIDGE_PFX "bridge:"
 
 static bool verbose;
 #define verbose(args...) \
@@ -582,20 +584,16 @@ static u32 handle_block_output(int fd, c
 	((u8)(ip >> 8)),			\
 	((u8)(ip))
 
-static void configure_device(const char *devname, u32 ipaddr,
+static void configure_device(int fd, const char *devname, u32 ipaddr,
 			     unsigned char hwaddr[6])
 {
 	struct ifreq ifr;
-	int fd;
 	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
 
 	memset(&ifr, 0, sizeof(ifr));
 	strcpy(ifr.ifr_name, devname);
 	sin->sin_family = AF_INET;
 	sin->sin_addr.s_addr = htonl(ipaddr);
-	fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
-	if (fd < 0)
-		err(1, "opening IP socket");
 	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
 		err(1, "Setting %s interface address", devname);
 	ifr.ifr_flags = IFF_UP;
@@ -724,13 +722,34 @@ static u32 str2ip(const char *ipaddr)
 	return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
 }
 
-static void setup_tun_net(const char *ipaddr,
+/* adapted from libbridge */
+static void add_to_bridge(int fd, const char *if_name, const char *br_name)
+{
+	int ifidx;
+	struct ifreq ifr;
+
+	if (!*br_name)
+		errx(1, "must specify bridge name");
+
+	ifidx = if_nametoindex(if_name);
+	if (!ifidx)
+		errx(1, "interface %s does not exist!", if_name);
+
+	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
+	ifr.ifr_ifindex = ifidx;
+	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
+		err(1, "can't add %s to bridge %s", if_name, br_name);
+}
+
+static void setup_tun_net(const char *arg,
 			  struct lguest_device_desc *descs,
 			  struct devices *devices)
 {
 	struct device *dev;
 	struct ifreq ifr;
-	int netfd;
+	int netfd, ipfd;
+	u32 ipaddr;
+	const char *br_name = NULL;
 
 	netfd = open("/dev/net/tun", O_RDWR);
 	if (netfd < 0)
@@ -748,15 +767,29 @@ static void setup_tun_net(const char *ip
 	dev->priv = malloc(sizeof(bool));
 	*(bool *)dev->priv = false;
 
+	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+	if (ipfd < 0)
+		err(1, "opening IP socket");
+
+	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
+		ipaddr = INADDR_ANY;
+		br_name = arg + strlen(BRIDGE_PFX);
+		add_to_bridge(ipfd, ifr.ifr_name, br_name);
+	} else
+		ipaddr = str2ip(arg);
+
 	/* We are peer 0, rest is all NO_GUEST */
 	memset(dev->mem, 0xFF, getpagesize());
-	configure_device(ifr.ifr_name, str2ip(ipaddr), dev->mem);
+	configure_device(ipfd, ifr.ifr_name, ipaddr, dev->mem);
+	close(ipfd);
 
 	/* You will be peer 1: we should create enough jitter to randomize */
 	dev->desc->features = NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS;
 	verbose("device %p@%p: tun net %u.%u.%u.%u\n", dev->desc,
 		(void *)(dev->desc->pfn * getpagesize()),
-		HIPQUAD(str2ip(ipaddr)));
+		HIPQUAD(ipaddr));
+	if (br_name)
+		verbose("attched to bridge: %s\n", br_name);
 }
 
 static void setup_block_file(const char *filename,
@@ -887,8 +920,8 @@ int main(int argc, char *argv[])
 
 	if (argc < 4)
 		errx(1, "Usage: lguest [--verbose] <mem> vmlinux "
-			"[--sharenet=<filename>|--tunnet=<ipaddr>|--block=<filename>"
-			"|--initrd=<filename>]... [args...]");
+			"[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)"
+			"|--block=<filename>|--initrd=<filename>]... [args...]");
 
 	zero_fd = open("/dev/zero", O_RDONLY, 0);
 	if (zero_fd < 0)
diff -r cff3d561d1b0 Documentation/lguest/lguest.txt
--- a/Documentation/lguest/lguest.txt	Thu Mar 08 15:52:15 2007 +1100
+++ b/Documentation/lguest/lguest.txt	Thu Mar 08 16:02:49 2007 +1100
@@ -77,10 +77,26 @@ Running Lguest:
   /proc/sys/net/ipv4/ip_forward".  In this example, I would configure
   eth0 inside the guest at 192.168.19.2.
 
+  Another method is to bridge the tap device to an external interface
+  using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
+  to obtain an IP address.  The bridge needs to be configured first:
+  this option simply adds the tap interface to it.
+  
+  A simple example on my system:
+  
+    ifconfig eth0 0.0.0.0
+    brctl addbr lg0
+    ifconfig lg0 up
+    dhclient lg0
+    
+  Then use --tunnet=bridge:lg0 when launching the guest.
+
+  See http://linux-net.osdl.org/index.php/Bridge for general information
+  on how to get bridging working.
+
 - You can also create an inter-guest network using
   "--sharenet=<filename>": any two guests using the same file are on
   the same network.  This file is created if it does not exist.
-
 
 Lguest I/O model:
 



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 3/9] lguest: cleanup: allocate separate pages for switcher code
  2007-03-09  3:08 ` [PATCH 2/9] lguest: bridging support in example code Rusty Russell
@ 2007-03-09  3:12   ` Rusty Russell
  2007-03-09  3:16     ` [PATCH 4/9] lguest: cleanup: clean up regs save/restore Rusty Russell
  0 siblings, 1 reply; 11+ messages in thread
From: Rusty Russell @ 2007-03-09  3:12 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List

We don't need physically-contiguous pages for the hypervisor, since we
use map_vm_area anyway.

Two other related cleanups: pass the number of pages to
init_pagetables() so we can remove the constant from the header, and
call populate_hypervisor_pte_page() on each page as we allocate it,
rather than as a separate loop.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 9fea34a28460 arch/i386/lguest/core.c
--- a/arch/i386/lguest/core.c	Thu Mar 08 16:09:00 2007 +1100
+++ b/arch/i386/lguest/core.c	Thu Mar 08 16:21:42 2007 +1100
@@ -24,17 +24,21 @@ static char __initdata hypervisor_blob[]
 #include "hypervisor-blob.c"
 };
 
-#define MAX_LGUEST_GUESTS						  \
-	(((PAGE_SIZE << HYPERVISOR_PAGE_ORDER) - sizeof(hypervisor_blob)) \
+/* 64k ought to be enough for anybody! */
+#define HYPERVISOR_PAGES (65536 / PAGE_SIZE)
+
+#define MAX_LGUEST_GUESTS						\
+	(((HYPERVISOR_PAGES * PAGE_SIZE) - sizeof(hypervisor_blob))	\
 	 / sizeof(struct lguest_state))
 
 static struct vm_struct *hypervisor_vma;
+/* Pages for hypervisor itself */
+static struct page *hype_page[HYPERVISOR_PAGES];
 static int cpu_had_pge;
 static struct {
 	unsigned long offset;
 	unsigned short segment;
 } lguest_entry __attribute_used__;
-struct page *hype_pages; /* Contiguous pages. */
 struct lguest lguests[MAX_LGUEST_GUESTS];
 DEFINE_MUTEX(lguest_lock);
 
@@ -58,15 +62,19 @@ struct lguest_state *__lguest_states(voi
 
 static __init int map_hypervisor(void)
 {
-	unsigned int i;
-	int err;
-	struct page *pages[HYPERVISOR_PAGES], **pagep = pages;
-
-	hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, HYPERVISOR_PAGE_ORDER);
-	if (!hype_pages)
-		return -ENOMEM;
-
-	hypervisor_vma = __get_vm_area(PAGE_SIZE << HYPERVISOR_PAGE_ORDER,
+	int i, err;
+	struct page **pagep = hype_page;
+
+	for (i = 0; i < ARRAY_SIZE(hype_page); i++) {
+		unsigned long addr = get_zeroed_page(GFP_KERNEL);
+		if (!addr) {
+			err = -ENOMEM;
+			goto free_some_pages;
+		}
+		hype_page[i] = virt_to_page(addr);
+	}
+
+	hypervisor_vma = __get_vm_area(ARRAY_SIZE(hype_page) * PAGE_SIZE,
 				       VM_ALLOC, HYPE_ADDR, VMALLOC_END);
 	if (!hypervisor_vma) {
 		err = -ENOMEM;
@@ -74,9 +82,6 @@ static __init int map_hypervisor(void)
 		goto free_pages;
 	}
 
-	for (i = 0; i < HYPERVISOR_PAGES; i++)
-		pages[i] = hype_pages + i;
-
 	err = map_vm_area(hypervisor_vma, PAGE_KERNEL, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
@@ -100,14 +105,20 @@ free_vma:
 free_vma:
 	vunmap(hypervisor_vma->addr);
 free_pages:
-	__free_pages(hype_pages, HYPERVISOR_PAGE_ORDER);
+	i = ARRAY_SIZE(hype_page);
+free_some_pages:
+	for (--i; i >= 0; i--)
+		__free_pages(hype_page[i], 0);
 	return err;
 }
 
 static __exit void unmap_hypervisor(void)
 {
+	unsigned int i;
+
 	vunmap(hypervisor_vma->addr);
-	__free_pages(hype_pages, HYPERVISOR_PAGE_ORDER);
+	for (i = 0; i < ARRAY_SIZE(hype_page); i++)
+		__free_pages(hype_page[i], 0);
 }
 
 /* IN/OUT insns: enough to get us past boot-time probing. */
@@ -390,7 +401,7 @@ static int __init init(void)
 	if (err)
 		return err;
 
-	err = init_pagetables(hype_pages);
+	err = init_pagetables(hype_page, HYPERVISOR_PAGES);
 	if (err) {
 		unmap_hypervisor();
 		return err;
diff -r 9fea34a28460 arch/i386/lguest/lg.h
--- a/arch/i386/lguest/lg.h	Thu Mar 08 16:09:00 2007 +1100
+++ b/arch/i386/lguest/lg.h	Thu Mar 08 16:21:42 2007 +1100
@@ -2,9 +2,6 @@
 #define _LGUEST_H
 
 #include <asm/desc.h>
-/* 64k ought to be enough for anybody! */
-#define HYPERVISOR_PAGE_ORDER (16 - PAGE_SHIFT)
-#define HYPERVISOR_PAGES (1 << HYPERVISOR_PAGE_ORDER)
 
 #define GDT_ENTRY_LGUEST_CS	10
 #define GDT_ENTRY_LGUEST_DS	11
@@ -43,7 +40,7 @@ struct lguest_regs
 };
 
 __exit void free_pagetables(void);
-__init int init_pagetables(struct page *hype_pages);
+__init int init_pagetables(struct page **hype_page, int pages);
 
 /* Full 4G segment descriptors, suitable for CS and DS. */
 #define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
@@ -122,7 +119,6 @@ struct lguest
 	struct host_trap interrupt[LGUEST_IRQS];
 };
 
-extern struct page *hype_pages; /* Contiguous pages. */
 extern struct lguest lguests[];
 extern struct mutex lguest_lock;
 
diff -r 9fea34a28460 arch/i386/lguest/page_tables.c
--- a/arch/i386/lguest/page_tables.c	Thu Mar 08 16:09:00 2007 +1100
+++ b/arch/i386/lguest/page_tables.c	Thu Mar 08 16:24:56 2007 +1100
@@ -328,9 +328,23 @@ static void free_hypervisor_pte_pages(vo
 		free_page((long)hypervisor_pte_page(i));
 }
 
-static __init int alloc_hypervisor_pte_pages(void)
+static __init void populate_hypervisor_pte_page(int cpu,
+						struct page *hype_page[],
+						int pages)
 {
 	int i;
+	u32 *pte = hypervisor_pte_page(cpu);
+
+	for (i = 0; i < pages; i++) {
+		/* First entry set dynamically in map_trap_page */
+		pte[i+1] = ((page_to_pfn(hype_page[i]) << PAGE_SHIFT) 
+			    | _PAGE_KERNEL_EXEC);
+	}
+}
+
+__init int init_pagetables(struct page **hype_page, int pages)
+{
+	unsigned int i;
 
 	for_each_possible_cpu(i) {
 		hypervisor_pte_page(i) = (u32 *)get_zeroed_page(GFP_KERNEL);
@@ -338,36 +352,11 @@ static __init int alloc_hypervisor_pte_p
 			free_hypervisor_pte_pages();
 			return -ENOMEM;
 		}
+		populate_hypervisor_pte_page(i, hype_page, pages);
 	}
 	return 0;
 }
 
-static __init void populate_hypervisor_pte_page(int cpu)
-{
-	int i;
-	u32 *pte = hypervisor_pte_page(cpu);
-
-	for (i = 0; i < HYPERVISOR_PAGES; i++) {
-		/* First entry set dynamically in map_trap_page */
-		pte[i+1] = ((page_to_pfn(&hype_pages[i]) << PAGE_SHIFT)
-			    | _PAGE_KERNEL_EXEC);
-	}
-}
-
-__init int init_pagetables(struct page hype_pages[])
-{
-	int ret;
-	unsigned int i;
-
-	ret = alloc_hypervisor_pte_pages();
-	if (ret)
-		return ret;
-
-	for_each_possible_cpu(i)
-		populate_hypervisor_pte_page(i);
-	return 0;
-}
-
 __exit void free_pagetables(void)
 {
 	free_hypervisor_pte_pages();



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 4/9] lguest: cleanup: clean up regs save/restore
  2007-03-09  3:12   ` [PATCH 3/9] lguest: cleanup: allocate separate pages for switcher code Rusty Russell
@ 2007-03-09  3:16     ` Rusty Russell
  2007-03-09  3:17       ` [PATCH 5/9] lguest: documentation fixes Rusty Russell
  0 siblings, 1 reply; 11+ messages in thread
From: Rusty Russell @ 2007-03-09  3:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List

We previously put "cr3" in the guest regs restored and saved: the
guest cannot change cr3, so saving it is silly.  Hand it across to the
host<->guest switcher in ebx.

While we're there, only save the host registers we need to; tell GCC
we clobber everything we can.

Finally, trap 2 (NMI) doesn't supply an error code (we don't handle
NMI yet, but the test is wrong, so fix it before I get confused).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 6efda2f8ac22 arch/i386/lguest/core.c
--- a/arch/i386/lguest/core.c	Thu Mar 08 16:25:07 2007 +1100
+++ b/arch/i386/lguest/core.c	Thu Mar 08 16:51:17 2007 +1100
@@ -260,11 +260,11 @@ static void run_guest_once(struct lguest
 {
 	unsigned int clobber;
 
-	/* Put eflags on stack, lcall does rest. */
+	/* Put eflags on stack, lcall does rest: suitable for iret return. */
 	asm volatile("pushf; lcall *lguest_entry"
-		     : "=a"(clobber), "=d"(clobber)
-		     : "0"(lg->state), "1"(get_idt_table())
-		     : "memory");
+		     : "=a"(clobber), "=d"(clobber), "=b"(clobber)
+		     : "0"(lg->state), "1"(get_idt_table()), "2"(lg->cr3)
+		     : "memory", "%ecx", "%edi", "%esi");
 }
 
 int run_guest(struct lguest *lg, char *__user user)
diff -r 6efda2f8ac22 arch/i386/lguest/hypervisor.S
--- a/arch/i386/lguest/hypervisor.S	Thu Mar 08 16:25:07 2007 +1100
+++ b/arch/i386/lguest/hypervisor.S	Thu Mar 08 16:52:56 2007 +1100
@@ -4,26 +4,17 @@
 #include <asm/asm-offsets.h>
 #include "lg.h"
 
-#define SAVE_REGS				\
-	/* Save old guest/host state */		\
-	pushl	%es;				\
-	pushl	%ds;				\
-	pushl	%fs;				\
-	pushl	%eax;				\
-	pushl	%gs;				\
-	pushl	%ebp;				\
-	pushl	%edi;				\
-	pushl	%esi;				\
-	pushl	%edx;				\
-	pushl	%ecx;				\
-	pushl	%ebx;				\
-
 .text
 ENTRY(_start) /* ld complains unless _start is defined. */
-/* %eax contains ptr to target guest state, %edx contains host idt. */
+/* %eax contains ptr to target guest state, %edx contains host idt.
+   %ebx contains cr3 value.  All normal registers can be clobbered! */
 switch_to_guest:
-	pushl	%ss
-	SAVE_REGS
+	pushl	%es
+	pushl	%ds
+	pushl	%fs
+	pushl	%gs
+	pushl	%edx
+	pushl	%ebp
 	/* Save old stack, switch to guest's stack. */
 	movl	%esp, LGUEST_STATE_host_stackptr(%eax)
 	movl	%eax, %esp
@@ -33,17 +24,16 @@ switch_to_guest:
 	lgdt	LGUEST_STATE_gdt(%eax)
 	lidt	LGUEST_STATE_idt(%eax)
 	/* Save page table top. */
-	movl	%cr3, %ebx
-	movl	%ebx, LGUEST_STATE_host_pgdir(%eax)
+	movl	%cr3, %ecx
+	movl	%ecx, LGUEST_STATE_host_pgdir(%eax)
 	/* Set host's TSS to available (clear byte 5 bit 2). */
-	movl	(LGUEST_STATE_host_gdt+2)(%eax), %ebx
-	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%ebx)
+	movl	(LGUEST_STATE_host_gdt+2)(%eax), %ecx
+	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%ecx)
 	/* Switch to guest page tables */
-	popl	%ebx
 	movl	%ebx, %cr3
 	/* Switch to guest's TSS. */
-	movl	$(GDT_ENTRY_TSS*8), %ebx
-	ltr	%bx
+	movl	$(GDT_ENTRY_TSS*8), %edx
+	ltr	%dx
 	/* Restore guest regs */
 	popl	%ebx
 	popl	%ecx
@@ -66,10 +56,18 @@ switch_to_guest:
 	iret
 
 #define SWITCH_TO_HOST							\
-	SAVE_REGS;							\
-	/* Save old pgdir */						\
-	movl	%cr3, %eax;						\
+	/* Save guest state */						\
+	pushl	%es;							\
+	pushl	%ds;							\
+	pushl	%fs;							\
 	pushl	%eax;							\
+	pushl	%gs;							\
+	pushl	%ebp;							\
+	pushl	%edi;							\
+	pushl	%esi;							\
+	pushl	%edx;							\
+	pushl	%ecx;							\
+	pushl	%ebx;							\
 	/* Load lguest ds segment for convenience. */			\
 	movl	$(LGUEST_DS), %eax;					\
 	movl	%eax, %ds;						\
@@ -88,21 +86,15 @@ switch_to_guest:
 	/* Switch to host's stack. */					\
 	movl	LGUEST_STATE_host_stackptr(%eax), %esp;			\
 	/* Switch to host's TSS */					\
-	movl	$(GDT_ENTRY_TSS*8), %eax;				\
-	ltr	%ax;							\
+	movl	$(GDT_ENTRY_TSS*8), %ebx;				\
+	ltr	%bx;							\
 	/* Restore host regs */						\
-	popl	%ebx;							\
-	popl	%ecx;							\
+	popl	%ebp;							\
 	popl	%edx;							\
-	popl	%esi;							\
-	popl	%edi;							\
-	popl	%ebp;							\
 	popl	%gs;							\
-	popl	%eax;							\
 	popl	%fs;							\
 	popl	%ds;							\
-	popl	%es;							\
-	popl	%ss
+	popl	%es
 
 /* Return to run_guest_once. */
 return_to_host:
@@ -135,7 +127,7 @@ deliver_to_host_with_errcode:
 .macro IRQ_STUB N TARGET
 	.data; .long 1f; .text; 1:
  /* Make an error number for most traps, which don't have one. */
- .if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+ .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
 	pushl	$0
  .endif
 	pushl	$\N
diff -r 6efda2f8ac22 arch/i386/lguest/lg.h
--- a/arch/i386/lguest/lg.h	Thu Mar 08 16:25:07 2007 +1100
+++ b/arch/i386/lguest/lg.h	Thu Mar 08 16:51:17 2007 +1100
@@ -24,7 +24,6 @@ struct lguest_regs
 struct lguest_regs
 {
 	/* Manually saved part. */
-	u32 cr3;
 	u32 ebx, ecx, edx;
 	u32 esi, edi, ebp;
 	u32 gs;
@@ -81,6 +80,7 @@ struct lguest
 	u32 pfn_limit;
 	u32 page_offset;
 	u32 cr2;
+	u32 cr3;
 	int timer_on;
 	int halted;
 	int ts;
diff -r 6efda2f8ac22 arch/i386/lguest/lguest_user.c
--- a/arch/i386/lguest/lguest_user.c	Thu Mar 08 16:25:07 2007 +1100
+++ b/arch/i386/lguest/lguest_user.c	Thu Mar 08 16:51:17 2007 +1100
@@ -4,7 +4,7 @@
 #include <linux/fs.h>
 #include "lg.h"
 
-static struct lguest_state *setup_guest_state(unsigned int num, void *pgdir,
+static struct lguest_state *setup_guest_state(unsigned int num,
 					      unsigned long start)
 {
 	struct lguest_state *guest = &__lguest_states()[num];
@@ -38,7 +38,6 @@ static struct lguest_state *setup_guest_
 
 	/* Write out stack in format lguest expects, so we can switch to it. */
 	regs = &guest->regs;
-	regs->cr3 = __pa(pgdir);
 	regs->eax = regs->ebx = regs->ecx = regs->edx = regs->esp = 0;
 	regs->edi = LGUEST_MAGIC_EDI;
 	regs->ebp = LGUEST_MAGIC_EBP;
@@ -149,7 +148,7 @@ static int initialize(struct file *file,
 	if (err)
 		goto free_trap_page;
 
-	lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir,args[2]);
+	lg->state = setup_guest_state(i, args[2]);
 	if (!lg->state) {
 		err = -ENOEXEC;
 		goto release_pgtable;
diff -r 6efda2f8ac22 arch/i386/lguest/page_tables.c
--- a/arch/i386/lguest/page_tables.c	Thu Mar 08 16:25:07 2007 +1100
+++ b/arch/i386/lguest/page_tables.c	Thu Mar 08 16:51:17 2007 +1100
@@ -223,7 +223,7 @@ void guest_new_pagetable(struct lguest *
 	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
 		newpgdir = new_pgdir(lg, pgtable);
 	lg->pgdidx = newpgdir;
-	lg->state->regs.cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
+	lg->cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
 	pin_stack_pages(lg);
 }
 
@@ -296,6 +296,7 @@ int init_guest_pagetable(struct lguest *
 	lg->pgdirs[lg->pgdidx].pgdir = (u32*)get_zeroed_page(GFP_KERNEL);
 	if (!lg->pgdirs[lg->pgdidx].pgdir)
 		return -ENOMEM;
+	lg->cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
 	return 0;
 }
 



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 5/9] lguest: documentation fixes
  2007-03-09  3:16     ` [PATCH 4/9] lguest: cleanup: clean up regs save/restore Rusty Russell
@ 2007-03-09  3:17       ` Rusty Russell
  2007-03-09  3:19         ` [PATCH 6/9] lguest: pin stack page optimization Rusty Russell
  0 siblings, 1 reply; 11+ messages in thread
From: Rusty Russell @ 2007-03-09  3:17 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List

1: It helps if you connect the bridge to a link.

Signed-off-by: James Morris <jmorris@namei.org>

2: You can theoretically run lguest with no boot parameters.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 90134cf1fe0a Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Wed Feb 21 11:32:59 2007 +1100
+++ b/Documentation/lguest/lguest.c	Wed Feb 21 11:51:47 2007 +1100
@@ -938,7 +938,7 @@ int main(int argc, char *argv[])
 		argc--;
 	}
 
-	if (argc < 4)
+	if (argc < 3)
 		errx(1, "Usage: lguest [--verbose] <mem> vmlinux "
 			"[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)"
 			"|--block=<filename>|--initrd=<filename>]... [args...]");
diff -r 90134cf1fe0a Documentation/lguest/lguest.txt
--- a/Documentation/lguest/lguest.txt	Wed Feb 21 11:32:59 2007 +1100
+++ b/Documentation/lguest/lguest.txt	Wed Feb 21 11:50:55 2007 +1100
@@ -90,6 +90,7 @@ Running Lguest:
     ifconfig eth0 0.0.0.0
     brctl addbr lg0
     ifconfig lg0 up
+    brctl addif lg0 eth0
     dhclient lg0
     
   Then use --tunnet=bridge:lg0 when launching the guest.



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 6/9] lguest: pin stack page optimization
  2007-03-09  3:17       ` [PATCH 5/9] lguest: documentation fixes Rusty Russell
@ 2007-03-09  3:19         ` Rusty Russell
  2007-03-09  3:23           ` [PATCH 7/9] lguest: use read-only pages rather than segments to protect high-mapped switcher Rusty Russell
  0 siblings, 1 reply; 11+ messages in thread
From: Rusty Russell @ 2007-03-09  3:19 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List

We make sure that the stack is always mapped in pin_stack_pages by
simply calling demand_page, but that calls get_user_pages() to find
the pfn, which is way overkill since the page is almost certainly
already mapped.  So don't call pin_stack_pages on every context switch
(only for a genuinely blank page directory, since all the kernel
mappings are otherwise kept in sync), and when we do call it, have it
check whether it needs to call demand_page().

This speeds guest context switch by 25%:

Before:
Time for one context switch via pipe: 10606 nsec
After:
Time for one context switch via pipe: 7805 nsec
Native:
Time for one context switch via pipe: 4701 nsec

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 06b3a533da77 arch/i386/lguest/page_tables.c
--- a/arch/i386/lguest/page_tables.c	Wed Feb 21 12:20:20 2007 +1100
+++ b/arch/i386/lguest/page_tables.c	Wed Feb 21 18:13:00 2007 +1100
@@ -155,14 +158,29 @@ int demand_page(struct lguest *lg, u32 v
 	return page_in(lg, vaddr, (write ? _PAGE_DIRTY : 0)|_PAGE_ACCESSED);
 }
 
+/* This is much faster than the full demand_page logic. */
+static int page_writable(struct lguest *lg, unsigned long vaddr)
+{
+	u32 *top, *pte;
+
+	top = toplev(lg, lg->pgdidx, vaddr);
+	if (!(*top & _PAGE_PRESENT))
+		return 0;
+
+	pte = pteof(lg, *top, vaddr);
+	return (*pte & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
+}
+
 void pin_stack_pages(struct lguest *lg)
 {
 	unsigned int i;
 	u32 stack = lg->state->tss.esp1;
 
-	for (i = 0; i < lg->stack_pages; i++)
-		if (!demand_page(lg, stack - i*PAGE_SIZE, 1))
+	for (i = 0; i < lg->stack_pages; i++) {
+		if (!page_writable(lg, stack - i * PAGE_SIZE)
+		    && !demand_page(lg, stack - i * PAGE_SIZE, 1))
 			kill_guest(lg, "bad stack page %i@%#x", i, stack);
+	}
 }
 
 static unsigned int find_pgdir(struct lguest *lg, u32 pgtable)
@@ -198,7 +216,7 @@ void guest_pagetable_flush_user(struct l
 	flush_user_mappings(lg, lg->pgdidx);
 }
 
-static unsigned int new_pgdir(struct lguest *lg, u32 cr3)
+static unsigned int new_pgdir(struct lguest *lg, u32 cr3, int *blank_pgdir)
 {
 	unsigned int next;
 
@@ -207,6 +225,9 @@ static unsigned int new_pgdir(struct lgu
 		lg->pgdirs[next].pgdir = (u32 *)get_zeroed_page(GFP_KERNEL);
 		if (!lg->pgdirs[next].pgdir)
 			next = lg->pgdidx;
+		else
+			/* There are no mappings: you'll need to re-pin */
+			*blank_pgdir = 1;
 	}
 	lg->pgdirs[next].cr3 = cr3;
 	/* Release all the non-kernel mappings. */
@@ -217,14 +238,15 @@ static unsigned int new_pgdir(struct lgu
 
 void guest_new_pagetable(struct lguest *lg, u32 pgtable)
 {
-	int newpgdir;
+	int newpgdir, repin = 0;
 
 	newpgdir = find_pgdir(lg, pgtable);
 	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
-		newpgdir = new_pgdir(lg, pgtable);
+		newpgdir = new_pgdir(lg, pgtable, &repin);
 	lg->pgdidx = newpgdir;
 	lg->cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
-	pin_stack_pages(lg);
+	if (repin)
+		pin_stack_pages(lg);
 }
 
 static void release_all_pagetables(struct lguest *lg)



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 7/9] lguest: use read-only pages rather than segments to protect high-mapped switcher
  2007-03-09  3:19         ` [PATCH 6/9] lguest: pin stack page optimization Rusty Russell
@ 2007-03-09  3:23           ` Rusty Russell
  2007-03-09  3:30             ` [PATCH 8/9] lguest: Optimize away copy in and out of per-cpu guest pages Rusty Russell
  0 siblings, 1 reply; 11+ messages in thread
From: Rusty Russell @ 2007-03-09  3:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List

The current lguest uses segment limits to ensure that the guest cannot
reach the switcher code at the top of virtual memory.  This is bad for
two reasons:

1) It introduces complexity when the guest wants to use 4G segments
(ie. glibc's __thread support).
2) It doesn't work on x86-64 boxes.

The alternative is used here: in the host we map the actual switcher
code and two per-cpu pages.  The switcher code and one per-cpu page are
read-only: the read-only page contains the saved host state and the
GDT, IDT and TSS the guest is using.  The other per-cpu page is the
stack page for the hypervisor, which is writable by the guest.  This
is where we save the guest registers: it's safe because while we're
doing this we know the (UP) guest isn't running.

Switching into the guest involves copying in the registers, GDT and
IDT to this cpu's pages, then copying the registers out on the way
back.  This is optimized in another patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 7a963f6eef0a arch/i386/kernel/asm-offsets.c
--- a/arch/i386/kernel/asm-offsets.c	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/kernel/asm-offsets.c	Thu Mar 08 17:21:16 2007 +1100
@@ -122,15 +122,15 @@ void foo(void)
 #ifdef CONFIG_LGUEST_GUEST
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
-	OFFSET(LGUEST_STATE_host_stackptr, lguest_state, host.stackptr);
-	OFFSET(LGUEST_STATE_host_pgdir, lguest_state, host.pgdir);
-	OFFSET(LGUEST_STATE_host_gdt, lguest_state, host.gdt);
-	OFFSET(LGUEST_STATE_host_idt, lguest_state, host.idt);
-	OFFSET(LGUEST_STATE_regs, lguest_state, regs);
-	OFFSET(LGUEST_STATE_gdt, lguest_state, gdt);
-	OFFSET(LGUEST_STATE_idt, lguest_state, idt);
-	OFFSET(LGUEST_STATE_gdt_table, lguest_state, gdt_table);
-	OFFSET(LGUEST_STATE_trapnum, lguest_state, regs.trapnum);
-	OFFSET(LGUEST_STATE_errcode, lguest_state, regs.errcode);
+	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
+	OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
+	OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
+	OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
+	OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
+	OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
+	OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
+	OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
+	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
+	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
 #endif
 }
diff -r 7a963f6eef0a arch/i386/lguest/Makefile
--- a/arch/i386/lguest/Makefile	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/lguest/Makefile	Thu Mar 08 17:21:16 2007 +1100
@@ -6,8 +6,8 @@ lg-objs := core.o hypercalls.o page_tabl
 lg-objs := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
 	segments.o io.o lguest_user.o
 
-# We use top 4MB for guest traps page, then hypervisor. */
-HYPE_ADDR := (0xFFC00000+4096)
+# We use top 4MB for hypervisor. */
+HYPE_ADDR := 0xFFC00000
 # The data is only 1k (256 interrupt handler pointers)
 HYPE_DATA_SIZE := 1024
 CFLAGS += -DHYPE_ADDR="$(HYPE_ADDR)" -DHYPE_DATA_SIZE="$(HYPE_DATA_SIZE)"
diff -r 7a963f6eef0a arch/i386/lguest/core.c
--- a/arch/i386/lguest/core.c	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/lguest/core.c	Fri Mar 09 13:09:27 2007 +1100
@@ -24,26 +24,26 @@ static char __initdata hypervisor_blob[]
 #include "hypervisor-blob.c"
 };
 
-/* 64k ought to be enough for anybody! */
-#define HYPERVISOR_PAGES (65536 / PAGE_SIZE)
-
-#define MAX_LGUEST_GUESTS						\
-	(((HYPERVISOR_PAGES * PAGE_SIZE) - sizeof(hypervisor_blob))	\
-	 / sizeof(struct lguest_state))
+/* Every guest maps the core hypervisor blob. */
+#define SHARED_HYPERVISOR_PAGES DIV_ROUND_UP(sizeof(hypervisor_blob),PAGE_SIZE)
 
 static struct vm_struct *hypervisor_vma;
-/* Pages for hypervisor itself */
-static struct page *hype_page[HYPERVISOR_PAGES];
+/* Pages for hypervisor itself, then two pages per cpu */
+static struct page *hype_page[SHARED_HYPERVISOR_PAGES+2*NR_CPUS];
+
 static int cpu_had_pge;
 static struct {
 	unsigned long offset;
 	unsigned short segment;
 } lguest_entry __attribute_used__;
+DEFINE_MUTEX(lguest_lock);
+
+/* FIXME: Make dynamic. */
+#define MAX_LGUEST_GUESTS 16
 struct lguest lguests[MAX_LGUEST_GUESTS];
-DEFINE_MUTEX(lguest_lock);
 
 /* IDT entries are at start of hypervisor. */
-const unsigned long *__lguest_default_idt_entries(void)
+static const unsigned long *lguest_default_idt_entries(void)
 {
 	return (void *)HYPE_ADDR;
 }
@@ -54,10 +54,11 @@ static void *__lguest_switch_to_guest(vo
 	return (void *)HYPE_ADDR + HYPE_DATA_SIZE;
 }
 
-/* Then we use everything else to hold guest state. */
-struct lguest_state *__lguest_states(void)
-{
-	return (void *)HYPE_ADDR + sizeof(hypervisor_blob);
+/* This cpu's struct lguest_pages. */
+static struct lguest_pages *lguest_pages(unsigned int cpu)
+{
+	return &(((struct lguest_pages *)
+		  (HYPE_ADDR + SHARED_HYPERVISOR_PAGES*PAGE_SIZE))[cpu]);
 }
 
 static __init int map_hypervisor(void)
@@ -89,8 +90,25 @@ static __init int map_hypervisor(void)
 	}
 	memcpy(hypervisor_vma->addr, hypervisor_blob, sizeof(hypervisor_blob));
 
-	/* Setup LGUEST segments on all cpus */
 	for_each_possible_cpu(i) {
+		struct lguest_pages *pages = lguest_pages(i);
+		struct lguest_ro_state *state = &pages->state;
+
+		/* These fields are static: rest done in copy_in_guest_info */
+		state->host_gdt_desc = per_cpu(cpu_gdt_descr, i);
+		store_idt(&state->host_idt_desc);
+		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
+		state->guest_idt_desc.address = (long)&state->guest_idt;
+		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
+		state->guest_gdt_desc.address = (long)&state->guest_gdt;
+		state->guest_tss.esp0 = (long)(&pages->regs + 1);
+		state->guest_tss.ss0 = LGUEST_DS;
+		/* No I/O for you! */
+		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
+		setup_default_gdt_entries(state);
+		setup_default_idt_entries(state, lguest_default_idt_entries());
+		
+		/* Setup LGUEST segments on all cpus */
 		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
 		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
 	}
@@ -126,10 +144,10 @@ static int emulate_insn(struct lguest *l
 {
 	u8 insn;
 	unsigned int insnlen = 0, in = 0, shift = 0;
-	unsigned long physaddr = guest_pa(lg, lg->state->regs.eip);
+	unsigned long physaddr = guest_pa(lg, lg->regs.eip);
 
 	/* This only works for addresses in linear mapping... */
-	if (lg->state->regs.eip < lg->page_offset)
+	if (lg->regs.eip < lg->page_offset)
 		return 0;
 	lhread(lg, &insn, physaddr, 1);
 
@@ -162,11 +180,11 @@ static int emulate_insn(struct lguest *l
 	if (in) {
 		/* Lower bit tells is whether it's a 16 or 32 bit access */
 		if (insn & 0x1)
-			lg->state->regs.eax = 0xFFFFFFFF;
+			lg->regs.eax = 0xFFFFFFFF;
 		else
-			lg->state->regs.eax |= (0xFFFF << shift);
-	}
-	lg->state->regs.eip += insnlen;
+			lg->regs.eax |= (0xFFFF << shift);
+	}
+	lg->regs.eip += insnlen;
 	return 1;
 }
 
@@ -174,7 +192,7 @@ int find_free_guest(void)
 {
 	unsigned int i;
 	for (i = 0; i < MAX_LGUEST_GUESTS; i++)
-		if (!lguests[i].state)
+		if (!lguests[i].tsk)
 			return i;
 	return -1;
 }
@@ -221,31 +239,6 @@ void lhwrite(struct lguest *lg, u32 addr
 		kill_guest(lg, "bad write address %u len %u", addr, bytes);
 }
 
-/* Saves exporting idt_table from kernel */
-static struct desc_struct *get_idt_table(void)
-{
-	struct Xgt_desc_struct idt;
-
-	asm("sidt %0":"=m" (idt));
-	return (void *)idt.address;
-}
-
-static int usermode(struct lguest_regs *regs)
-{
-	return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
-}
-
-/* Trap page resets this when it reloads gs. */
-static int new_gfp_eip(struct lguest *lg, struct lguest_regs *regs)
-{
-	u32 eip;
-	get_user(eip, &lg->lguest_data->gs_gpf_eip);
-	if (eip == regs->eip)
-		return 0;
-	put_user(regs->eip, &lg->lguest_data->gs_gpf_eip);
-	return 1;
-}
-
 static void set_ts(unsigned int guest_ts)
 {
 	u32 cr0;
@@ -256,23 +249,51 @@ static void set_ts(unsigned int guest_ts
 	}
 }
 
-static void run_guest_once(struct lguest *lg)
+static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
 {
 	unsigned int clobber;
 
 	/* Put eflags on stack, lcall does rest: suitable for iret return. */
 	asm volatile("pushf; lcall *lguest_entry"
-		     : "=a"(clobber), "=d"(clobber), "=b"(clobber)
-		     : "0"(lg->state), "1"(get_idt_table()), "2"(lg->cr3)
-		     : "memory", "%ecx", "%edi", "%esi");
+		     : "=a"(clobber), "=b"(clobber)
+		     : "0"(pages), "1"(lg->cr3)
+		     : "memory", "%edx", "%ecx", "%edi", "%esi");
+}
+
+static void copy_in_guest_info(struct lguest_pages *pages,
+			       struct lguest *lg)
+{
+	/* Copy in regs. */
+	pages->regs = lg->regs;
+
+	/* TSS entries for direct traps. */
+	pages->state.guest_tss.esp1 = lg->esp1;
+	pages->state.guest_tss.ss1 = lg->ss1;
+
+	/* CR3 */
+	pages->state.host_cr3 = __pa(current->mm->pgd);
+
+	/* Copy direct trap entries. */
+	copy_traps(lg, pages->state.guest_idt, lguest_default_idt_entries());
+
+	/* Copy all GDT entries but the TSS. */
+	copy_gdt(lg, pages->state.guest_gdt);
+}
+
+static void copy_out_guest_info(struct lguest *lg,
+				const struct lguest_pages *pages)
+{
+	/* We just want the regs back. */
+	lg->regs = pages->regs;
 }
 
 int run_guest(struct lguest *lg, char *__user user)
 {
-	struct lguest_regs *regs = &lg->state->regs;
+	struct lguest_regs *regs = &lg->regs;
 
 	while (!lg->dead) {
 		unsigned int cr2 = 0; /* Damn gcc */
+		struct lguest_pages *pages;
 
 		/* Hypercalls first: we might have been out to userspace */
 		if (do_async_hcalls(lg))
@@ -300,25 +321,16 @@ int run_guest(struct lguest *lg, char *_
 			continue;
 		}
 
-		/* Restore limits on TLS segments if in user mode. */
-		if (usermode(regs)) {
-			unsigned int i;
-			for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++)
-				lg->state->gdt_table[GDT_ENTRY_TLS_MIN+i].a
-					|= lg->tls_limits[i];
-		}
-
 		local_irq_disable();
-		map_trap_page(lg);
-
-		/* Host state to be restored after the guest returns. */
-		asm("sidt %0":"=m"(lg->state->host.idt));
-		lg->state->host.gdt = __get_cpu_var(cpu_gdt_descr);
 
 		/* Even if *we* don't want FPU trap, guest might... */
 		set_ts(lg->ts);
 
-		run_guest_once(lg);
+		pages = lguest_pages(raw_smp_processor_id());
+		map_hypervisor_in_guest(lg);
+		copy_in_guest_info(pages, lg);
+		run_guest_once(lg, pages);
+		copy_out_guest_info(lg, pages);
 
 		/* Save cr2 now if we page-faulted. */
 		if (regs->trapnum == 14)
@@ -332,14 +344,7 @@ int run_guest(struct lguest *lg, char *_
 			if (regs->errcode == 0) {
 				if (emulate_insn(lg))
 					continue;
-
-				/* FIXME: If it's reloading %gs in a loop? */
-				if (usermode(regs) && new_gfp_eip(lg,regs))
-					continue;
 			}
-
-			if (reflect_trap(lg, &lg->gpf_trap, 1))
-				continue;
 			break;
 		case 14: /* We've intercepted a page fault. */
 			if (demand_page(lg, cr2, regs->errcode & 2))
@@ -347,30 +352,24 @@ int run_guest(struct lguest *lg, char *_
 
 			/* If lguest_data is NULL, this won't hurt. */
 			put_user(cr2, &lg->lguest_data->cr2);
-			if (reflect_trap(lg, &lg->page_trap, 1))
-				continue;
-			kill_guest(lg, "unhandled page fault at %#x"
-				   " (eip=%#x, errcode=%#x)",
-				   cr2, regs->eip, regs->errcode);
 			break;
 		case 7: /* We've intercepted a Device Not Available fault. */
 			/* If they don't want to know, just absorb it. */
 			if (!lg->ts)
 				continue;
-			if (reflect_trap(lg, &lg->fpu_trap, 0))
-				continue;
-			kill_guest(lg, "unhandled FPU fault at %#x",
-				   regs->eip);
 			break;
 		case 32 ... 255: /* Real interrupt, fall thru */
 			cond_resched();
 		case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
 			continue;
-		case 6: /* Invalid opcode before they installed handler */
-			check_bug_kill(lg);
 		}
-		kill_guest(lg,"unhandled trap %i at %#x (err=%i)",
-			   regs->trapnum, regs->eip, regs->errcode);
+
+		if (deliver_trap(lg, regs->trapnum))
+			continue;
+
+		kill_guest(lg, "unhandled trap %i at %#x (%#x)",
+			   regs->trapnum, regs->eip,
+			   regs->trapnum == 14 ? cr2 : regs->errcode);
 	}
 	return -ENOENT;
 
@@ -380,8 +379,6 @@ pending_dma:
 	return sizeof(unsigned long)*2;
 }
 
-#define STRUCT_LGUEST_ELEM_SIZE(elem) sizeof(((struct lguest_state *)0)->elem)
-
 static void adjust_pge(void *on)
 {
 	if (on)
@@ -401,7 +398,7 @@ static int __init init(void)
 	if (err)
 		return err;
 
-	err = init_pagetables(hype_page, HYPERVISOR_PAGES);
+	err = init_pagetables(hype_page, SHARED_HYPERVISOR_PAGES);
 	if (err) {
 		unmap_hypervisor();
 		return err;
diff -r 7a963f6eef0a arch/i386/lguest/hypercalls.c
--- a/arch/i386/lguest/hypercalls.c	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/lguest/hypercalls.c	Thu Mar 08 17:21:16 2007 +1100
@@ -32,8 +32,8 @@ static void guest_set_stack(struct lgues
 		kill_guest(lg, "bad stack segment %i", seg);
 	if (pages > 2)
 		kill_guest(lg, "bad stack pages %u", pages);
-	lg->state->tss.ss1 = seg;
-	lg->state->tss.esp1 = esp;
+	lg->ss1 = seg;
+	lg->esp1 = esp;
 	lg->stack_pages = pages;
 	pin_stack_pages(lg);
 }
diff -r 7a963f6eef0a arch/i386/lguest/hypervisor.S
--- a/arch/i386/lguest/hypervisor.S	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/lguest/hypervisor.S	Fri Mar 09 12:56:33 2007 +1100
@@ -1,39 +1,46 @@
-/* This code sits at 0xFFFF1000 to do the low-level guest<->host switch.
-   Layout is: default_idt_entries (1k), then switch_to_guest entry point. */
+/* This code sits at 0xFFC00000 to do the low-level guest<->host switch.
+
+   There are two pages above us for this CPU (struct lguest_pages).
+   The second page (struct lguest_ro_state) becomes read-only after the
+   context switch.  The first page (the stack for traps) remains writable,
+   but while we're in here, the guest cannot be running.
+*/
 #include <linux/linkage.h>
 #include <asm/asm-offsets.h>
 #include "lg.h"
 
 .text
 ENTRY(_start) /* ld complains unless _start is defined. */
-/* %eax contains ptr to target guest state, %edx contains host idt.
-   %ebx contains cr3 value.  All normal registers can be clobbered! */
+
+/* %eax points to lguest pages for this CPU.  %ebx contains cr3 value.
+   All normal registers can be clobbered! */
 switch_to_guest:
+	/* Save host segments on host stack. */
 	pushl	%es
 	pushl	%ds
+	pushl	%gs
 	pushl	%fs
-	pushl	%gs
-	pushl	%edx
+	/* We want %eax in deliver_to_host */
+	pushl	%eax
+	/* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */
 	pushl	%ebp
-	/* Save old stack, switch to guest's stack. */
-	movl	%esp, LGUEST_STATE_host_stackptr(%eax)
-	movl	%eax, %esp
-	/* Guest registers will be at: %esp-$LGUEST_STATE_regs */
-	addl	$LGUEST_STATE_regs, %esp
+	/* Save host stack. */
+	movl	%esp, LGUEST_PAGES_host_sp(%eax)
+	/* Switch to guest stack: if we get NMI we expect to be there. */
+	movl	%eax, %edx
+	addl	$LGUEST_PAGES_regs, %edx
+	movl	%edx, %esp
 	/* Switch to guest's GDT, IDT. */
-	lgdt	LGUEST_STATE_gdt(%eax)
-	lidt	LGUEST_STATE_idt(%eax)
-	/* Save page table top. */
-	movl	%cr3, %ecx
-	movl	%ecx, LGUEST_STATE_host_pgdir(%eax)
-	/* Set host's TSS to available (clear byte 5 bit 2). */
-	movl	(LGUEST_STATE_host_gdt+2)(%eax), %ecx
-	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%ecx)
-	/* Switch to guest page tables */
-	movl	%ebx, %cr3
-	/* Switch to guest's TSS. */
+	lgdt	LGUEST_PAGES_guest_gdt_desc(%eax)
+	lidt	LGUEST_PAGES_guest_idt_desc(%eax)
+	/* Switch to guest's TSS while GDT still writable. */
 	movl	$(GDT_ENTRY_TSS*8), %edx
 	ltr	%dx
+	/* Set host's TSS GDT entry to available (clear byte 5 bit 2). */
+	movl	(LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
+	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
+	/* Switch to guest page tables:	lguest_pages->state now read-only. */
+	movl	%ebx, %cr3
 	/* Restore guest regs */
 	popl	%ebx
 	popl	%ecx
@@ -42,11 +49,6 @@ switch_to_guest:
 	popl	%edi
 	popl	%ebp
 	popl	%gs
-	/* Now we've loaded gs, neuter the TLS entries down to 1 byte/page */
-	addl	$(LGUEST_STATE_gdt_table+GDT_ENTRY_TLS_MIN*8), %eax
-	movw	$0,(%eax)
-	movw	$0,8(%eax)
-	movw	$0,16(%eax)
 	popl	%eax
 	popl	%fs
 	popl	%ds
@@ -71,28 +73,27 @@ switch_to_guest:
 	/* Load lguest ds segment for convenience. */			\
 	movl	$(LGUEST_DS), %eax;					\
 	movl	%eax, %ds;						\
-	/* Now figure out who we are */					\
+	/* Figure out where we are, based on stack (at top of regs). */	\
 	movl	%esp, %eax;						\
-	subl	$LGUEST_STATE_regs, %eax;				\
-	/* Switch to host page tables (GDT, IDT and stack are in host   \
+	subl	$LGUEST_PAGES_regs, %eax;				\
+	/* Switch to host page tables (host GDT, IDT and stack are in host   \
 	   mem, so need this first) */					\
-	movl	LGUEST_STATE_host_pgdir(%eax), %ebx;			\
-	movl	%ebx, %cr3;						\
+	movl	LGUEST_PAGES_host_cr3(%eax), %edx;			\
+	movl	%edx, %cr3;						\
 	/* Set guest's TSS to available (clear byte 5 bit 2). */	\
-	andb	$0xFD, (LGUEST_STATE_gdt_table+GDT_ENTRY_TSS*8+5)(%eax);\
+	andb	$0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
 	/* Switch to host's GDT & IDT. */				\
-	lgdt	LGUEST_STATE_host_gdt(%eax);				\
-	lidt	LGUEST_STATE_host_idt(%eax);				\
+	lgdt	LGUEST_PAGES_host_gdt_desc(%eax);			\
+	lidt	LGUEST_PAGES_host_idt_desc(%eax);			\
 	/* Switch to host's stack. */					\
-	movl	LGUEST_STATE_host_stackptr(%eax), %esp;			\
+	movl	LGUEST_PAGES_host_sp(%eax), %esp;			\
 	/* Switch to host's TSS */					\
-	movl	$(GDT_ENTRY_TSS*8), %ebx;				\
-	ltr	%bx;							\
-	/* Restore host regs */						\
+	movl	$(GDT_ENTRY_TSS*8), %eax;				\
+	ltr	%ax;							\
 	popl	%ebp;							\
-	popl	%edx;							\
+	popl	%eax;							\
+	popl	%fs;							\
 	popl	%gs;							\
-	popl	%fs;							\
 	popl	%ds;							\
 	popl	%es
 
@@ -106,8 +107,8 @@ decode_idt_and_jmp:
 decode_idt_and_jmp:
 	/* Decode IDT and jump to hosts' irq handler.  When that does iret, it
 	 * will return to run_guest_once.  This is a feature. */
-	/* We told gcc we'd clobber edx and eax... */
-	movl	LGUEST_STATE_trapnum(%eax), %eax
+	movl	(LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
+	movl	LGUEST_PAGES_regs_trapnum(%eax), %eax
 	leal	(%edx,%eax,8), %eax
 	movzwl	(%eax),%edx
 	movl	4(%eax), %eax
@@ -115,9 +116,10 @@ decode_idt_and_jmp:
 	orl	%eax, %edx
 	jmp	*%edx
 
+/* FIXME: NMI needs something completely different.  Don't SWITCH_TO_HOST. */
 deliver_to_host_with_errcode:
 	SWITCH_TO_HOST
-	pushl	LGUEST_STATE_errcode(%eax)
+	pushl	LGUEST_PAGES_regs_errcode(%eax)
 	jmp decode_idt_and_jmp
 
 /* Real hardware interrupts are delivered straight to the host.  Others
diff -r 7a963f6eef0a arch/i386/lguest/interrupts_and_traps.c
--- a/arch/i386/lguest/interrupts_and_traps.c	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/lguest/interrupts_and_traps.c	Fri Mar 09 12:56:33 2007 +1100
@@ -6,19 +6,31 @@ static void push_guest_stack(struct lgue
 	lhwrite_u32(lg, (u32)--(*gstack), val);
 }
 
-int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err)
+static unsigned long idt_address(u32 lo, u32 hi)
+{
+	return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
+}
+
+static int idt_type(u32 lo, u32 hi)
+{
+	return (hi >> 8) & 0xF;
+}
+
+static int idt_present(u32 lo, u32 hi)
+{
+	return (hi & 0x8000);
+}
+
+static void reflect_trap(struct lguest *lg, u32 lo, u32 hi, int has_err)
 {
 	u32 __user *gstack;
 	u32 eflags, ss, irq_enable;
-	struct lguest_regs *regs = &lg->state->regs;
-
-	if (!trap->addr)
-		return 0;
+	struct lguest_regs *regs = &lg->regs;
 
 	/* If they want a ring change, we use new stack and push old ss/esp */
 	if ((regs->ss&0x3) != GUEST_DPL) {
-		gstack = (u32 __user *)guest_pa(lg, lg->state->tss.esp1);
-		ss = lg->state->tss.ss1;
+		gstack = (u32 __user *)guest_pa(lg, lg->esp1);
+		ss = lg->ss1;
 		push_guest_stack(lg, &gstack, regs->ss);
 		push_guest_stack(lg, &gstack, regs->esp);
 	} else {
@@ -43,21 +55,18 @@ int reflect_trap(struct lguest *lg, cons
 	regs->ss = ss;
 	regs->esp = (u32)gstack + lg->page_offset;
 	regs->cs = (__KERNEL_CS|GUEST_DPL);
-	regs->eip = trap->addr;
-
-	/* GS will be neutered on way back to guest. */
-	put_user(0, &lg->lguest_data->gs_gpf_eip);
+	regs->eip = idt_address(lo, hi);
 
 	/* Disable interrupts for an interrupt gate. */
-	if (trap->disable_interrupts)
+	if (idt_type(lo, hi) == 0xE)
 		put_user(0, &lg->lguest_data->irq_enabled);
-	return 1;
 }
 
 void maybe_do_interrupt(struct lguest *lg)
 {
 	unsigned int irq;
 	DECLARE_BITMAP(irqs, LGUEST_IRQS);
+	struct desc_struct *idt;
 
 	if (!lg->lguest_data)
 		return;
@@ -87,20 +96,36 @@ void maybe_do_interrupt(struct lguest *l
 			return;
 	}
 
-	if (lg->interrupt[irq].addr != 0) {
+	idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
+	if (idt_present(idt->a, idt->b)) {
 		clear_bit(irq, lg->irqs_pending);
-		reflect_trap(lg, &lg->interrupt[irq], 0);
-	}
+		reflect_trap(lg, idt->a, idt->b, 0);
+	}
+}
+
+static int has_err(unsigned int trap)
+{
+	return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
+}
+
+int deliver_trap(struct lguest *lg, unsigned int num)
+{
+	u32 lo = lg->idt[num].a, hi = lg->idt[num].b;
+
+	if (!idt_present(lo, hi))
+		return 0;
+	reflect_trap(lg, lo, hi, has_err(num));
+	return 1;
 }
 
 void check_bug_kill(struct lguest *lg)
 {
 #ifdef CONFIG_BUG
-	u32 eip = lg->state->regs.eip - PAGE_OFFSET;
+	u32 eip = lg->regs.eip - PAGE_OFFSET;
 	u16 insn;
 
 	/* This only works for addresses in linear mapping... */
-	if (lg->state->regs.eip < PAGE_OFFSET)
+	if (lg->regs.eip < PAGE_OFFSET)
 		return;
 	lhread(lg, &insn, eip, sizeof(insn));
 	if (insn == 0x0b0f) {
@@ -120,111 +145,125 @@ void check_bug_kill(struct lguest *lg)
 #endif	/* CONFIG_BUG */
 }
 
-static void copy_trap(struct lguest *lg,
-		      struct host_trap *trap,
-		      const struct desc_struct *desc)
-{
-	u8 type = ((desc->b >> 8) & 0xF);
-
-	/* Not present? */
-	if (!(desc->b & 0x8000)) {
-		trap->addr = 0;
-		return;
-	}
+static int direct_trap(const struct lguest *lg,
+		       const struct desc_struct *trap,
+		       unsigned int num)
+{
+	/* Hardware interrupts don't go to guest (except syscall). */
+	if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
+		return 0;
+
+	/* We intercept page fault (demand shadow paging & cr2 saving),
+	   protection fault (in/out emulation), device not
+	   available (TS handling), and hypercall. */
+	if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY)
+		return 0;
+
+	/* Interrupt gates (0xE) or not present (0x0) can't go direct. */
+	return idt_type(trap->a, trap->b) == 0xF;
+}
+
+/* Set up trap in IDT. */
+static void set_trap(struct lguest *lg, struct desc_struct *trap,
+		     unsigned int num, u32 lo, u32 hi)
+{
+	u8 type = idt_type(lo, hi);
+
+	if (!idt_present(lo, hi)) {
+		trap->a = trap->b = 0;
+		return;
+	}
+
 	if (type != 0xE && type != 0xF)
 		kill_guest(lg, "bad IDT type %i", type);
-	trap->disable_interrupts = (type == 0xE);
-	trap->addr = ((desc->a & 0x0000FFFF) | (desc->b & 0xFFFF0000));
-}
-
-/* FIXME: Put this in hypervisor.S and do something clever with relocs? */
-static u8 tramp[]
-= { 0x0f, 0xa8, 0x0f, 0xa9, /* push %gs; pop %gs */
-    0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00,
-    /* movl 0, %ss:lguest_data.gs_gpf_eip */
-    0xe9, 0x55, 0x55, 0x55, 0x55 /* jmp dstaddr */
-};
-#define TRAMP_MOVL_TARGET_OFF 7
-#define TRAMP_JMP_TARGET_OFF 16
-
-static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr)
-{
-	u32 addr, off;
-
-	off = sizeof(tramp)*i;
-	memcpy(lg->trap_page + off, tramp, sizeof(tramp));
-
-	/* 0 is to be placed in lguest_data.gs_gpf_eip. */
-	addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset;
-	memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4);
-
-	/* Address is relative to where end of jmp will be. */
-	addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp));
-	memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4);
-	return (-4*1024*1024) + off;
-}
-
-/* We bounce through the trap page, for two reasons: firstly, we need
-   the interrupt destination always mapped, to avoid double faults,
-   secondly we want to reload %gs to make it innocuous on entering kernel.
- */
-static void setup_idt(struct lguest *lg,
-		      unsigned int i,
-		      const struct desc_struct *desc)
-{
-	u8 type = ((desc->b >> 8) & 0xF);
-	u32 taddr;
-
-	/* Not present? */
-	if (!(desc->b & 0x8000)) {
-		/* FIXME: When we need this, we'll know... */
-		if (lg->state->idt_table[i].a & 0x8000)
-			kill_guest(lg, "removing interrupts not supported");
-		return;
-	}
-
-	/* We could reflect and disable interrupts, but guest can do itself. */
-	if (type != 0xF)
-		kill_guest(lg, "bad direct IDT %i type %i", i, type);
-
-	taddr = setup_trampoline(lg, i, (desc->a&0xFFFF)|(desc->b&0xFFFF0000));
-
-	lg->state->idt_table[i].a = (((__KERNEL_CS|GUEST_DPL)<<16)
-					| (taddr & 0x0000FFFF));
-	lg->state->idt_table[i].b = (desc->b&0xEF00)|(taddr&0xFFFF0000);
-}
-
-void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 high)
-{
-	struct desc_struct d = { low, high };
-
-	switch (i) {
-	/* Ignore NMI, doublefault, hypercall, spurious interrupt. */
-	case 2:
-	case 8:
-	case 15:
-	case LGUEST_TRAP_ENTRY:
-	/* FIXME: We should handle debug and int3 */
-	case 1:
-	case 3:
-		return;
-	/* We intercept page fault, general protection fault and fpu missing */
-	case 13:
-		copy_trap(lg, &lg->gpf_trap, &d);
-		return;
-	case 14:
-		copy_trap(lg, &lg->page_trap, &d);
-		return;
-	case 7:
-		copy_trap(lg, &lg->fpu_trap, &d);
-		return;
-	}
-
-	/* Other traps go straight to guest. */
-	if (i < FIRST_EXTERNAL_VECTOR || i == SYSCALL_VECTOR)
-		setup_idt(lg, i, &d);
-	/* A virtual interrupt */
-	else if (i < FIRST_EXTERNAL_VECTOR + LGUEST_IRQS)
-		copy_trap(lg, &lg->interrupt[i-FIRST_EXTERNAL_VECTOR], &d);
-}
-
+
+	trap->a = ((__KERNEL_CS|GUEST_DPL)<<16) | (lo&0x0000FFFF);
+	trap->b = (hi&0xFFFFEF00);
+
+	/* Make sure trap address is available so we don't fault.  In
+	 * theory, it could overlap two pages, in practice it's aligned. */
+	if (direct_trap(lg, trap, num))
+		pin_page(lg, idt_address(lo, hi));
+}
+
+void pin_stack_pages(struct lguest *lg)
+{
+	unsigned int i;
+
+	for (i = 0; i < lg->stack_pages; i++)
+		pin_page(lg, lg->esp1 - i * PAGE_SIZE);
+}
+
+/* We need to ensure all the direct trap pages are mapped after we
+ * clear shadow mappings. */
+void pin_trap_pages(struct lguest *lg)
+{
+	unsigned int i;
+	struct desc_struct *trap;
+
+	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
+		trap = &lg->idt[i];
+		if (direct_trap(lg, trap, i))
+			pin_page(lg, idt_address(trap->a, trap->b));
+	}
+
+	trap = &lg->syscall_idt;
+	if (direct_trap(lg, trap, SYSCALL_VECTOR))
+		pin_page(lg, idt_address(trap->a, trap->b));
+	pin_stack_pages(lg);
+}
+
+void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
+{
+	/* Guest never handles: NMI, doublefault, hypercall, spurious irq. */
+	if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
+		return;
+
+	if (num < ARRAY_SIZE(lg->idt))
+		set_trap(lg, &lg->idt[num], num, lo, hi);
+	else if (num == SYSCALL_VECTOR)
+		set_trap(lg, &lg->syscall_idt, num, lo, hi);
+}
+
+static void default_idt_entry(struct desc_struct *idt,
+			      int trap,
+			      const unsigned long def)
+{
+	u32 flags = 0x8e00;
+
+	/* They can't "int" into any of them except hypercall. */
+	if (trap == LGUEST_TRAP_ENTRY)
+		flags |= (GUEST_DPL << 13);
+
+	idt->a = (LGUEST_CS<<16) | (def&0x0000FFFF);
+	idt->b = (def&0xFFFF0000) | flags;
+}
+
+void setup_default_idt_entries(struct lguest_ro_state *state, 
+			       const unsigned long *def)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
+		default_idt_entry(&state->guest_idt[i], i, def[i]);
+}
+
+void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+		const unsigned long *def)
+{
+	unsigned int i;
+
+	/* All hardware interrupts are same whatever the guest: only the
+	 * traps might be different. */
+	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
+		if (direct_trap(lg, &lg->idt[i], i))
+			idt[i] = lg->idt[i];
+		else
+			default_idt_entry(&idt[i], i, def[i]);
+	}
+	i = SYSCALL_VECTOR;
+	if (direct_trap(lg, &lg->syscall_idt, i))
+		idt[i] = lg->syscall_idt;
+	else
+		default_idt_entry(&idt[i], i, def[i]);
+}
diff -r 7a963f6eef0a arch/i386/lguest/lg.h
--- a/arch/i386/lguest/lg.h	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/lguest/lg.h	Fri Mar 09 13:00:01 2007 +1100
@@ -45,13 +45,6 @@ __init int init_pagetables(struct page *
 #define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
 #define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
 
-/* Simplified version of IDT. */
-struct host_trap
-{
-	unsigned long addr;
-	int disable_interrupts;
-};
-
 struct lguest_dma_info
 {
 	struct list_head list;
@@ -69,10 +62,66 @@ struct pgdir
 	u32 *pgdir;
 };
 
+/* Hardware-defined TSS structure. */
+struct x86_tss
+{
+	unsigned short	back_link,__blh;
+	unsigned long	esp0;
+	unsigned short	ss0,__ss0pad;
+	unsigned long	esp1;
+	unsigned short	ss1,__ss1pad;
+	unsigned long	esp2;
+	unsigned short	ss2,__ss2pad;
+	unsigned long	cr3;
+	unsigned long	eip;
+	unsigned long	eflags;
+	unsigned long	eax,ecx,edx,ebx;
+	unsigned long	esp;
+	unsigned long	ebp;
+	unsigned long	esi;
+	unsigned long	edi;
+	unsigned short	es, __espad;
+	unsigned short	cs, __cspad;
+	unsigned short	ss, __sspad;
+	unsigned short	ds, __dspad;
+	unsigned short	fs, __fspad;
+	unsigned short	gs, __gspad;
+	unsigned short	ldt, __ldtpad;
+	unsigned short	trace, io_bitmap_base;
+};
+
+/* This is a guest-specific page, mapped read-only into the guest. */
+struct lguest_ro_state
+{
+	/* Host information we need to restore when we switch back. */
+	u32 host_cr3;
+	struct Xgt_desc_struct host_idt_desc;
+	struct Xgt_desc_struct host_gdt_desc;
+	u32 host_sp;
+
+	/* Fields which are used when guest is running. */
+	struct Xgt_desc_struct guest_idt_desc;
+	struct Xgt_desc_struct guest_gdt_desc;
+	struct x86_tss guest_tss;
+	struct desc_struct guest_idt[IDT_ENTRIES];
+	struct desc_struct guest_gdt[GDT_ENTRIES];
+};
+
+/* We have two pages shared with guests, per cpu.  */
+struct lguest_pages
+{
+	/* This is the stack page mapped rw in guest */
+	char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
+	struct lguest_regs regs;
+
+	/* This is the host state & guest descriptor page, ro in guest */
+	struct lguest_ro_state state;
+} __attribute__((aligned(PAGE_SIZE)));
+
 /* The private info the thread maintains about the guest. */
 struct lguest
 {
-	struct lguest_state *state;
+	struct lguest_regs regs;
 	struct lguest_data __user *lguest_data;
 	struct task_struct *tsk;
 	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
@@ -84,15 +133,14 @@ struct lguest
 	int timer_on;
 	int halted;
 	int ts;
-	u32 gpf_eip;
 	u32 last_timer;
 	u32 next_hcall;
-	u16 tls_limits[GDT_ENTRY_TLS_ENTRIES];
+	u32 esp1;
+	u8 ss1;
 
 	/* We keep a small number of these. */
 	u32 pgdidx;
 	struct pgdir pgdirs[4];
-	void *trap_page;
 
 	/* Cached wakeup: we hold a reference to this task. */
 	struct task_struct *wake;
@@ -109,14 +157,15 @@ struct lguest
 	/* Dead? */
 	const char *dead;
 
-	/* We intercept page fault (demand shadow paging & cr2 saving)
-	   protection fault (in/out emulation, TLS handling) and
-	   device not available (TS handling). */
-	struct host_trap page_trap, gpf_trap, fpu_trap;
-
-	/* Virtual interrupts */
+	/* The GDT entries copied into lguest_ro_state when running. */
+	struct desc_struct gdt[GDT_ENTRIES];
+
+	/* The IDT entries: some copied into lguest_ro_state when running. */
+	struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS];
+	struct desc_struct syscall_idt;
+
+	/* Pending virtual interrupts */
 	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
-	struct host_trap interrupt[LGUEST_IRQS];
 };
 
 extern struct lguest lguests[];
@@ -125,7 +174,6 @@ extern struct mutex lguest_lock;
 /* core.c: */
 /* Entry points in hypervisor */
 const unsigned long *__lguest_default_idt_entries(void);
-struct lguest_state *__lguest_states(void);
 u32 lhread_u32(struct lguest *lg, u32 addr);
 void lhwrite_u32(struct lguest *lg, u32 val, u32 addr);
 void lhread(struct lguest *lg, void *buf, u32 addr, unsigned bytes);
@@ -136,15 +184,24 @@ int find_free_guest(void);
 
 /* interrupts_and_traps.c: */
 void maybe_do_interrupt(struct lguest *lg);
-int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err);
+int deliver_trap(struct lguest *lg, unsigned int num);
 void check_bug_kill(struct lguest *lg);
 void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
+void pin_stack_pages(struct lguest *lg);
+void pin_trap_pages(struct lguest *lg);
+void setup_default_idt_entries(struct lguest_ro_state *state, 
+			       const unsigned long *def);
+void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+		const unsigned long *def);
 
 /* segments.c: */
+void setup_default_gdt_entries(struct lguest_ro_state *state);
 void load_guest_gdt(struct lguest *lg, u32 table, u32 num);
 void guest_load_tls(struct lguest *lg,
 		    const struct desc_struct __user *tls_array);
-
+void copy_gdt(const struct lguest *lg, struct desc_struct *gdt);
+
+/* page_tables.c: */
 int init_guest_pagetable(struct lguest *lg, u32 pgtable);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lguest *lg, u32 pgtable);
@@ -153,12 +210,15 @@ void guest_pagetable_flush_user(struct l
 void guest_pagetable_flush_user(struct lguest *lg);
 void guest_set_pte(struct lguest *lg, unsigned long cr3,
 		   unsigned long vaddr, u32 val);
-void map_trap_page(struct lguest *info);
+void map_hypervisor_in_guest(struct lguest *lg);
 int demand_page(struct lguest *info, u32 cr2, int write);
-void pin_stack_pages(struct lguest *lg);
-
+void pin_page(struct lguest *lg, u32 addr);
+
+/* lguest_user.c: */
 int lguest_device_init(void);
 void lguest_device_remove(void);
+
+/* io.c: */
 void lguest_io_init(void);
 u32 bind_dma(struct lguest *lg,
 	     unsigned long addr, unsigned long udma, u16 numdmas,u8 interrupt);
@@ -167,8 +227,9 @@ void release_all_dma(struct lguest *lg);
 void release_all_dma(struct lguest *lg);
 unsigned long get_dma_buffer(struct lguest *lg, unsigned long addr,
 			     unsigned long *interrupt);
-
 void set_wakeup_process(struct lguest *lg, struct task_struct *p);
+
+/* hypercalls.c: */
 int do_async_hcalls(struct lguest *info);
 int hypercall(struct lguest *info, struct lguest_regs *regs);
 
@@ -185,65 +246,5 @@ static inline unsigned long guest_pa(str
 {
 	return vaddr - lg->page_offset;
 }
-
-/* Hardware-defined TSS structure. */
-struct x86_tss
-{
-	unsigned short	back_link,__blh;
-	unsigned long	esp0;
-	unsigned short	ss0,__ss0pad;
-	unsigned long	esp1;
-	unsigned short	ss1,__ss1pad;
-	unsigned long	esp2;
-	unsigned short	ss2,__ss2pad;
-	unsigned long	cr3;
-	unsigned long	eip;
-	unsigned long	eflags;
-	unsigned long	eax,ecx,edx,ebx;
-	unsigned long	esp; /* We actually use this one to save esp. */
-	unsigned long	ebp;
-	unsigned long	esi;
-	unsigned long	edi;
-	unsigned short	es, __espad;
-	unsigned short	cs, __cspad;
-	unsigned short	ss, __sspad;
-	unsigned short	ds, __dspad;
-	unsigned short	fs, __fspad;
-	unsigned short	gs, __gspad;
-	unsigned short	ldt, __ldtpad;
-	unsigned short	trace, io_bitmap_base;
-};
-
-int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
-		    struct lguest_regs *regs, struct x86_tss *tss);
-
-struct lguest_host_state
-{
-	struct Xgt_desc_struct	gdt;
-	struct Xgt_desc_struct	idt;
-	unsigned long		pgdir;
-	unsigned long		stackptr;
-};
-
-/* This sits in the high-mapped shim. */
-struct lguest_state
-{
-	/* Task struct. */
-	struct x86_tss tss;
-
-	/* Gate descriptor table. */
-	struct Xgt_desc_struct gdt;
-	struct desc_struct gdt_table[GDT_ENTRIES];
-
-	/* Interrupt descriptor table. */
-	struct Xgt_desc_struct idt;
-	struct desc_struct idt_table[IDT_ENTRIES];
-
-	/* Host state we store while the guest runs. */
-	struct lguest_host_state host;
-
-	/* This is the stack on which we push our regs. */
-	struct lguest_regs regs;
-};
 #endif	/* __ASSEMBLY__ */
 #endif	/* _LGUEST_H */
diff -r 7a963f6eef0a arch/i386/lguest/lguest_user.c
--- a/arch/i386/lguest/lguest_user.c	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/lguest/lguest_user.c	Fri Mar 09 12:56:33 2007 +1100
@@ -4,40 +4,9 @@
 #include <linux/fs.h>
 #include "lg.h"
 
-static struct lguest_state *setup_guest_state(unsigned int num,
-					      unsigned long start)
+static void setup_regs(struct lguest_regs *regs, unsigned long start)
 {
-	struct lguest_state *guest = &__lguest_states()[num];
-	unsigned int i;
-	const long *def = __lguest_default_idt_entries();
-	struct lguest_regs *regs;
-
-	guest->gdt_table[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
-	guest->gdt_table[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
-	guest->gdt.size = GDT_ENTRIES*8-1;
-	guest->gdt.address = (unsigned long)&guest->gdt_table;
-
-	/* Other guest's IDTs are initialized from default. */
-	guest->idt.size = 8 * IDT_ENTRIES;
-	guest->idt.address = (long)guest->idt_table;
-	for (i = 0; i < IDT_ENTRIES; i++) {
-		u32 flags = 0x8e00;
-
-		/* They can't "int" into any of them except hypercall. */
-		if (i == LGUEST_TRAP_ENTRY)
-			flags |= (GUEST_DPL << 13);
-
-		guest->idt_table[i].a = (LGUEST_CS<<16) | (def[i]&0x0000FFFF);
-		guest->idt_table[i].b = (def[i]&0xFFFF0000) | flags;
-	}
-
-	memset(&guest->tss, 0, sizeof(guest->tss));
-	guest->tss.ss0 = LGUEST_DS;
-	guest->tss.esp0 = (unsigned long)(guest+1);
-	guest->tss.io_bitmap_base = sizeof(guest->tss); /* No I/O for you! */
-
 	/* Write out stack in format lguest expects, so we can switch to it. */
-	regs = &guest->regs;
 	regs->eax = regs->ebx = regs->ecx = regs->edx = regs->esp = 0;
 	regs->edi = LGUEST_MAGIC_EDI;
 	regs->ebp = LGUEST_MAGIC_EBP;
@@ -49,12 +18,6 @@ static struct lguest_state *setup_guest_
 	regs->cs = __KERNEL_CS|GUEST_DPL;
 	regs->eflags = 0x202; 	/* Interrupts enabled. */
 	regs->ss = __KERNEL_DS|GUEST_DPL;
-
-	if (!fixup_gdt_table(guest->gdt_table, ARRAY_SIZE(guest->gdt_table),
-			     &guest->regs, &guest->tss))
-		return NULL;
-
-	return guest;
 }
 
 /* + addr */
@@ -138,32 +101,18 @@ static int initialize(struct file *file,
 	lg->pfn_limit = args[0];
 	lg->page_offset = args[3];
 
-	lg->trap_page = (u32 *)get_zeroed_page(GFP_KERNEL);
-	if (!lg->trap_page) {
-		err = -ENOMEM;
-		goto release_guest;
-	}
-
 	err = init_guest_pagetable(lg, args[1]);
 	if (err)
-		goto free_trap_page;
+		goto release_guest;
 
-	lg->state = setup_guest_state(i, args[2]);
-	if (!lg->state) {
-		err = -ENOEXEC;
-		goto release_pgtable;
-	}
+	setup_regs(&lg->regs, args[2]);
+	lg->tsk = current;
+	lg->mm = get_task_mm(current);
 	mutex_unlock(&lguest_lock);
 
-	lg->tsk = current;
-	lg->mm = get_task_mm(current);
 	file->private_data = lg;
 	return sizeof(args);
 
-release_pgtable:
-	free_guest_pagetable(lg);
-free_trap_page:
-	free_page((long)lg->trap_page);
 release_guest:
 	memset(lg, 0, sizeof(*lg));
 unlock:
@@ -207,12 +156,10 @@ static int close(struct inode *inode, st
 
 	mutex_lock(&lguest_lock);
 	release_all_dma(lg);
-	free_page((long)lg->trap_page);
 	free_guest_pagetable(lg);
 	mmput(lg->mm);
 	if (lg->dead != (void *)1)
 		kfree(lg->dead);
-	memset(lg->state, 0, sizeof(*lg->state));
 	memset(lg, 0, sizeof(*lg));
 	mutex_unlock(&lguest_lock);
 	return 0;
diff -r 7a963f6eef0a arch/i386/lguest/page_tables.c
--- a/arch/i386/lguest/page_tables.c	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/lguest/page_tables.c	Fri Mar 09 12:56:33 2007 +1100
@@ -168,16 +168,10 @@ static int page_writable(struct lguest *
 	return (*pte & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
 }
 
-void pin_stack_pages(struct lguest *lg)
-{
-	unsigned int i;
-	u32 stack = lg->state->tss.esp1;
-
-	for (i = 0; i < lg->stack_pages; i++) {
-		if (!page_writable(lg, stack - i * PAGE_SIZE)
-		    && !demand_page(lg, stack - i * PAGE_SIZE, 1))
-			kill_guest(lg, "bad stack page %i@%#x", i, stack);
-	}
+void pin_page(struct lguest *lg, u32 addr)
+{
+	if (!page_writable(lg, addr) && !demand_page(lg, addr, 0))
+		kill_guest(lg, "bad trap page %#x", addr);
 }
 
 static unsigned int find_pgdir(struct lguest *lg, u32 pgtable)
@@ -243,7 +237,7 @@ void guest_new_pagetable(struct lguest *
 	lg->pgdidx = newpgdir;
 	lg->cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
 	if (repin)
-		pin_stack_pages(lg);
+		pin_trap_pages(lg);
 }
 
 static void release_all_pagetables(struct lguest *lg)
@@ -259,7 +253,7 @@ void guest_pagetable_clear_all(struct lg
 void guest_pagetable_clear_all(struct lguest *lg)
 {
 	release_all_pagetables(lg);
-	pin_stack_pages(lg);
+	pin_trap_pages(lg);
 }
 
 static void do_set_pte(struct lguest *lg, int idx,
@@ -329,11 +323,9 @@ void free_guest_pagetable(struct lguest 
 }
 
 /* Caller must be preempt-safe */
-void map_trap_page(struct lguest *lg)
+void map_hypervisor_in_guest(struct lguest *lg)
 {
 	int cpu = smp_processor_id();
-
-	hypervisor_pte_page(cpu)[0] = (__pa(lg->trap_page)|_PAGE_PRESENT);
 
 	/* Since hypervisor less that 4MB, we simply mug top pte page. */
 	lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] =
@@ -356,10 +348,18 @@ static __init void populate_hypervisor_p
 	u32 *pte = hypervisor_pte_page(cpu);
 
 	for (i = 0; i < pages; i++) {
-		/* First entry set dynamically in map_trap_page */
-		pte[i+1] = ((page_to_pfn(hype_page[i]) << PAGE_SHIFT) 
-			    | _PAGE_KERNEL_EXEC);
-	}
+		pte[i] = ((page_to_pfn(hype_page[i]) << PAGE_SHIFT) 
+			    | _PAGE_PRESENT|_PAGE_ACCESSED);
+	}
+
+	/* We only map this CPU's pages, so guest can't see others. */
+	i = pages + cpu*2;
+
+	/* First page (regs) is rw, second (state) is ro. */
+	pte[i] = ((page_to_pfn(hype_page[i]) << PAGE_SHIFT)
+		  | _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW);
+	pte[i+1] = ((page_to_pfn(hype_page[i+1]) << PAGE_SHIFT)
+		    | _PAGE_PRESENT|_PAGE_ACCESSED);
 }
 
 __init int init_pagetables(struct page **hype_page, int pages)
diff -r 7a963f6eef0a arch/i386/lguest/segments.c
--- a/arch/i386/lguest/segments.c	Thu Mar 08 17:01:08 2007 +1100
+++ b/arch/i386/lguest/segments.c	Fri Mar 09 12:56:33 2007 +1100
@@ -1,171 +1,114 @@
 #include "lg.h"
 
-/* Dealing with GDT entries is such a horror, I convert to sanity and back */
-struct decoded_gdt_entry
+static int desc_ok(const struct desc_struct *gdt)
 {
-	u32 base, limit;
-	union {
-		struct {
-			unsigned type:4;
-			unsigned dtype:1;
-			unsigned dpl:2;
-			unsigned present:1;
-			unsigned unused:4;
-			unsigned avl:1;
-			unsigned mbz:1;
-			unsigned def:1;
-			unsigned page_granularity:1;
-		};
-		u16 raw_attributes;
-	};
-};
-
-static struct decoded_gdt_entry decode_gdt_entry(const struct desc_struct *en)
-{
-	struct decoded_gdt_entry de;
-	de.base = ((en->a >> 16) | ((en->b & 0xff) << 16)
-		   | (en->b & 0xFF000000));
-	de.limit = ((en->a & 0xFFFF) | (en->b & 0xF0000));
-	de.raw_attributes = (en->b >> 8);
-	return de;
+	/* MBZ=0, P=1, DT=1  */
+	return ((gdt->b & 0x00209000) == 0x00009000);
 }
 
-static struct desc_struct encode_gdt_entry(const struct decoded_gdt_entry *de)
+static int segment_present(const struct desc_struct *gdt)
 {
-	struct desc_struct en;
-	en.a = ((de->limit & 0xFFFF) | (de->base << 16));
-	en.b = (((de->base >> 16) & 0xFF)
-		 | ((((u32)de->raw_attributes) & 0xF0FF) << 8)
-		 | (de->limit & 0xF0000)
-		 | (de->base & 0xFF000000));
-	return en;
+	return gdt->b & 0x8000;
 }
 
-static int check_desc(const struct decoded_gdt_entry *dec)
+static int ignored_gdt(unsigned int num)
 {
-	return (dec->mbz == 0 && dec->dtype == 1 && (dec->type & 4) == 0);
+	return (num == GDT_ENTRY_TSS
+		|| num == GDT_ENTRY_LGUEST_CS
+		|| num == GDT_ENTRY_LGUEST_DS
+		|| num == GDT_ENTRY_KERNEL_CS
+		|| num == GDT_ENTRY_KERNEL_DS
+		|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
 }
 
-static void check_segment(const struct desc_struct *gdt, u32 *segreg)
+/* We don't allow removal of CS, DS or SS; it doesn't make sense. */
+static void check_segment_use(struct lguest *lg, unsigned int desc)
 {
-	if (*segreg > 255 || !(gdt[*segreg >> 3].b & 0x8000))
-		*segreg = 0;
+	if (lg->regs.gs / 8 == desc)
+		lg->regs.gs = 0;
+	if (lg->regs.fs / 8 == desc)
+		lg->regs.fs = 0;
+	if (lg->regs.es / 8 == desc)
+		lg->regs.es = 0;
+	if (lg->regs.ds / 8 == desc
+	    || lg->regs.cs / 8 == desc
+	    || lg->regs.ss / 8 == desc)
+		kill_guest(lg, "Removed live GDT entry %u", desc);
 }
 
-/* Ensure our manually-loaded segment regs don't fault in switch_to_guest. */
-static void check_live_segments(const struct desc_struct *gdt,
-				struct lguest_regs *regs)
+static void fixup_gdt_table(struct lguest *lg)
 {
-	check_segment(gdt, &regs->es);
-	check_segment(gdt, &regs->ds);
-	check_segment(gdt, &regs->fs);
-	check_segment(gdt, &regs->gs);
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(lg->gdt); i++) {
+		/* We never copy these ones to real gdt */
+		if (ignored_gdt(i))
+			continue;
+
+		/* We could fault in switch_to_guest if they are using
+		 * a removed segment. */
+		if (!segment_present(&lg->gdt[i])) {
+			check_segment_use(lg, i);
+			continue;
+		}
+
+		if (!desc_ok(&lg->gdt[i]))
+			kill_guest(lg, "Bad GDT descriptor %i", i);
+
+		/* DPL 0 presumably means "for use by guest". */
+		if ((lg->gdt[i].b & 0x00006000) == 0)
+			lg->gdt[i].b |= (GUEST_DPL << 13);
+
+		/* Set accessed bit, since gdt isn't writable. */
+		lg->gdt[i].b |= 0x00000100;
+	}
 }
 
-int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
-		    struct lguest_regs *regs, struct x86_tss *tss)
+void setup_default_gdt_entries(struct lguest_ro_state *state)
 {
-	unsigned int i;
-	struct decoded_gdt_entry dec;
+	struct desc_struct *gdt = state->guest_gdt;
+	unsigned long tss = (unsigned long)&state->guest_tss;
 
-	for (i = 0; i < num; i++) {
-		unsigned long base, length;
-
-		/* We override these ones, so we don't care what they give. */
-		if (i == GDT_ENTRY_TSS
-		    || i == GDT_ENTRY_LGUEST_CS
-		    || i == GDT_ENTRY_LGUEST_DS
-		    || i == GDT_ENTRY_DOUBLEFAULT_TSS)
-			continue;
-
-		dec = decode_gdt_entry(&gdt[i]);
-		if (!dec.present)
-			continue;
-
-		if (!check_desc(&dec))
-			return 0;
-
-		base = dec.base;
-		length = dec.limit + 1;
-		if (dec.page_granularity) {
-			base *= PAGE_SIZE;
-			length *= PAGE_SIZE;
-		}
-
-		/* Unacceptable base? */
-		if (base >= HYPE_ADDR)
-			return 0;
-
-		/* Wrap around or segment overlaps hypervisor mem? */
-		if (!length
-		    || base + length < base
-		    || base + length > HYPE_ADDR) {
-			/* Trim to edge of hypervisor. */
-			length = HYPE_ADDR - base;
-			if (dec.page_granularity)
-				dec.limit = (length / PAGE_SIZE) - 1;
-			else
-				dec.limit = length - 1;
-		}
-		if (dec.dpl == 0)
-			dec.dpl = GUEST_DPL;
-		gdt[i] = encode_gdt_entry(&dec);
-	}
-	check_live_segments(gdt, regs);
-
-	/* Now put in hypervisor data and code segments. */
+	/* Hypervisor segments. */
 	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
 	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
 
-	/* Finally, TSS entry */
-	dec.base = (unsigned long)tss;
-	dec.limit = sizeof(*tss)-1;
-	dec.type = 0x9;
-	dec.dtype = 0;
-	dec.def = 0;
-	dec.present = 1;
-	dec.mbz = 0;
-	dec.page_granularity = 0;
-	gdt[GDT_ENTRY_TSS] = encode_gdt_entry(&dec);
+	/* Guest data and code segments: modified to DPL 1.
+	 * We don't copy these from host for ease of setup. */
+	gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+	gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+	gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_DPL << 13);
+	gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_DPL << 13);
 
-	return 1;
+	/* This is the one which we *cannot* copy from guest, since tss
+	   depends on this lguest_ro_state, i.e. this cpu. */
+	gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
+	gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) 
+		| ((tss >> 16) & 0x000000FF);
+}
+
+void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
+{
+	unsigned int i;
+
+	for (i = 0; i < GDT_ENTRIES; i++)
+		if (!ignored_gdt(i))
+			gdt[i] = lg->gdt[i];
 }
 
 void load_guest_gdt(struct lguest *lg, u32 table, u32 num)
 {
-	if (num > GDT_ENTRIES)
+	if (num > ARRAY_SIZE(lg->gdt))
 		kill_guest(lg, "too many gdt entries %i", num);
 
-	lhread(lg, lg->state->gdt_table, table,
-	       num * sizeof(lg->state->gdt_table[0]));
-	if (!fixup_gdt_table(lg->state->gdt_table, num,
-			     &lg->state->regs, &lg->state->tss))
-		kill_guest(lg, "bad gdt table");
+	lhread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
+	fixup_gdt_table(lg);
 }
 
-/* We don't care about limit here, since we only let them use these in
- * usermode (where lack of USER bit in pagetable protects hypervisor mem).
- * However, we want to ensure it doesn't fault when loaded, since *we* are
- * the ones who will load it in switch_to_guest.
- */
 void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls)
 {
-	unsigned int i;
-	struct desc_struct *tls = &lg->state->gdt_table[GDT_ENTRY_TLS_MIN];
+	struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN];
 
 	lhread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
-	for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++) {
-		struct decoded_gdt_entry dec = decode_gdt_entry(&tls[i]);
-
-		if (!dec.present)
-			continue;
-
-		/* We truncate to one byte/page (depending on G bit) to neuter
-		   it, so ensure it's more than 1 page below trap page. */
-		tls[i].a &= 0xFFFF0000;
-		lg->tls_limits[i] = dec.limit;
-		if (!check_desc(&dec) || dec.base > HYPE_ADDR - PAGE_SIZE)
-			kill_guest(lg, "bad TLS descriptor %i", i);
-	}
-	check_live_segments(lg->state->gdt_table, &lg->state->regs);
+	fixup_gdt_table(lg);
 }
diff -r 7a963f6eef0a include/asm-i386/lguest.h
--- a/include/asm-i386/lguest.h	Thu Mar 08 17:01:08 2007 +1100
+++ b/include/asm-i386/lguest.h	Thu Mar 08 17:21:16 2007 +1100
@@ -59,9 +59,6 @@ struct lguest_data
 	/* Blocked interrupts. */
 	DECLARE_BITMAP(interrupts, LGUEST_IRQS);
 
-	/* Last (userspace) address we got a GPF & reloaded gs. */
-	unsigned int gs_gpf_eip;
-
 	/* Virtual address of page fault. */
 	unsigned long cr2;
 



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 8/9] lguest: Optimize away copy in and out of per-cpu guest pages
  2007-03-09  3:23           ` [PATCH 7/9] lguest: use read-only pages rather than segments to protect high-mapped switcher Rusty Russell
@ 2007-03-09  3:30             ` Rusty Russell
  2007-03-09  3:32               ` [PATCH 9/9] lguest: don't crash host on NMI Rusty Russell
  0 siblings, 1 reply; 11+ messages in thread
From: Rusty Russell @ 2007-03-09  3:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List

Rather than copying in the IDT, GDT and TSS every time, we only need to
do it when something has changed (i.e. the guest IDT/GDT/TSS has changed,
the guest has changed CPU, or the CPU has just run another guest).

For the registers, we simply allocate them an entire page and map that
over the stack page in the guest.

This restores context switch speed to be comparable to the old
segment-using lguest.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 8286b7923a5b arch/i386/lguest/core.c
--- a/arch/i386/lguest/core.c	Fri Mar 09 13:09:39 2007 +1100
+++ b/arch/i386/lguest/core.c	Fri Mar 09 13:09:48 2007 +1100
@@ -37,6 +37,7 @@ static struct {
 	unsigned short segment;
 } lguest_entry __attribute_used__;
 DEFINE_MUTEX(lguest_lock);
+static DEFINE_PER_CPU(struct lguest *, last_guest);
 
 /* FIXME: Make dynamic. */
 #define MAX_LGUEST_GUESTS 16
@@ -144,10 +145,10 @@ static int emulate_insn(struct lguest *l
 {
 	u8 insn;
 	unsigned int insnlen = 0, in = 0, shift = 0;
-	unsigned long physaddr = guest_pa(lg, lg->regs.eip);
+	unsigned long physaddr = guest_pa(lg, lg->regs->eip);
 
 	/* This only works for addresses in linear mapping... */
-	if (lg->regs.eip < lg->page_offset)
+	if (lg->regs->eip < lg->page_offset)
 		return 0;
 	lhread(lg, &insn, physaddr, 1);
 
@@ -180,11 +181,11 @@ static int emulate_insn(struct lguest *l
 	if (in) {
 		/* Lower bit tells is whether it's a 16 or 32 bit access */
 		if (insn & 0x1)
-			lg->regs.eax = 0xFFFFFFFF;
+			lg->regs->eax = 0xFFFFFFFF;
 		else
-			lg->regs.eax |= (0xFFFF << shift);
-	}
-	lg->regs.eip += insnlen;
+			lg->regs->eax |= (0xFFFF << shift);
+	}
+	lg->regs->eip += insnlen;
 	return 1;
 }
 
@@ -260,36 +261,35 @@ static void run_guest_once(struct lguest
 		     : "memory", "%edx", "%ecx", "%edi", "%esi");
 }
 
-static void copy_in_guest_info(struct lguest_pages *pages,
-			       struct lguest *lg)
-{
-	/* Copy in regs. */
-	pages->regs = lg->regs;
-
-	/* TSS entries for direct traps. */
+static void copy_in_guest_info(struct lguest_pages *pages, struct lguest *lg)
+{
+	if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
+		__get_cpu_var(last_guest) = lg;
+		lg->last_pages = pages;
+		lg->changed = CHANGED_ALL;
+	}
+
+	/* These are pretty cheap, so we do them unconditionally. */
+	pages->state.host_cr3 = __pa(current->mm->pgd);
+	map_hypervisor_in_guest(lg, pages);
 	pages->state.guest_tss.esp1 = lg->esp1;
 	pages->state.guest_tss.ss1 = lg->ss1;
 
-	/* CR3 */
-	pages->state.host_cr3 = __pa(current->mm->pgd);
-
 	/* Copy direct trap entries. */
-	copy_traps(lg, pages->state.guest_idt, lguest_default_idt_entries());
+	if (lg->changed & CHANGED_IDT)
+		copy_traps(lg, pages->state.guest_idt,
+			   lguest_default_idt_entries());
 
 	/* Copy all GDT entries but the TSS. */
-	copy_gdt(lg, pages->state.guest_gdt);
-}
-
-static void copy_out_guest_info(struct lguest *lg,
-				const struct lguest_pages *pages)
-{
-	/* We just want the regs back. */
-	lg->regs = pages->regs;
+	if (lg->changed & CHANGED_GDT)
+		copy_gdt(lg, pages->state.guest_gdt);
+
+	lg->changed = 0;
 }
 
 int run_guest(struct lguest *lg, char *__user user)
 {
-	struct lguest_regs *regs = &lg->regs;
+	struct lguest_regs *regs = lg->regs;
 
 	while (!lg->dead) {
 		unsigned int cr2 = 0; /* Damn gcc */
@@ -327,10 +327,8 @@ int run_guest(struct lguest *lg, char *_
 		set_ts(lg->ts);
 
 		pages = lguest_pages(raw_smp_processor_id());
-		map_hypervisor_in_guest(lg);
 		copy_in_guest_info(pages, lg);
 		run_guest_once(lg, pages);
-		copy_out_guest_info(lg, pages);
 
 		/* Save cr2 now if we page-faulted. */
 		if (regs->trapnum == 14)
diff -r 8286b7923a5b arch/i386/lguest/hypervisor.S
--- a/arch/i386/lguest/hypervisor.S	Fri Mar 09 13:09:39 2007 +1100
+++ b/arch/i386/lguest/hypervisor.S	Fri Mar 09 13:15:43 2007 +1100
@@ -76,6 +76,8 @@ switch_to_guest:
 	/* Figure out where we are, based on stack (at top of regs). */	\
 	movl	%esp, %eax;						\
 	subl	$LGUEST_PAGES_regs, %eax;				\
+	/* Put trap number in %ebx before we switch cr3 and lose it. */ \
+	movl	LGUEST_PAGES_regs_trapnum(%eax), %ebx;			\
 	/* Switch to host page tables (host GDT, IDT and stack are in host   \
 	   mem, so need this first) */					\
 	movl	LGUEST_PAGES_host_cr3(%eax), %edx;			\
@@ -104,23 +106,15 @@ return_to_host:
 
 deliver_to_host:
 	SWITCH_TO_HOST
-decode_idt_and_jmp:
 	/* Decode IDT and jump to hosts' irq handler.  When that does iret, it
 	 * will return to run_guest_once.  This is a feature. */
 	movl	(LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
-	movl	LGUEST_PAGES_regs_trapnum(%eax), %eax
-	leal	(%edx,%eax,8), %eax
+	leal	(%edx,%ebx,8), %eax
 	movzwl	(%eax),%edx
 	movl	4(%eax), %eax
 	xorw	%ax, %ax
 	orl	%eax, %edx
 	jmp	*%edx
-
-/* FIXME: NMI needs something completely different.  Don't SWITCH_TO_HOST. */
-deliver_to_host_with_errcode:
-	SWITCH_TO_HOST
-	pushl	LGUEST_PAGES_regs_errcode(%eax)
-	jmp decode_idt_and_jmp
 
 /* Real hardware interrupts are delivered straight to the host.  Others
    cause us to return to run_guest_once so it can decide what to do.  Note
@@ -154,7 +148,8 @@ default_idt_entries:
 default_idt_entries:
 .text
 	IRQ_STUBS 0 1 return_to_host		/* First two traps */
-	IRQ_STUB 2 deliver_to_host_with_errcode	/* NMI */
+/* FIXME: NMI needs something completely different.  Don't SWITCH_TO_HOST. */
+	IRQ_STUB 2 deliver_to_host		/* NMI */
 	IRQ_STUBS 3 31 return_to_host		/* Rest of traps */
 	IRQ_STUBS 32 127 deliver_to_host	/* Real interrupts */
 	IRQ_STUB 128 return_to_host		/* System call (overridden) */
diff -r 8286b7923a5b arch/i386/lguest/interrupts_and_traps.c
--- a/arch/i386/lguest/interrupts_and_traps.c	Fri Mar 09 13:09:39 2007 +1100
+++ b/arch/i386/lguest/interrupts_and_traps.c	Fri Mar 09 13:09:48 2007 +1100
@@ -25,7 +25,7 @@ static void reflect_trap(struct lguest *
 {
 	u32 __user *gstack;
 	u32 eflags, ss, irq_enable;
-	struct lguest_regs *regs = &lg->regs;
+	struct lguest_regs *regs = lg->regs;
 
 	/* If they want a ring change, we use new stack and push old ss/esp */
 	if ((regs->ss&0x3) != GUEST_DPL) {
@@ -121,11 +121,11 @@ void check_bug_kill(struct lguest *lg)
 void check_bug_kill(struct lguest *lg)
 {
 #ifdef CONFIG_BUG
-	u32 eip = lg->regs.eip - PAGE_OFFSET;
+	u32 eip = lg->regs->eip - PAGE_OFFSET;
 	u16 insn;
 
 	/* This only works for addresses in linear mapping... */
-	if (lg->regs.eip < PAGE_OFFSET)
+	if (lg->regs->eip < PAGE_OFFSET)
 		return;
 	lhread(lg, &insn, eip, sizeof(insn));
 	if (insn == 0x0b0f) {
@@ -219,6 +219,7 @@ void load_guest_idt_entry(struct lguest 
 	if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
 		return;
 
+	lg->changed |= CHANGED_IDT;
 	if (num < ARRAY_SIZE(lg->idt))
 		set_trap(lg, &lg->idt[num], num, lo, hi);
 	else if (num == SYSCALL_VECTOR)
diff -r 8286b7923a5b arch/i386/lguest/lg.h
--- a/arch/i386/lguest/lg.h	Fri Mar 09 13:09:39 2007 +1100
+++ b/arch/i386/lguest/lg.h	Fri Mar 09 13:09:48 2007 +1100
@@ -118,10 +118,16 @@ struct lguest_pages
 	struct lguest_ro_state state;
 } __attribute__((aligned(PAGE_SIZE)));
 
+#define CHANGED_IDT		1
+#define CHANGED_GDT		2
+#define CHANGED_ALL	        3
+
 /* The private info the thread maintains about the guest. */
 struct lguest
 {
-	struct lguest_regs regs;
+	/* At end of a page shared mapped over lguest_pages in guest.  */
+	unsigned long regs_page;
+	struct lguest_regs *regs;
 	struct lguest_data __user *lguest_data;
 	struct task_struct *tsk;
 	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
@@ -138,6 +144,10 @@ struct lguest
 	u32 esp1;
 	u8 ss1;
 
+	/* Bitmap of what has changed: see CHANGED_* above. */
+	int changed;
+	struct lguest_pages *last_pages;
+
 	/* We keep a small number of these. */
 	u32 pgdidx;
 	struct pgdir pgdirs[4];
@@ -210,7 +220,7 @@ void guest_pagetable_flush_user(struct l
 void guest_pagetable_flush_user(struct lguest *lg);
 void guest_set_pte(struct lguest *lg, unsigned long cr3,
 		   unsigned long vaddr, u32 val);
-void map_hypervisor_in_guest(struct lguest *lg);
+void map_hypervisor_in_guest(struct lguest *lg, struct lguest_pages *pages);
 int demand_page(struct lguest *info, u32 cr2, int write);
 void pin_page(struct lguest *lg, u32 addr);
 
diff -r 8286b7923a5b arch/i386/lguest/lguest_user.c
--- a/arch/i386/lguest/lguest_user.c	Fri Mar 09 13:09:39 2007 +1100
+++ b/arch/i386/lguest/lguest_user.c	Fri Mar 09 13:09:48 2007 +1100
@@ -100,19 +100,28 @@ static int initialize(struct file *file,
 	lg->guestid = i;
 	lg->pfn_limit = args[0];
 	lg->page_offset = args[3];
+	lg->regs_page = get_zeroed_page(GFP_KERNEL);
+	if (!lg->regs_page) {
+		err = -ENOMEM;
+		goto release_guest;
+	}
+	lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
 
 	err = init_guest_pagetable(lg, args[1]);
 	if (err)
-		goto release_guest;
+		goto free_regs;
 
-	setup_regs(&lg->regs, args[2]);
+	setup_regs(lg->regs, args[2]);
 	lg->tsk = current;
 	lg->mm = get_task_mm(current);
+	lg->last_pages = NULL;
 	mutex_unlock(&lguest_lock);
 
 	file->private_data = lg;
 	return sizeof(args);
 
+free_regs:
+	free_page(lg->regs_page);
 release_guest:
 	memset(lg, 0, sizeof(*lg));
 unlock:
@@ -160,6 +169,7 @@ static int close(struct inode *inode, st
 	mmput(lg->mm);
 	if (lg->dead != (void *)1)
 		kfree(lg->dead);
+	free_page(lg->regs_page);
 	memset(lg, 0, sizeof(*lg));
 	mutex_unlock(&lguest_lock);
 	return 0;
diff -r 8286b7923a5b arch/i386/lguest/page_tables.c
--- a/arch/i386/lguest/page_tables.c	Fri Mar 09 13:09:39 2007 +1100
+++ b/arch/i386/lguest/page_tables.c	Fri Mar 09 13:09:48 2007 +1100
@@ -99,7 +99,7 @@ static u32 get_pte(struct lguest *lg, u3
    swapped.  It'd be nice to have a callback when Linux wants to swap out. */
 
 /* We fault pages in, which allows us to update accessed/dirty bits.
- * Return NULL or the pte page. */
+ * Return true if we got page. */
 static int page_in(struct lguest *lg, u32 vaddr, unsigned flags)
 {
 	u32 gtop, gpte;
@@ -323,13 +323,17 @@ void free_guest_pagetable(struct lguest 
 }
 
 /* Caller must be preempt-safe */
-void map_hypervisor_in_guest(struct lguest *lg)
-{
-	int cpu = smp_processor_id();
+void map_hypervisor_in_guest(struct lguest *lg, struct lguest_pages *pages)
+{
+	u32 *hype_pte_page = __get_cpu_var(hypervisor_pte_pages);
 
 	/* Since hypervisor less that 4MB, we simply mug top pte page. */
 	lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] =
-		(__pa(hypervisor_pte_page(cpu))| _PAGE_KERNEL);
+		(__pa(hype_pte_page) | _PAGE_KERNEL);
+
+	/* Map our regs page over stack page. */
+	hype_pte_page[(unsigned long)pages / PAGE_SIZE % PTES_PER_PAGE]
+		= (__pa(lg->regs_page) | _PAGE_KERNEL);
 }
 
 static void free_hypervisor_pte_pages(void)
diff -r 8286b7923a5b arch/i386/lguest/segments.c
--- a/arch/i386/lguest/segments.c	Fri Mar 09 13:09:39 2007 +1100
+++ b/arch/i386/lguest/segments.c	Fri Mar 09 13:09:48 2007 +1100
@@ -24,15 +24,15 @@ static int ignored_gdt(unsigned int num)
 /* We don't allow removal of CS, DS or SS; it doesn't make sense. */
 static void check_segment_use(struct lguest *lg, unsigned int desc)
 {
-	if (lg->regs.gs / 8 == desc)
-		lg->regs.gs = 0;
-	if (lg->regs.fs / 8 == desc)
-		lg->regs.fs = 0;
-	if (lg->regs.es / 8 == desc)
-		lg->regs.es = 0;
-	if (lg->regs.ds / 8 == desc
-	    || lg->regs.cs / 8 == desc
-	    || lg->regs.ss / 8 == desc)
+	if (lg->regs->gs / 8 == desc)
+		lg->regs->gs = 0;
+	if (lg->regs->fs / 8 == desc)
+		lg->regs->fs = 0;
+	if (lg->regs->es / 8 == desc)
+		lg->regs->es = 0;
+	if (lg->regs->ds / 8 == desc
+	    || lg->regs->cs / 8 == desc
+	    || lg->regs->ss / 8 == desc)
 		kill_guest(lg, "Removed live GDT entry %u", desc);
 }
 
@@ -103,6 +103,7 @@ void load_guest_gdt(struct lguest *lg, u
 
 	lhread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
 	fixup_gdt_table(lg);
+	lg->changed |= CHANGED_GDT;
 }
 
 void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls)
@@ -111,4 +112,5 @@ void guest_load_tls(struct lguest *lg, c
 
 	lhread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
 	fixup_gdt_table(lg);
+	lg->changed |= CHANGED_GDT;
 }



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 9/9] lguest: don't crash host on NMI
  2007-03-09  3:30             ` [PATCH 8/9] lguest: Optimize away copy in and out of per-cpu guest pages Rusty Russell
@ 2007-03-09  3:32               ` Rusty Russell
  0 siblings, 0 replies; 11+ messages in thread
From: Rusty Russell @ 2007-03-09  3:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andi Kleen, lkml - Kernel Mailing List

"handle" NMI by ignoring it.  Can't have been important, right?  As the
lguest64 hackers explained, handling NMI is a PITA.  Now oprofile does
not crash the machine.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 5beeb29ed3a3 arch/i386/lguest/hypervisor.S
--- a/arch/i386/lguest/hypervisor.S	Wed Feb 28 09:59:23 2007 +1100
+++ b/arch/i386/lguest/hypervisor.S	Wed Feb 28 09:59:47 2007 +1100
@@ -116,6 +116,11 @@ deliver_to_host:
 	orl	%eax, %edx
 	jmp	*%edx
 
+/* We ignore NMI and return. */
+handle_nmi:
+	addl	$8, %esp
+	iret
+
 /* Real hardware interrupts are delivered straight to the host.  Others
    cause us to return to run_guest_once so it can decide what to do.  Note
    that some of these are overridden by the guest to deliver directly, and
@@ -148,8 +153,7 @@ default_idt_entries:
 default_idt_entries:
 .text
 	IRQ_STUBS 0 1 return_to_host		/* First two traps */
-/* FIXME: NMI needs something completely different.  Don't SWITCH_TO_HOST. */
-	IRQ_STUB 2 deliver_to_host		/* NMI */
+	IRQ_STUB 2 handle_nmi			/* NMI */
 	IRQ_STUBS 3 31 return_to_host		/* Rest of traps */
 	IRQ_STUBS 32 127 deliver_to_host	/* Real interrupts */
 	IRQ_STUB 128 return_to_host		/* System call (overridden) */



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/9] lguest: block device speedup
  2007-03-09  3:05 [PATCH 1/9] lguest: block device speedup Rusty Russell
  2007-03-09  3:08 ` [PATCH 2/9] lguest: bridging support in example code Rusty Russell
@ 2007-03-09  7:51 ` Christoph Hellwig
  2007-03-09  8:09   ` Jens Axboe
  1 sibling, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2007-03-09  7:51 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Andi Kleen, lkml - Kernel Mailing List, Jens Axboe

On Fri, Mar 09, 2007 at 02:05:24PM +1100, Rusty Russell wrote:
> diff -r fdc8cbc1fd61 drivers/block/lguest_blk.c
> --- a/drivers/block/lguest_blk.c	Thu Mar 08 13:35:39 2007 +1100
> +++ b/drivers/block/lguest_blk.c	Thu Mar 08 15:51:55 2007 +1100
> @@ -45,6 +45,16 @@ struct blockdev
>  	struct request *req;
>  };
>  
> +/* Jens gave me this nice helper to end all chunks of a request. */
> +static void end_entire_request(struct request *req, int uptodate)
> +{
> +	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
> +		BUG();
> +	add_disk_randomness(req->rq_disk);
> +	blkdev_dequeue_request(req);
> +	end_that_request_last(req, uptodate);
> +}

I think we really want this in common code; ll_rw_blk.c should have:

/*
 * Common completion helper: hand @sectors and @uptodate to
 * end_that_request_first().  When that returns zero, the request is
 * credited to the disk randomness pool, dequeued and finished, and we
 * return 1.  Otherwise nothing else is touched and we return 0.
 */
static int __end_request(struct request *req, int uptodate,
		unsigned int sectors)
{
	/* Non-zero from the first stage: leave the request alone. */
	if (end_that_request_first(req, uptodate, sectors))
		return 0;

	add_disk_randomness(req->rq_disk);
	blkdev_dequeue_request(req);
	end_that_request_last(req, uptodate);

	return 1;
}

/**
 * end_request - end the req->hard_cur_sectors portion of a request
 * @req: request being completed
 * @uptodate: completion status, passed through to __end_request()
 *
 * The result of __end_request() is ignored, so the caller is not told
 * whether the request as a whole was finished.
 *
 * XXX: should be called end_partial_request
 */
void end_request(struct request *req, int uptodate)
{
	unsigned int sectors = req->hard_cur_sectors;

	__end_request(req, uptodate, sectors);
}

/**
 * end_entired_request - end all req->hard_nr_sectors of a request
 * @req: request being completed
 * @uptodate: completion status, passed through to __end_request()
 *
 * BUG()s if __end_request() reports that the request was not finished.
 *
 * NOTE(review): the name is presumably meant to be end_entire_request,
 * matching the helper in the patch under discussion -- confirm.
 */
void end_entired_request(struct request *req, int uptodate)
{
	int finished = __end_request(req, uptodate, req->hard_nr_sectors);

	if (!finished)
		BUG();
}

the latter two maybe as inlines

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/9] lguest: block device speedup
  2007-03-09  7:51 ` [PATCH 1/9] lguest: block device speedup Christoph Hellwig
@ 2007-03-09  8:09   ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2007-03-09  8:09 UTC (permalink / raw)
  To: Christoph Hellwig, Rusty Russell, Andrew Morton, Andi Kleen,
	lkml - Kernel Mailing List

On Fri, Mar 09 2007, Christoph Hellwig wrote:
> On Fri, Mar 09, 2007 at 02:05:24PM +1100, Rusty Russell wrote:
> > diff -r fdc8cbc1fd61 drivers/block/lguest_blk.c
> > --- a/drivers/block/lguest_blk.c	Thu Mar 08 13:35:39 2007 +1100
> > +++ b/drivers/block/lguest_blk.c	Thu Mar 08 15:51:55 2007 +1100
> > @@ -45,6 +45,16 @@ struct blockdev
> >  	struct request *req;
> >  };
> >  
> > +/* Jens gave me this nice helper to end all chunks of a request. */
> > +static void end_entire_request(struct request *req, int uptodate)
> > +{
> > +	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
> > +		BUG();
> > +	add_disk_randomness(req->rq_disk);
> > +	blkdev_dequeue_request(req);
> > +	end_that_request_last(req, uptodate);
> > +}
> 
> I think we really want this in common code, ll_rw_blk.c should have:

Yeah, I know; this is also what I wrote to Rusty when I reviewed this
code and made the suggestion. The kernel also needs a full sweep to get
rid of end_request() entirely; it's really a relic from ancient times.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2007-03-09  8:10 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-09  3:05 [PATCH 1/9] lguest: block device speedup Rusty Russell
2007-03-09  3:08 ` [PATCH 2/9] lguest: bridging support in example code Rusty Russell
2007-03-09  3:12   ` [PATCH 3/9] lguest: cleanup: allocate separate pages for switcher code Rusty Russell
2007-03-09  3:16     ` [PATCH 4/9] lguest: cleanup: clean up regs save/restore Rusty Russell
2007-03-09  3:17       ` [PATCH 5/9] lguest: documentation fixes Rusty Russell
2007-03-09  3:19         ` [PATCH 6/9] lguest: pin stack page optimization Rusty Russell
2007-03-09  3:23           ` [PATCH 7/9] lguest: use read-only pages rather than segments to protect high-mapped switcher Rusty Russell
2007-03-09  3:30             ` [PATCH 8/9] lguest: Optimize away copy in and out of per-cpu guest pages Rusty Russell
2007-03-09  3:32               ` [PATCH 9/9] lguest: don't crash host on NMI Rusty Russell
2007-03-09  7:51 ` [PATCH 1/9] lguest: block device speedup Christoph Hellwig
2007-03-09  8:09   ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.