virtualization.lists.linux-foundation.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Unified lguest launcher
@ 2007-04-04 16:03 Glauber de Oliveira Costa
  2007-04-05  2:44 ` Rusty Russell
  0 siblings, 1 reply; 4+ messages in thread
From: Glauber de Oliveira Costa @ 2007-04-04 16:03 UTC (permalink / raw)
  To: Linux Kernel Mailing List, Rusty Russell, virtualization

[-- Attachment #1: Type: text/plain, Size: 487 bytes --]

This is a new version of the unified lguest launcher that applies to
the current tree. Following Rusty's suggestion, I'm worrying less
about being able to load 32-bit kernels on 64-bit machines: changing the
launcher for such a case would be the easy part! In the absence of
further objections, I'll commit it.

Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>

-- 
Glauber de Oliveira Costa.
"Free as in Freedom"

"The less confident you are, the more serious you have to act."

[-- Attachment #2: lguest_launcher.patch --]
[-- Type: text/x-patch, Size: 14728 bytes --]

Index: linux-2.6.20/Documentation/lguest/Makefile
===================================================================
--- linux-2.6.20.orig/Documentation/lguest/Makefile
+++ linux-2.6.20/Documentation/lguest/Makefile
@@ -1,12 +1,13 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.
 
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-# Some shells (dash - ubunu) can't handle numbers that big so we cheat.
+#we could uname -i, but it seems to return unknown on a bunch locations
+ARCH:=$(shell uname -m | sed s/i[3456]86/i386/)
+
 include ../../.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
+include $(ARCH)/defines
 
 CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
-	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
+	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds -I$(ARCH)
 LDLIBS:=-lz
 
 all: lguest.lds lguest
Index: linux-2.6.20/Documentation/lguest/i386/defines
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/i386/defines
@@ -0,0 +1,4 @@
+# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
+# Some shells (dash - ubunu) can't handle numbers that big so we cheat.
+include ../../.config
+LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
Index: linux-2.6.20/Documentation/lguest/i386/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/i386/lguest_defs.h
@@ -0,0 +1,9 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+/* LGUEST_TOP_ADDRESS comes from the Makefile */
+#define RESERVE_TOP_ADDRESS LGUEST_GUEST_TOP - 1024*1024
+
+#include "../../../include/linux/lguest_launcher.h"
+
+#endif
Index: linux-2.6.20/Documentation/lguest/lguest.c
===================================================================
--- linux-2.6.20.orig/Documentation/lguest/lguest.c
+++ linux-2.6.20/Documentation/lguest/lguest.c
@@ -33,7 +33,7 @@
 typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
-#include "../../include/linux/lguest_launcher.h"
+#include <lguest_defs.h>
 
 #define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
 #define NET_PEERNUM 1
@@ -64,7 +64,7 @@ struct device
 
 	/* Watch DMA to this key if handle_input non-NULL. */
 	unsigned long watch_key;
-	u32 (*handle_output)(int fd, const struct iovec *iov,
+	unsigned long (*handle_output)(int fd, const struct iovec *iov,
 			     unsigned int num, struct device *me);
 
 	/* Device-specific data. */
@@ -94,20 +94,29 @@ static void *map_zeroed_pages(unsigned l
 }
 
 /* Returns the entry point */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
+
+static unsigned long map_elf(int elf_fd, const void *hdr, 
 			     unsigned long *page_offset)
 {
-	void *addr;
+#ifndef __x86_64__
+	const Elf32_Ehdr *ehdr = hdr;
 	Elf32_Phdr phdr[ehdr->e_phnum];
+#else
+	const Elf64_Ehdr *ehdr = hdr;
+	Elf64_Phdr phdr[ehdr->e_phnum];
+#endif
+	void *addr;
 	unsigned int i;
 
 	/* Sanity checks. */
 	if (ehdr->e_type != ET_EXEC
-	    || ehdr->e_machine != EM_386
-	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
-	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
+	    || ((ehdr->e_machine != EM_386) &&
+		(ehdr->e_machine != EM_X86_64))
+	    || ehdr->e_phentsize != sizeof(phdr[0])
+	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(phdr[0]))
 		errx(1, "Malformed elf header");
 
+
 	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
 		err(1, "Seeking to program headers");
 	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
@@ -120,13 +129,17 @@ static unsigned long map_elf(int elf_fd,
 		if (phdr[i].p_type != PT_LOAD)
 			continue;
 
-		verbose("Section %i: size %i addr %p\n",
-			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+		verbose("Section %i: size %lu addr %p\n",
+		i, (unsigned long)phdr[i].p_memsz, (void *)phdr[i].p_paddr);
 
 		/* We expect linear address space. */
 		if (!*page_offset)
 			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
-		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+		else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+#ifdef __x86_64__
+			 && (phdr[i].p_vaddr != VSYSCALL_START)
+#endif
+			)
 			errx(1, "Page offset of section %i different", i);
 
 		/* We map everything private, writable. */
@@ -210,15 +223,18 @@ static unsigned long load_bzimage(int fd
 	errx(1, "Could not find kernel in bzImage");
 }
 
-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+/* In x86_64 systems, page table hierarchy does not start at a common location,
+ * but rather, is indicated by a symbol called boot_level4_pgt. Due to this, we
+ * need the elf header to survive this function, to lookup for the pgdir */
+static unsigned long load_kernel(int fd, unsigned long *page_offset, void *ehdr)
 {
-	Elf32_Ehdr hdr;
+	Elf64_Ehdr *hdr = ehdr;
 
-	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+	if (read(fd, hdr, sizeof(*hdr)) != sizeof(*hdr))
 		err(1, "Reading kernel");
 
-	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		return map_elf(fd, &hdr, page_offset);
+	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) == 0)
+		return map_elf(fd, ehdr, page_offset);
 
 	return load_bzimage(fd, page_offset);
 }
@@ -250,6 +266,7 @@ static inline unsigned long page_align(u
 	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
 }
 
+#ifndef __x86_64__
 static unsigned long setup_pagetables(unsigned long mem,
 				      unsigned long initrd_size,
 				      unsigned long page_offset)
@@ -285,6 +302,74 @@ static unsigned long setup_pagetables(un
 
 	return (unsigned long)pgdir;
 }
+#else
+static unsigned long find_pgt_symbol(int elf_fd, 
+				Elf64_Ehdr *ehdr, 
+				unsigned long page_offset)
+{
+	unsigned int i;
+	Elf64_Shdr sec[ehdr->e_shnum];
+	Elf64_Sym *syms;
+	char *strtab = NULL;
+	unsigned long pgdir_addr = 0;
+	unsigned long nsyms = 0;
+
+	/* Now process sections searching for boot page tables
+	 * Start by finding the symtab section */
+	if (lseek(elf_fd, ehdr->e_shoff, SEEK_SET) < 0)
+		err(1, "Seeking to section headers");
+	if (read(elf_fd, sec, sizeof(sec)) != sizeof(sec))
+		err(1, "Reading section headers");
+
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		if (sec[i].sh_type == SHT_SYMTAB) {
+			int ret = 0;
+			syms = malloc(sec[i].sh_size);
+			if (!syms)
+				err(1,"Not enough memory for symbol table");
+			ret = lseek(elf_fd, sec[i].sh_offset, SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to symbol table");
+			ret = read(elf_fd, syms, sec[i].sh_size);
+			if (ret != sec[i].sh_size)
+				err(1, "Reading symbol table");
+			nsyms = sec[i].sh_size / sizeof(Elf64_Sym);
+
+
+			/* symtab links to strtab. We use it to find symbol
+			 * names */
+			strtab = malloc(sec[sec[i].sh_link].sh_size);
+			if (!strtab)
+				err(1,"Not enough memory for string table");
+			ret = lseek(elf_fd, sec[sec[i].sh_link].sh_offset , SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to string table");
+			ret = read(elf_fd, strtab, sec[sec[i].sh_link].sh_size);
+			if (ret != sec[sec[i].sh_link].sh_size)
+				err(1, "Reading string table");
+			break;
+		}
+	}
+
+	/* We now have a pointer to the symtab, start searching for the symbol */
+	for (i = 0; i < nsyms; i++) {
+		if ((syms[i].st_shndx == SHN_UNDEF) || !syms[i].st_name)
+			continue;
+		if (!strcmp(BOOT_PGTABLE,
+				(char *)((u64)syms[i].st_name + strtab))) {
+			pgdir_addr = syms[i].st_value - page_offset;
+			break;
+		}
+	}
+
+	if (!pgdir_addr) {
+		errno = ESRCH; 
+		err(1,"Unable to find boot pgdir");
+	}
+
+	return pgdir_addr;
+}
+#endif
 
 static void concat(char *dst, char *args[])
 {
@@ -299,9 +384,10 @@ static void concat(char *dst, char *args
 	dst[len] = '\0';
 }
 
-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+static int tell_kernel(unsigned long pgdir, unsigned long start, 
+						unsigned long page_offset)
 {
-	u32 args[] = { LHREQ_INITIALIZE,
+	unsigned long args[] = { LHREQ_INITIALIZE,
 		       LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
 		       pgdir, start, page_offset };
 	int fd;
@@ -379,7 +465,8 @@ static void *_check_pointer(unsigned lon
 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
 
 /* Returns pointer to dma->used_len */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+static unsigned long *dma2iov(unsigned long dma, struct iovec iov[], 
+								unsigned *num)
 {
 	unsigned int i;
 	struct lguest_dma *udma;
@@ -396,12 +483,12 @@ static u32 *dma2iov(unsigned long dma, s
 	return &udma->used_len;
 }
 
-static u32 *get_dma_buffer(int fd, void *key,
+static unsigned long *get_dma_buffer(int fd, void *key,
 			   struct iovec iov[], unsigned int *num, u32 *irq)
 {
-	u32 buf[] = { LHREQ_GETDMA, (u32)key };
+	unsigned long buf[] = { LHREQ_GETDMA, (unsigned long)key };
 	unsigned long udma;
-	u32 *res;
+	unsigned long *res;
 
 	udma = write(fd, buf, sizeof(buf));
 	if (udma == (unsigned long)-1)
@@ -415,7 +502,7 @@ static u32 *get_dma_buffer(int fd, void 
 
 static void trigger_irq(int fd, u32 irq)
 {
-	u32 buf[] = { LHREQ_IRQ, irq };
+	unsigned long buf[] = { LHREQ_IRQ, irq };
 	if (write(fd, buf, sizeof(buf)) != 0)
 		err(1, "Triggering irq %i", irq);
 }
@@ -443,7 +530,8 @@ struct console_abort
 /* We DMA input to buffer bound at start of console page. */
 static bool handle_console_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
+	u32 irq = 0; 
+	unsigned long *lenp;
 	int len;
 	unsigned int num;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -487,14 +575,14 @@ static bool handle_console_input(int fd,
 	return true;
 }
 
-static u32 handle_console_output(int fd, const struct iovec *iov,
-				 unsigned num, struct device*dev)
+static unsigned long  handle_console_output(int fd, const struct iovec *iov,
+					 unsigned num, struct device*dev)
 {
 	return writev(STDOUT_FILENO, iov, num);
 }
 
-static u32 handle_tun_output(int fd, const struct iovec *iov,
-			     unsigned num, struct device *dev)
+static unsigned long  handle_tun_output(int fd, const struct iovec *iov,
+					     unsigned num, struct device *dev)
 {
 	/* Now we've seen output, we should warn if we can't get buffers. */
 	*(bool *)dev->priv = true;
@@ -508,7 +596,8 @@ static unsigned long peer_offset(unsigne
 
 static bool handle_tun_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
+	u32 irq = 0; 
+	unsigned long *lenp;
 	int len;
 	unsigned num;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -534,11 +623,12 @@ static bool handle_tun_input(int fd, str
 	return true;
 }
 
-static u32 handle_block_output(int fd, const struct iovec *iov,
-			       unsigned num, struct device *dev)
+static unsigned long handle_block_output(int fd, const struct iovec *iov,
+				       unsigned num, struct device *dev)
 {
 	struct lguest_block_page *p = dev->mem;
-	u32 irq, *lenp;
+	u32 irq; 
+	unsigned long *lenp;
 	unsigned int len, reply_num;
 	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
 	off64_t device_len, off = (off64_t)p->sector * 512;
@@ -546,11 +636,14 @@ static u32 handle_block_output(int fd, c
 	device_len = *(off64_t *)dev->priv;
 
 	if (off >= device_len)
-		err(1, "Bad offset %llu vs %llu", off, device_len);
+		err(1, "Bad offset %llu vs %llu", 
+				(unsigned long long)off, 
+				(unsigned long long) device_len);
 	if (lseek64(dev->fd, off, SEEK_SET) != off)
 		err(1, "Bad seek to sector %i", p->sector);
 
-	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", 
+						(unsigned long long)off);
 
 	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
 	if (!lenp)
@@ -560,7 +653,8 @@ static u32 handle_block_output(int fd, c
 		len = writev(dev->fd, iov, num);
 		if (off + len > device_len) {
 			ftruncate(dev->fd, device_len);
-			errx(1, "Write past end %llu+%u", off, len);
+			errx(1, "Write past end %llu+%u", 
+					(unsigned long long)off, len);
 		}
 		*lenp = 0;
 	} else {
@@ -577,7 +671,7 @@ static void handle_output(int fd, unsign
 			  struct device_list *devices)
 {
 	struct device *i;
-	u32 *lenp;
+	unsigned long *lenp;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
 	unsigned num = 0;
 
@@ -639,7 +733,7 @@ static struct device *new_device(struct 
 				 int fd,
 				 bool (*handle_input)(int, struct device *),
 				 unsigned long watch_off,
-				 u32 (*handle_output)(int,
+				 unsigned long (*handle_output)(int,
 						      const struct iovec *,
 						      unsigned,
 						      struct device *))
@@ -916,9 +1010,12 @@ int main(int argc, char *argv[])
 {
 	unsigned long mem, pgdir, start, page_offset;
 	int c, lguest_fd, waker_fd;
+	int elf_fd;
 	struct device_list device_list;
 	struct lguest_boot_info *boot = (void *)0;
 	const char *initrd_name = NULL;
+	/* 64-bit header is bigger, so this is a worst case for both */
+	Elf64_Ehdr ehdr;
 
 	device_list.max_infd = -1;
 	device_list.dev = NULL;
@@ -958,8 +1055,8 @@ int main(int argc, char *argv[])
 	map_zeroed_pages(0, mem / getpagesize());
 
 	/* Now we load the kernel */
-	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
-			    &page_offset);
+	elf_fd = open_or_die(argv[optind+1], O_RDONLY);
+	start = load_kernel(elf_fd, &page_offset, &ehdr);
 
 	/* Write the device descriptors into memory. */
 	map_device_descriptors(&device_list, mem);
@@ -969,7 +1066,11 @@ int main(int argc, char *argv[])
 		boot->initrd_size = load_initrd(initrd_name, mem);
 
 	/* Set up the initial linar pagetables. */
+#ifndef __x86_64__
 	pgdir = setup_pagetables(mem, boot->initrd_size, page_offset);
+#else
+	pgdir = find_pgt_symbol(elf_fd, &ehdr, page_offset);
+#endif
 
 	/* Give the guest the boot information it needs. */
 	concat(boot->cmdline, argv+optind+2);
Index: linux-2.6.20/Documentation/lguest/x86_64/defines
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/x86_64/defines
@@ -0,0 +1,4 @@
+# For now on x86_64 we'll hard code the location of the lguest binary loader.
+# But when we can get a relocatable kernel, we'll have to work to make this
+# dynamic.
+LGUEST_GUEST_TOP := 0x7f000000
Index: linux-2.6.20/Documentation/lguest/x86_64/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/x86_64/lguest_defs.h
@@ -0,0 +1,15 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+#include <asm/vsyscall.h>
+
+/* LGUEST_TOP_ADDRESS comes from the Makefile */
+typedef uint64_t u64;
+#include "../../../include/asm/lguest_user.h"
+
+#define RESERVE_TOP_ADDRESS LGUEST_GUEST_TOP
+
+
+#define BOOT_PGTABLE "boot_level4_pgt"
+
+#endif

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Unified lguest launcher
  2007-04-04 16:03 [PATCH] Unified lguest launcher Glauber de Oliveira Costa
@ 2007-04-05  2:44 ` Rusty Russell
  2007-04-05 14:43   ` Glauber de Oliveira Costa
  0 siblings, 1 reply; 4+ messages in thread
From: Rusty Russell @ 2007-04-05  2:44 UTC (permalink / raw)
  To: Glauber de Oliveira Costa; +Cc: virtualization, Linux Kernel Mailing List

On Wed, 2007-04-04 at 13:03 -0300, Glauber de Oliveira Costa wrote:
> This is a new version of the unified lguest launcher that applies to
> the current tree. According to rusty's suggestion, I'm bothering less
> to be able to load 32 bit kernels on 64-bit machines: changing the
> launcher for such case would be the easy part! In the absence of
> further objections, I'll commit it.
> 
> Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>

Hi Glauber!

The patch looks more than reasonable, but I think we can go further with
the abstraction.  If you could spin it again, I'll apply it.  There may
be more cleanups after that, but I don't want to hold up your progress!

> --- /dev/null
> +++ linux-2.6.20/Documentation/lguest/i386/defines
> @@ -0,0 +1,4 @@
> +# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
> +# Some shells (dash - ubunu) can't handle numbers that big so we cheat.
> +include ../../.config
> +LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)

The include needs another ../ and seems redundant (the .config is
included from the Makefile anyway).

The shells comment is obsolete and should be deleted too, my bad.

> +++ linux-2.6.20/Documentation/lguest/i386/lguest_defs.h
> @@ -0,0 +1,9 @@
> +#ifndef _LGUEST_DEFS_H_
> +#define _LGUEST_DEFS_H_
> +
> +/* LGUEST_TOP_ADDRESS comes from the Makefile */
> +#define RESERVE_TOP_ADDRESS LGUEST_GUEST_TOP - 1024*1024

Why -1M?  And RESERVE_TOP_ADDRESS isn't used in this patch?

> +static unsigned long map_elf(int elf_fd, const void *hdr, 
>                              unsigned long *page_offset)
>  {
> -       void *addr;
> +#ifndef __x86_64__
> +       const Elf32_Ehdr *ehdr = hdr;
>         Elf32_Phdr phdr[ehdr->e_phnum];
> +#else
> +       const Elf64_Ehdr *ehdr = hdr;
> +       Elf64_Phdr phdr[ehdr->e_phnum];
> +#endif

The way we did this in the module code was to define Elf_Ehdr etc in the
arch-specific headers to avoid ifdefs.  I think it would help this code,
too. 

> +           || ((ehdr->e_machine != EM_386) &&
> +               (ehdr->e_machine != EM_X86_64))

Similarly define ELF_MACHINE?

>                else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
> +               else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
> +#ifdef __x86_64__
> +                        && (phdr[i].p_vaddr != VSYSCALL_START)
> +#endif
> +                       )

Hmm, static inline bool is_vsyscall_segment(const Elf_Phdr *) maybe?

> +/* LGUEST_TOP_ADDRESS comes from the Makefile */
> +typedef uint64_t u64;
> +#include "../../../include/asm/lguest_user.h"
> +
> +#define RESERVE_TOP_ADDRESS LGUEST_GUEST_TOP
> +
> +
> +#define BOOT_PGTABLE "boot_level4_pgt"

The comment should refer to LGUEST_GUEST_TOP?

I think the typedef should be in the main code with the others: it
doesn't hurt i386 and it's neater.

I'm not sure the BOOT_PGTABLE define helps us here, either; it might be
clearer just to put it directly into the code.

Cheers!
Rusty.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Unified lguest launcher
  2007-04-05  2:44 ` Rusty Russell
@ 2007-04-05 14:43   ` Glauber de Oliveira Costa
  2007-04-06  2:19     ` Rusty Russell
  0 siblings, 1 reply; 4+ messages in thread
From: Glauber de Oliveira Costa @ 2007-04-05 14:43 UTC (permalink / raw)
  To: Rusty Russell; +Cc: virtualization, Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 1039 bytes --]

And here's the new patch, merging Rusty's suggestions and some more of my own.

May I upload this, or does Rusty (or anyone else) have more suggestions?

On 4/4/07, Rusty Russell <rusty@rustcorp.com.au> wrote:
> On Wed, 2007-04-04 at 13:03 -0300, Glauber de Oliveira Costa wrote:
> > This is a new version of the unified lguest launcher that applies to
> > the current tree. According to rusty's suggestion, I'm bothering less
> > to be able to load 32 bit kernels on 64-bit machines: changing the
> > launcher for such case would be the easy part! In the absence of
> > further objections, I'll commit it.
> >
> > Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
>
> Hi Glauber!
>
> The patch looks more than reasonable, but I think we can go further with
> the abstraction.  If you could spin it again, I'll apply it.  There may
> be more cleanups after that, but I don't want to hold up your progress!
>

-- 
Glauber de Oliveira Costa.
"Free as in Freedom"

"The less confident you are, the more serious you have to act."

[-- Attachment #2: lguest_launcher.patch --]
[-- Type: text/x-patch, Size: 14646 bytes --]

 Makefile             |    9 +-
 i386/defines         |    1 
 i386/lguest_defs.h   |    9 ++
 lguest.c             |  177 +++++++++++++++++++++++++++++++++++++++------------
 x86_64/defines       |    1 
 x86_64/lguest_defs.h |   11 +++
 6 files changed, 165 insertions(+), 43 deletions(-)
Index: linux-2.6-mm/Documentation/lguest/Makefile
===================================================================
--- linux-2.6-mm.orig/Documentation/lguest/Makefile
+++ linux-2.6-mm/Documentation/lguest/Makefile
@@ -1,12 +1,13 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.
 
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-# Some shells (dash - ubunu) can't handle numbers that big so we cheat.
+# We could uname -i, but it seems to return unknown on a bunch of locations
+ARCH:=$(shell uname -m | sed s/i[3456]86/i386/)
+
 include ../../.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
+include $(ARCH)/defines
 
 CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
-	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
+	-static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds -I$(ARCH)
 LDLIBS:=-lz
 
 all: lguest.lds lguest
Index: linux-2.6-mm/Documentation/lguest/i386/defines
===================================================================
--- /dev/null
+++ linux-2.6-mm/Documentation/lguest/i386/defines
@@ -0,0 +1 @@
+LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
Index: linux-2.6-mm/Documentation/lguest/i386/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6-mm/Documentation/lguest/i386/lguest_defs.h
@@ -0,0 +1,9 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+#include "../../../include/linux/lguest_launcher.h"
+typedef Elf32_Ehdr Elf_Ehdr;
+typedef Elf32_Phdr Elf_Phdr;
+#define ELF_MACHINE EM_386
+
+#endif
Index: linux-2.6-mm/Documentation/lguest/lguest.c
===================================================================
--- linux-2.6-mm.orig/Documentation/lguest/lguest.c
+++ linux-2.6-mm/Documentation/lguest/lguest.c
@@ -30,10 +30,11 @@
 #include <termios.h>
 #include <getopt.h>
 #include <zlib.h>
+typedef uint64_t u64;
 typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
-#include "../../include/linux/lguest_launcher.h"
+#include <lguest_defs.h>
 
 #define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
 #define NET_PEERNUM 1
@@ -64,8 +65,8 @@ struct device
 
 	/* Watch DMA to this key if handle_input non-NULL. */
 	unsigned long watch_key;
-	u32 (*handle_output)(int fd, const struct iovec *iov,
-			     unsigned int num, struct device *me);
+	unsigned long (*handle_output)(int fd, const struct iovec *iov,
+			     	       unsigned int num, struct device *me);
 
 	/* Device-specific data. */
 	void *priv;
@@ -93,21 +94,31 @@ static void *map_zeroed_pages(unsigned l
 	return (void *)addr;
 }
 
+static inline bool is_vsyscall_segment(Elf_Phdr *phdr)
+{
+#ifdef __x86_64__
+	return phdr->p_vaddr == VSYSCALL_START;
+#else
+	return 0;
+#endif
+}
+
 /* Returns the entry point */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
+static unsigned long map_elf(int elf_fd, Elf_Ehdr *ehdr,
 			     unsigned long *page_offset)
 {
+	Elf_Phdr phdr[ehdr->e_phnum];
 	void *addr;
-	Elf32_Phdr phdr[ehdr->e_phnum];
 	unsigned int i;
 
 	/* Sanity checks. */
 	if (ehdr->e_type != ET_EXEC
-	    || ehdr->e_machine != EM_386
-	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
-	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
+	    || ehdr->e_machine != ELF_MACHINE
+	    || ehdr->e_phentsize != sizeof(phdr[0])
+	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(phdr[0]))
 		errx(1, "Malformed elf header");
 
+
 	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
 		err(1, "Seeking to program headers");
 	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
@@ -120,13 +131,14 @@ static unsigned long map_elf(int elf_fd,
 		if (phdr[i].p_type != PT_LOAD)
 			continue;
 
-		verbose("Section %i: size %i addr %p\n",
-			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+		verbose("Section %i: size %lu addr %p\n",
+		i, (unsigned long)phdr[i].p_memsz, (void *)phdr[i].p_paddr);
 
 		/* We expect linear address space. */
 		if (!*page_offset)
 			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
-		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+		else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+			 && (!is_vsyscall_segment(&phdr[i])))
 			errx(1, "Page offset of section %i different", i);
 
 		/* We map everything private, writable. */
@@ -210,15 +222,18 @@ static unsigned long load_bzimage(int fd
 	errx(1, "Could not find kernel in bzImage");
 }
 
-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+/* In x86_64 systems, page table hierarchy does not start at a common location,
+ * but rather, is indicated by a symbol called boot_level4_pgt. Due to this, we
+ * need the elf header to survive this function, to lookup for the pgdir */
+static unsigned long load_kernel(int fd, unsigned long *page_offset,
+								Elf_Ehdr *hdr)
 {
-	Elf32_Ehdr hdr;
 
-	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+	if (read(fd, hdr, sizeof(*hdr)) != sizeof(*hdr))
 		err(1, "Reading kernel");
 
-	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		return map_elf(fd, &hdr, page_offset);
+	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) == 0)
+		return map_elf(fd, hdr, page_offset);
 
 	return load_bzimage(fd, page_offset);
 }
@@ -250,6 +265,7 @@ static inline unsigned long page_align(u
 	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
 }
 
+#ifndef __x86_64__
 static unsigned long setup_pagetables(unsigned long mem,
 				      unsigned long initrd_size,
 				      unsigned long page_offset)
@@ -285,6 +301,74 @@ static unsigned long setup_pagetables(un
 
 	return (unsigned long)pgdir;
 }
+#else
+static unsigned long find_pgt_symbol(int elf_fd,
+				     Elf_Ehdr *ehdr,
+				     unsigned long page_offset)
+{
+	unsigned int i;
+	Elf64_Shdr sec[ehdr->e_shnum];
+	Elf64_Sym *syms;
+	char *strtab = NULL;
+	unsigned long pgdir_addr = 0;
+	unsigned long nsyms = 0;
+
+	/* Now process sections searching for boot page tables
+	 * Start by finding the symtab section */
+	if (lseek(elf_fd, ehdr->e_shoff, SEEK_SET) < 0)
+		err(1, "Seeking to section headers");
+	if (read(elf_fd, sec, sizeof(sec)) != sizeof(sec))
+		err(1, "Reading section headers");
+
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		if (sec[i].sh_type == SHT_SYMTAB) {
+			int ret = 0;
+			syms = malloc(sec[i].sh_size);
+			if (!syms)
+				err(1,"Not enough memory for symbol table");
+			ret = lseek(elf_fd, sec[i].sh_offset, SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to symbol table");
+			ret = read(elf_fd, syms, sec[i].sh_size);
+			if (ret != sec[i].sh_size)
+				err(1, "Reading symbol table");
+			nsyms = sec[i].sh_size / sizeof(Elf64_Sym);
+
+
+			/* symtab links to strtab. We use it to find symbol
+			 * names */
+			strtab = malloc(sec[sec[i].sh_link].sh_size);
+			if (!strtab)
+				err(1,"Not enough memory for string table");
+			ret = lseek(elf_fd, sec[sec[i].sh_link].sh_offset , SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to string table");
+			ret = read(elf_fd, strtab, sec[sec[i].sh_link].sh_size);
+			if (ret != sec[sec[i].sh_link].sh_size)
+				err(1, "Reading string table");
+			break;
+		}
+	}
+
+	/* We now have a pointer to the symtab, start searching for the symbol */
+	for (i = 0; i < nsyms; i++) {
+		if ((syms[i].st_shndx == SHN_UNDEF) || !syms[i].st_name)
+			continue;
+		if (!strcmp("boot_level4_pgt",
+				(char *)((u64)syms[i].st_name + strtab))) {
+			pgdir_addr = syms[i].st_value - page_offset;
+			break;
+		}
+	}
+
+	if (!pgdir_addr) {
+		errno = ESRCH;
+		err(1,"Unable to find boot pgdir");
+	}
+
+	return pgdir_addr;
+}
+#endif
 
 static void concat(char *dst, char *args[])
 {
@@ -299,9 +383,10 @@ static void concat(char *dst, char *args
 	dst[len] = '\0';
 }
 
-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+static int tell_kernel(unsigned long pgdir, unsigned long start,
+					    unsigned long page_offset)
 {
-	u32 args[] = { LHREQ_INITIALIZE,
+	unsigned long args[] = { LHREQ_INITIALIZE,
 		       LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
 		       pgdir, start, page_offset };
 	int fd;
@@ -379,7 +464,8 @@ static void *_check_pointer(unsigned lon
 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
 
 /* Returns pointer to dma->used_len */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+static unsigned long *dma2iov(unsigned long dma, struct iovec iov[],
+								unsigned *num)
 {
 	unsigned int i;
 	struct lguest_dma *udma;
@@ -396,12 +482,12 @@ static u32 *dma2iov(unsigned long dma, s
 	return &udma->used_len;
 }
 
-static u32 *get_dma_buffer(int fd, void *key,
+static unsigned long *get_dma_buffer(int fd, void *key,
 			   struct iovec iov[], unsigned int *num, u32 *irq)
 {
-	u32 buf[] = { LHREQ_GETDMA, (u32)key };
+	unsigned long buf[] = { LHREQ_GETDMA, (unsigned long)key };
 	unsigned long udma;
-	u32 *res;
+	unsigned long *res;
 
 	udma = write(fd, buf, sizeof(buf));
 	if (udma == (unsigned long)-1)
@@ -415,7 +501,7 @@ static u32 *get_dma_buffer(int fd, void 
 
 static void trigger_irq(int fd, u32 irq)
 {
-	u32 buf[] = { LHREQ_IRQ, irq };
+	unsigned long buf[] = { LHREQ_IRQ, irq };
 	if (write(fd, buf, sizeof(buf)) != 0)
 		err(1, "Triggering irq %i", irq);
 }
@@ -443,7 +529,8 @@ struct console_abort
 /* We DMA input to buffer bound at start of console page. */
 static bool handle_console_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
+	u32 irq = 0;
+	unsigned long *lenp;
 	int len;
 	unsigned int num;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -487,14 +574,14 @@ static bool handle_console_input(int fd,
 	return true;
 }
 
-static u32 handle_console_output(int fd, const struct iovec *iov,
-				 unsigned num, struct device*dev)
+static unsigned long  handle_console_output(int fd, const struct iovec *iov,
+					 unsigned num, struct device*dev)
 {
 	return writev(STDOUT_FILENO, iov, num);
 }
 
-static u32 handle_tun_output(int fd, const struct iovec *iov,
-			     unsigned num, struct device *dev)
+static unsigned long  handle_tun_output(int fd, const struct iovec *iov,
+					     unsigned num, struct device *dev)
 {
 	/* Now we've seen output, we should warn if we can't get buffers. */
 	*(bool *)dev->priv = true;
@@ -508,7 +595,8 @@ static unsigned long peer_offset(unsigne
 
 static bool handle_tun_input(int fd, struct device *dev)
 {
-	u32 irq = 0, *lenp;
+	u32 irq = 0;
+	unsigned long *lenp;
 	int len;
 	unsigned num;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -534,11 +622,12 @@ static bool handle_tun_input(int fd, str
 	return true;
 }
 
-static u32 handle_block_output(int fd, const struct iovec *iov,
-			       unsigned num, struct device *dev)
+static unsigned long handle_block_output(int fd, const struct iovec *iov,
+					 unsigned num, struct device *dev)
 {
 	struct lguest_block_page *p = dev->mem;
-	u32 irq, *lenp;
+	u32 irq;
+	unsigned long *lenp;
 	unsigned int len, reply_num;
 	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
 	off64_t device_len, off = (off64_t)p->sector * 512;
@@ -546,11 +635,14 @@ static u32 handle_block_output(int fd, c
 	device_len = *(off64_t *)dev->priv;
 
 	if (off >= device_len)
-		err(1, "Bad offset %llu vs %llu", off, device_len);
+		err(1, "Bad offset %llu vs %llu",
+				(unsigned long long)off,
+				(unsigned long long)device_len);
 	if (lseek64(dev->fd, off, SEEK_SET) != off)
 		err(1, "Bad seek to sector %i", p->sector);
 
-	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ",
+						(unsigned long long)off);
 
 	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
 	if (!lenp)
@@ -560,7 +652,8 @@ static u32 handle_block_output(int fd, c
 		len = writev(dev->fd, iov, num);
 		if (off + len > device_len) {
 			ftruncate(dev->fd, device_len);
-			errx(1, "Write past end %llu+%u", off, len);
+			errx(1, "Write past end %llu+%u",
+					(unsigned long long)off, len);
 		}
 		*lenp = 0;
 	} else {
@@ -577,7 +670,7 @@ static void handle_output(int fd, unsign
 			  struct device_list *devices)
 {
 	struct device *i;
-	u32 *lenp;
+	unsigned long *lenp;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
 	unsigned num = 0;
 
@@ -639,7 +732,7 @@ static struct device *new_device(struct 
 				 int fd,
 				 bool (*handle_input)(int, struct device *),
 				 unsigned long watch_off,
-				 u32 (*handle_output)(int,
+				 unsigned long (*handle_output)(int,
 						      const struct iovec *,
 						      unsigned,
 						      struct device *))
@@ -916,9 +1009,11 @@ int main(int argc, char *argv[])
 {
 	unsigned long mem, pgdir, start, page_offset;
 	int c, lguest_fd, waker_fd;
+	int elf_fd;
 	struct device_list device_list;
 	struct lguest_boot_info *boot = (void *)0;
 	const char *initrd_name = NULL;
+	Elf_Ehdr ehdr;
 
 	device_list.max_infd = -1;
 	device_list.dev = NULL;
@@ -958,8 +1053,8 @@ int main(int argc, char *argv[])
 	map_zeroed_pages(0, mem / getpagesize());
 
 	/* Now we load the kernel */
-	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
-			    &page_offset);
+	elf_fd = open_or_die(argv[optind+1], O_RDONLY);
+	start = load_kernel(elf_fd, &page_offset, &ehdr);
 
 	/* Write the device descriptors into memory. */
 	map_device_descriptors(&device_list, mem);
@@ -969,7 +1064,11 @@ int main(int argc, char *argv[])
 		boot->initrd_size = load_initrd(initrd_name, mem);
 
 	/* Set up the initial linar pagetables. */
+#ifndef __x86_64__
 	pgdir = setup_pagetables(mem, boot->initrd_size, page_offset);
+#else
+	pgdir = find_pgt_symbol(elf_fd, &ehdr, page_offset);
+#endif
 
 	/* Give the guest the boot information it needs. */
 	concat(boot->cmdline, argv+optind+2);
Index: linux-2.6-mm/Documentation/lguest/x86_64/defines
===================================================================
--- /dev/null
+++ linux-2.6-mm/Documentation/lguest/x86_64/defines
@@ -0,0 +1 @@
+LGUEST_GUEST_TOP := 0x7f000000
Index: linux-2.6-mm/Documentation/lguest/x86_64/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6-mm/Documentation/lguest/x86_64/lguest_defs.h
@@ -0,0 +1,11 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+#include <asm/vsyscall.h>
+
+#include "../../../include/asm/lguest_user.h"
+typedef Elf64_Ehdr Elf_Ehdr;
+typedef Elf64_Phdr Elf_Phdr;
+#define ELF_MACHINE EM_X86_64
+
+#endif

[-- Attachment #3: Type: text/plain, Size: 189 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Unified lguest launcher
  2007-04-05 14:43   ` Glauber de Oliveira Costa
@ 2007-04-06  2:19     ` Rusty Russell
  0 siblings, 0 replies; 4+ messages in thread
From: Rusty Russell @ 2007-04-06  2:19 UTC (permalink / raw)
  To: Glauber de Oliveira Costa; +Cc: virtualization, Linux Kernel Mailing List

On Thu, 2007-04-05 at 11:43 -0300, Glauber de Oliveira Costa wrote:
> and here's the new patch, merging rusty's suggestions and some more on my own.
> 
> May I upload this, or does Rusty (or any other) has some more suggestions?

This looks excellent!

There are a couple of extra spaces floating around, but that's trivial.
You use "errno = ESRCH; err()" where you could use "errx()".

Please merge it straight in. No need for a separate patch in the tree
for this I think, unless you plan more work?

Thanks!
Rusty.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2007-04-06  2:19 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-04-04 16:03 [PATCH] Unified lguest launcher Glauber de Oliveira Costa
2007-04-05  2:44 ` Rusty Russell
2007-04-05 14:43   ` Glauber de Oliveira Costa
2007-04-06  2:19     ` Rusty Russell

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).