public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] QEMU/KVM: large page support
@ 2008-02-23 14:48 Marcelo Tosatti
  2008-02-23 18:29 ` Anthony Liguori
  0 siblings, 1 reply; 4+ messages in thread
From: Marcelo Tosatti @ 2008-02-23 14:48 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel


Add an option so the user can specify the hugetlbfs mounted path, with
fallback to 4k pages on error.

Align the 4GB+ memslot on large page boundary.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Index: kvm-userspace.gitpara/qemu/hw/pc.c
===================================================================
--- kvm-userspace.gitpara.orig/qemu/hw/pc.c
+++ kvm-userspace.gitpara/qemu/hw/pc.c
@@ -854,6 +854,15 @@ static void pc_init1(ram_addr_t ram_size
     /* above 4giga memory allocation */
     if (above_4g_mem_size > 0) {
         ram_addr = qemu_ram_alloc(above_4g_mem_size);
+	if (hpagesize) {
+		if (ram_addr & (hpagesize-1)) {
+			unsigned long aligned_addr;
+			aligned_addr = (ram_addr + hpagesize - 1) &
+					 ~(hpagesize-1);
+			qemu_ram_alloc(aligned_addr - ram_addr);
+			ram_addr = aligned_addr;
+		}
+	}
         cpu_register_physical_memory(0x100000000, above_4g_mem_size, ram_addr);
 
 	if (kvm_enabled())
Index: kvm-userspace.gitpara/qemu/sysemu.h
===================================================================
--- kvm-userspace.gitpara.orig/qemu/sysemu.h
+++ kvm-userspace.gitpara/qemu/sysemu.h
@@ -99,6 +99,7 @@ extern int no_quit;
 extern int semihosting_enabled;
 extern int autostart;
 extern int old_param;
+extern int hpagesize;
 extern const char *bootp_filename;
 
 
Index: kvm-userspace.gitpara/qemu/vl.c
===================================================================
--- kvm-userspace.gitpara.orig/qemu/vl.c
+++ kvm-userspace.gitpara/qemu/vl.c
@@ -237,6 +237,9 @@ int semihosting_enabled = 0;
 int autostart = 1;
 int time_drift_fix = 0;
 unsigned int kvm_shadow_memory = 0;
+const char *hugetlbpath = NULL;
+const char *hugetlbfile = NULL;
+int hpagesize = 0;
 const char *cpu_vendor_string;
 #ifdef TARGET_ARM
 int old_param = 0;
@@ -8071,6 +8074,7 @@ static void help(int exitcode)
 #endif
            "-tdf            inject timer interrupts that got lost\n"
            "-kvm-shadow-memory megs set the amount of shadow pages to be allocated\n"
+           "-hugetlb-path   set the path to hugetlbfs mounted directory, also enables allocation of guest memory with huge pages\n"
 	   "-option-rom rom load a file, rom, into the option ROM space\n"
 #ifdef TARGET_SPARC
            "-prom-env variable=value  set OpenBIOS nvram variables\n"
@@ -8189,6 +8193,7 @@ enum {
     QEMU_OPTION_incoming,
     QEMU_OPTION_tdf,
     QEMU_OPTION_kvm_shadow_memory,
+    QEMU_OPTION_hugetlbpath,
 };
 
 typedef struct QEMUOption {
@@ -8310,6 +8315,7 @@ const QEMUOption qemu_options[] = {
 #endif
     { "clock", HAS_ARG, QEMU_OPTION_clock },
     { "startdate", HAS_ARG, QEMU_OPTION_startdate },
+    { "hugetlb-path", HAS_ARG, QEMU_OPTION_hugetlbpath },
     { NULL },
 };
 
@@ -8561,6 +8567,94 @@ void qemu_get_launch_info(int *argc, cha
     *opt_incoming = incoming;
 }
 
+
+static int gethugepagesize(void)
+{ 
+	int ret, fd;
+	char buf[4096];
+	char *needle = "Hugepagesize:";
+	char *size;
+	unsigned long hugepagesize;
+
+	fd = open("/proc/meminfo", O_RDONLY);
+	if (fd < 0) {
+		perror("open");
+		exit(0);
+	}
+
+	ret = read(fd, buf, sizeof(buf));
+	if (ret < 0) {
+		perror("read");
+		exit(0);
+	}
+
+	size = strstr(buf, needle);
+	if (!size)
+		return 0;
+	size += strlen(needle);
+	hugepagesize = strtol(size, NULL, 0);
+	return hugepagesize;
+}
+
+void cleanup_hugetlb(void)
+{
+	if (hugetlbfile)
+		unlink(hugetlbfile);
+}
+
+void *alloc_huge_area(unsigned long memory, const char *path)
+{
+	void *area;
+	int fd;
+	char *filename;
+	char *tmpfile = "/kvm.XXXXXX";
+
+	filename = qemu_malloc(4096);
+	if (!filename) 
+		return NULL;
+
+	memset(filename, 0, 4096);
+	strncpy(filename, path, 4096 - strlen(tmpfile) - 1);
+	strcat(filename, tmpfile);
+	
+	hpagesize = gethugepagesize() * 1024;
+	if (!hpagesize)
+		return NULL;
+
+	mkstemp(filename);
+	fd = open(filename, O_RDWR);
+	if (fd < 0) {
+		perror("open");
+		hpagesize = 0;
+		exit(0);
+	}
+	memory = (memory+hpagesize-1) & ~(hpagesize-1);
+
+	area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (area == MAP_FAILED) {
+		perror("mmap");
+		hpagesize = 0;
+		exit(0);
+	}
+
+	hugetlbfile = filename;
+	atexit(cleanup_hugetlb);
+
+	return area;
+}
+
+void *qemu_alloc_physram(unsigned long memory)
+{
+	void *area = NULL;
+
+	if (hugetlbpath)
+		area = alloc_huge_area(memory, hugetlbpath);
+	if (!area)
+		area = qemu_vmalloc(memory);
+
+	return area;
+}
+
 int main(int argc, char **argv)
 {
 #ifdef CONFIG_GDBSTUB
@@ -9159,6 +9253,9 @@ int main(int argc, char **argv)
             case QEMU_OPTION_kvm_shadow_memory:
                 kvm_shadow_memory = (int64_t)atoi(optarg) * 1024 * 1024 / 4096;
                 break;
+            case QEMU_OPTION_hugetlbpath:
+		hugetlbpath = optarg;
+		break;
             case QEMU_OPTION_name:
                 qemu_name = optarg;
                 break;
@@ -9394,7 +9491,7 @@ int main(int argc, char **argv)
 
             ret = kvm_qemu_check_extension(KVM_CAP_USER_MEMORY);
             if (ret) {
-                phys_ram_base = qemu_vmalloc(phys_ram_size);
+                phys_ram_base = qemu_alloc_physram(phys_ram_size);
 	        if (!phys_ram_base) {
 		        fprintf(stderr, "Could not allocate physical memory\n");
 		        exit(1);

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] QEMU/KVM: large page support
  2008-02-23 14:48 [PATCH] QEMU/KVM: large page support Marcelo Tosatti
@ 2008-02-23 18:29 ` Anthony Liguori
  2008-02-23 22:54   ` Marcelo Tosatti
  0 siblings, 1 reply; 4+ messages in thread
From: Anthony Liguori @ 2008-02-23 18:29 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity

Hi Marcelo,

Marcelo Tosatti wrote:
> Add an option so the user can specify the hugetlbfs mounted path, with
> fallback to 4k pages on error.
>
> Align the 4GB+ memslot on large page boundary.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> Index: kvm-userspace.gitpara/qemu/hw/pc.c
> ===================================================================
> --- kvm-userspace.gitpara.orig/qemu/hw/pc.c
> +++ kvm-userspace.gitpara/qemu/hw/pc.c
> @@ -854,6 +854,15 @@ static void pc_init1(ram_addr_t ram_size
>      /* above 4giga memory allocation */
>      if (above_4g_mem_size > 0) {
>          ram_addr = qemu_ram_alloc(above_4g_mem_size);
> +	if (hpagesize) {
> +		if (ram_addr & (hpagesize-1)) {
> +			unsigned long aligned_addr;
> +			aligned_addr = (ram_addr + hpagesize - 1) &
> +					 ~(hpagesize-1);
> +			qemu_ram_alloc(aligned_addr - ram_addr);
> +			ram_addr = aligned_addr;
> +		}
> +	}
>   

In general, I don't think it causes any real harm if we always align the 
ram address to a large page boundary.  If we aren't on Linux (and can't 
determine what the large page size is), we can just set hpagesize to 
getpagesize().  I think there's a good reason for this that I'll explain 
below.

> +
> +void *alloc_huge_area(unsigned long memory, const char *path)
> +{
> +	void *area;
> +	int fd;
> +	char *filename;
> +	char *tmpfile = "/kvm.XXXXXX";
> +
> +	filename = qemu_malloc(4096);
> +	if (!filename) 
> +		return NULL;
> +
> +	memset(filename, 0, 4096);
> +	strncpy(filename, path, 4096 - strlen(tmpfile) - 1);
> +	strcat(filename, tmpfile);
> +	
> +	hpagesize = gethugepagesize() * 1024;
> +	if (!hpagesize)
> +		return NULL;
> +
> +	mkstemp(filename);
>   

mkstemp returns a file descriptor so the following open is not required.

> +	fd = open(filename, O_RDWR);
> +	if (fd < 0) {
> +		perror("open");
> +		hpagesize = 0;
> +		exit(0);
> +	}
> +	memory = (memory+hpagesize-1) & ~(hpagesize-1);
>   

I'm a little surprised that hugetlbfs doesn't require an ftruncate() 
before the mmap().  Does an ftruncate() do any harm?  If so, it would be 
better to have one.

> +	area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
>   

I think a MAP_SHARED may have some advantages in that it then becomes 
possible to pass this file descriptor around to other processes so they 
can mmap() the same memory region.  I don't know if that works with 
hugetlbfs but it certainly does with tmpfs.  My thinking is that this 
code can be made generic so it works with either hugetlbfs or tmpfs.

Furthermore, I think it would be interesting if we defaulted to trying 
to create the memory file in something like /dev/kvm-mem or something 
more appropriately named.  An administrator can then either mount a 
hugetlbfs or tmpfs mount there.  We still probably want to provide an 
option to override where to create the file and we want to be able to 
fall back to a normal malloc() of course but at least this makes it 
possible for the distros to set things up so that it Just Work without 
the user having to understand things like hugetlbfs and tmpfs.

Maybe we'll even see something that merges the two filesystems in the 
future so that if a huge page allocation fails, it falls back to 
creating a normal tmpfs file.  Perhaps that's a reasonable mount option 
to add to hugetlbfs.

> +	if (area == MAP_FAILED) {
> +		perror("mmap");
> +		hpagesize = 0;
> +		exit(0);
> +	}
> +
> +	hugetlbfile = filename;
> +	atexit(cleanup_hugetlb);
>   

Instead of registering an atexit() handler, I think it would be better 
to unlink immediately after doing the mkstemp().  This reduces the 
possibility of leaking the file in the event of catastrophe (like a kill 
-SIGKILL).

Regards,

Anthony Liguori

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] QEMU/KVM: large page support
  2008-02-23 18:29 ` Anthony Liguori
@ 2008-02-23 22:54   ` Marcelo Tosatti
  2008-02-23 22:56     ` Anthony Liguori
  0 siblings, 1 reply; 4+ messages in thread
From: Marcelo Tosatti @ 2008-02-23 22:54 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Marcelo Tosatti, kvm-devel, Avi Kivity

Hi Anthony, 

Thanks for your comments.

On Sat, Feb 23, 2008 at 12:29:27PM -0600, Anthony Liguori wrote:

> In general, I don't think it causes any real harm if we always align the 
> ram address to a large page boundary.  If we aren't on Linux (and can't 
> determine what the large page size is), we can just set hpagesize to 
> getpagesize().  I think there's a good reason for this that I'll explain 
> below.

I thought about doing that (gets rid of the 4GB+ special casing) but we
lose the ability to compact smaller allocations in a single largepage.

Right now the VGA BIOS and the BIOS fit in the same largepage, for
example.

> > +
> > +void *alloc_huge_area(unsigned long memory, const char *path)
> > +{
> > +	void *area;
> > +	int fd;
> > +	char *filename;
> > +	char *tmpfile = "/kvm.XXXXXX";
> > +
> > +	filename = qemu_malloc(4096);
> > +	if (!filename) 
> > +		return NULL;
> > +
> > +	memset(filename, 0, 4096);
> > +	strncpy(filename, path, 4096 - strlen(tmpfile) - 1);
> > +	strcat(filename, tmpfile);
> > +	
> > +	hpagesize = gethugepagesize() * 1024;
> > +	if (!hpagesize)
> > +		return NULL;
> > +
> > +	mkstemp(filename);
> >   
> 
> mkstemp returns a file descriptor so the following open is not required.

Right, will fix.

> > +	fd = open(filename, O_RDWR);
> > +	if (fd < 0) {
> > +		perror("open");
> > +		hpagesize = 0;
> > +		exit(0);
> > +	}
> > +	memory = (memory+hpagesize-1) & ~(hpagesize-1);
> >   
> 
> I'm a little surprised that hugetlbfs doesn't require an ftruncate() 
> before the mmap().  Does an ftruncate() do any harm?  If so, it would be 
> better to have one.

Its not needed because hugetlbfs automatically adjusts the inode size on
demand.

Don't think it will do any harm and its compliant with normal
filesystems.

> > +	area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
> >   
> 
> I think a MAP_SHARED may have some advantages in that it then becomes 
> possible to pass this file descriptor around to other processes so they 
> can mmap() the same memory region.  I don't know if that works with 
> hugetlbfs but it certainly does with tmpfs.  My thinking is that this 
> code can be made generic so it works with either hugetlbfs or tmpfs.

Yes, makes sense.

> Furthermore, I think it would be interesting if we defaulted to trying 
> to create the memory file in something like /dev/kvm-mem or something 
> more appropriately named.  An administrator can then either mount a 
> hugetlbfs or tmpfs mount there.  We still probably want to provide an 
> option to override where to create the file and we want to be able to 
> fall back to a normal malloc() of course but at least this makes it 
> possible for the distros to set things up so that it Just Work without 
> the user having to understand things like hugetlbfs and tmpfs.

I like the idea. I'll change "-hugetlb-path" to "-mem-path" for now (and
test it with tmpfs). Not so fond of hardcoding a path in QEMU though.

> Maybe we'll even see something that merges the two filesystems in the 
> future so that if a huge page allocation fails, it falls back to 
> creating a normal tmpfs file.  Perhaps that's a reasonable mount option 
> to add to hugetlbfs.

> Instead of registering an atexit() handler, I think it would be better 
> to unlink immediately after doing the mkstemp().  This reduces the 
> possibility of leaking the file in the event of catastrophe (like a kill 
> -SIGKILL).

Right, will fix.

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] QEMU/KVM: large page support
  2008-02-23 22:54   ` Marcelo Tosatti
@ 2008-02-23 22:56     ` Anthony Liguori
  0 siblings, 0 replies; 4+ messages in thread
From: Anthony Liguori @ 2008-02-23 22:56 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity

Marcelo Tosatti wrote:
> I thought about doing that (gets rid of the 4GB+ special casing) but we
> lose the ability to compact smaller allocations in a single largepage.
>
> Right now the VGA BIOS and the BIOS fit in the same largepage, for
> example.
>   

Ah, good point.  I was actually talking about doing this adjustment for 
ram regardless of whether we're using large pages--not necessarily for 
all memory allocations.  I saw no harm in doing this alignment even if 
we were using tmpfs and just getting 4k pages.  Of course, I think we're 
in agreement here though :-)

> I like the idea. I'll change "-hugetlb-path" to "-mem-path" for now (and
> test it with tmpfs). Not so fond of hardcoding a path in QEMU though.
>   

If we had a unified tmpfs/hugetlbfs, I think it would make a lot of 
sense.  I think I need to ask around though first and see how reasonable 
that is.

Regards,

Anthony Liguori


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2008-02-23 22:56 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-02-23 14:48 [PATCH] QEMU/KVM: large page support Marcelo Tosatti
2008-02-23 18:29 ` Anthony Liguori
2008-02-23 22:54   ` Marcelo Tosatti
2008-02-23 22:56     ` Anthony Liguori

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox