public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* KVM: PCIPT: direct mmio
@ 2008-06-03 11:21 Ben-Ami Yassour
  2008-06-04 14:16 ` Avi Kivity
  0 siblings, 1 reply; 4+ messages in thread
From: Ben-Ami Yassour @ 2008-06-03 11:21 UTC (permalink / raw)
  To: Amit Shah; +Cc: kvm, Han Weidong, Kay, Allen M, Muli Ben-Yehuda

Amit,

Below is the patch for PCI passthrough tree, it enables a guest to access 
a device's
memory mapped I/O regions directly, without requiring the host to trap and
emulate every MMIO access.

This patch requires only userspace changes and it is relying on the 
kernel patch by Anthony: "Handle vma regions with no backing page". Note 
that this patch requires CONFIG_NUMA to be set. It does require a change 
to the VT-d patch that Allen sent a while ago, to avoid mapping of memory 
slots with no backing page.

This patch was tested with the pci-passthrough VT-d using an e1000 NIC.

Regards,
Ben

>From 8fe13bcea014a3b896a79fca5d15ddd32050694c Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <benami@il.ibm.com>
Date: Tue, 3 Jun 2008 12:34:51 +0300
Subject: [PATCH] KVM: PCIPT: direct mmio

This patch for PCI passthrough devices enables a guest to access a
device's memory mapped I/O regions directly, without requiring the host to trap
and emulate every MMIO access.

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
  libkvm/libkvm.c           |   24 ++++++++----
  qemu/hw/pci-passthrough.c |   89 +++++++++++----------------------------------
  qemu/hw/pci-passthrough.h |    2 +
  3 files changed, 40 insertions(+), 75 deletions(-)

diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
index d1e95a4..ce062cb 100644
--- a/libkvm/libkvm.c
+++ b/libkvm/libkvm.c
@@ -400,7 +400,7 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
  {
  	int r;
  	int prot = PROT_READ;
-	void *ptr;
+	void *ptr = NULL;
  	struct kvm_userspace_memory_region memory = {
  		.memory_size = len,
  		.guest_phys_addr = phys_start,
@@ -410,16 +410,24 @@ void *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long phys_start,
  	if (writable)
  		prot |= PROT_WRITE;

-	ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
-	if (ptr == MAP_FAILED) {
-		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
-		return 0;
-	}
+	if (len > 0) {
+		ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+		if (ptr == MAP_FAILED) {
+			fprintf(stderr, "create_userspace_phys_mem: %s",
+				strerror(errno));
+			return 0;
+		}

-	memset(ptr, 0, len);
+		memset(ptr, 0, len);
+	}

  	memory.userspace_addr = (unsigned long)ptr;
-	memory.slot = get_free_slot(kvm);
+
+	if (len > 0)
+		memory.slot = get_free_slot(kvm);
+	else
+		memory.slot = get_slot(phys_start);
+
  	r = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &memory);
  	if (r == -1) {
  		fprintf(stderr, "create_userspace_phys_mem: %s", strerror(errno));
diff --git a/qemu/hw/pci-passthrough.c b/qemu/hw/pci-passthrough.c
index 62953ae..1a3e50c 100644
--- a/qemu/hw/pci-passthrough.c
+++ b/qemu/hw/pci-passthrough.c
@@ -25,18 +25,6 @@ typedef __u64 resource_size_t;
  extern kvm_context_t kvm_context;
  extern FILE *logfile;

-CPUReadMemoryFunc *pt_mmio_read_cb[3] = {
-	pt_mmio_readb,
-	pt_mmio_readw,
-	pt_mmio_readl
-};
-
-CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
-	pt_mmio_writeb,
-	pt_mmio_writew,
-	pt_mmio_writel
-};
-
  //#define PT_DEBUG

  #ifdef PT_DEBUG
@@ -45,47 +33,6 @@ CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
  #define DEBUG(fmt, args...)
  #endif

-#define pt_mmio_write(suffix, type)					\
-void pt_mmio_write##suffix(void *opaque, target_phys_addr_t e_phys,	\
-				uint32_t value)				\
-{									\
-	pt_region_t *r_access = (pt_region_t *)opaque;			\
-	void *r_virt = (u8 *)r_access->r_virtbase + 			\
-			(e_phys - r_access->e_physbase);		\
-	if (r_access->debug & PT_DEBUG_MMIO) {				\
-		fprintf(logfile, "pt_mmio_write" #suffix		\
-			": e_physbase=%p e_phys=%p r_virt=%p value=%08x\n", \
-			(void *)r_access->e_physbase, (void *)e_phys,	\
-			r_virt, value);					\
-	}								\
-	*(type *)r_virt = (type)value;					\
-}
-
-pt_mmio_write(b, u8)
-pt_mmio_write(w, u16)
-pt_mmio_write(l, u32)
-
-#define pt_mmio_read(suffix, type)					\
-uint32_t pt_mmio_read##suffix(void *opaque, target_phys_addr_t e_phys)	\
-{									\
-	pt_region_t *r_access = (pt_region_t *)opaque;			\
-	void *r_virt = (u8 *)r_access->r_virtbase + 			\
-			(e_phys - r_access->e_physbase);		\
-	uint32_t value = (u32) (*(type *) r_virt);			\
-	if (r_access->debug & PT_DEBUG_MMIO) {				\
-		fprintf(logfile,					\
-			"pt_mmio_read" #suffix ": e_physbase=%p "	\
-			"e_phys=%p r_virt=%p value=%08x\n",		\
-			(void *)r_access->e_physbase,			\
-			(void *)e_phys, r_virt, value);			\
-	}								\
-	return value;							\
-}
-
-pt_mmio_read(b, u8)
-pt_mmio_read(w, u16)
-pt_mmio_read(l, u32)
-
  #define pt_ioport_write(suffix)						\
  void pt_ioport_write##suffix(void *opaque, uint32_t addr, uint32_t value) \
  {									\
@@ -127,22 +74,33 @@ pt_ioport_read(b)
  pt_ioport_read(w)
  pt_ioport_read(l)

-static void pt_iomem_map(PCIDevice * d, int region_num,
-			 uint32_t e_phys, uint32_t e_size, int type)
+void pt_iomem_map(PCIDevice * pci_dev, int region_num, uint32_t e_phys,
+		  uint32_t e_size, int type)
  {
-	pt_dev_t *r_dev = (pt_dev_t *) d;
-
-	r_dev->v_addrs[region_num].e_physbase = e_phys;
+	pt_dev_t *r_dev = (pt_dev_t *) pci_dev;
+	pt_region_t *region = &r_dev->v_addrs[region_num];
+	int first_map = (region->e_size == 0);
+	int ret = 0;

  	DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
  	      e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size,
  	      region_num);

-	cpu_register_physical_memory(e_phys,
-				     r_dev->dev.io_regions[region_num].size,
-				     r_dev->v_addrs[region_num].memory_index);
-}
+	region->e_physbase = e_phys;
+	region->e_size = e_size;
+
+	if (!first_map)
+		kvm_destroy_phys_mem(kvm_context, e_phys, e_size);

+	if (e_size > 0)
+		ret = kvm_register_userspace_phys_mem(kvm_context,
+						      e_phys,
+						      region->r_virtbase,
+						      e_size,
+						      0);
+	if (ret != 0)
+		fprintf(logfile, "Error: create new mapping failed\n");
+}

  static void pt_ioport_map(PCIDevice * pci_dev, int region_num,
  			  uint32_t addr, uint32_t size, int type)
@@ -265,6 +223,8 @@ static int pt_register_regions(pci_region_t * io_regions,
  					(uint32_t) (cur_region->base_addr));
  				return (-1);
  			}
+			pci_dev->v_addrs[i].r_size = cur_region->size;
+			pci_dev->v_addrs[i].e_size = 0;

  			/* add offset */
  			pci_dev->v_addrs[i].r_virtbase +=
@@ -274,11 +234,6 @@ static int pt_register_regions(pci_region_t * io_regions,
  					       cur_region->size, t,
  					       pt_iomem_map);

-			pci_dev->v_addrs[i].memory_index =
-			    cpu_register_io_memory(0, pt_mmio_read_cb,
-						   pt_mmio_write_cb,
-						   (void *) &(pci_dev->v_addrs[i]));
-
  			continue;
  		}
  		/* handle port io regions */
diff --git a/qemu/hw/pci-passthrough.h b/qemu/hw/pci-passthrough.h
index 1dcf752..67ac4dc 100644
--- a/qemu/hw/pci-passthrough.h
+++ b/qemu/hw/pci-passthrough.h
@@ -54,6 +54,8 @@ typedef struct pt_region_s {
  	uint32_t memory_index;
  	void *r_virtbase;	/* mmapped access address */
  	int num;		/* our index within v_addrs[] */
+	uint32_t e_size;          /* emulated size of region in bytes */
+	uint32_t r_size;          /* real size of region in bytes */
  	uint32_t debug;
  } pt_region_t;

-- 
1.5.5.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: KVM: PCIPT: direct mmio
  2008-06-03 11:21 Ben-Ami Yassour
@ 2008-06-04 14:16 ` Avi Kivity
  0 siblings, 0 replies; 4+ messages in thread
From: Avi Kivity @ 2008-06-04 14:16 UTC (permalink / raw)
  To: Ben-Ami Yassour
  Cc: Amit Shah, kvm, Han Weidong, Kay, Allen M, Muli Ben-Yehuda

Ben-Ami Yassour wrote:
> Amit,
>
> Below is the patch for PCI passthrough tree, it enables a guest to 
> access a device's
> memory mapped I/O regions directly, without requiring the host to trap 
> and
> emulate every MMIO access.
>
> This patch requires only userspace changes and it is relaying on the 
> kernel patch by Anthony: "Handle vma regions with no backing page". 
> Note that this patch requires CONFIG_NUMA to be set. It does require a 
> change to the VT-d that Allen sent a while ago, to avoid mapping of 
> memory slots with no backing page.
>
>
> diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
> index d1e95a4..ce062cb 100644
> --- a/libkvm/libkvm.c
> +++ b/libkvm/libkvm.c
> @@ -400,7 +400,7 @@ void *kvm_create_userspace_phys_mem(kvm_context_t 
> kvm, unsigned long phys_start,
>  {
>      int r;
>      int prot = PROT_READ;
> -    void *ptr;
> +    void *ptr = NULL;
>      struct kvm_userspace_memory_region memory = {
>          .memory_size = len,
>          .guest_phys_addr = phys_start,
> @@ -410,16 +410,24 @@ void 
> *kvm_create_userspace_phys_mem(kvm_context_t kvm, unsigned long 
> phys_start,
>      if (writable)
>          prot |= PROT_WRITE;
>
> -    ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
> -    if (ptr == MAP_FAILED) {
> -        fprintf(stderr, "create_userspace_phys_mem: %s", 
> strerror(errno));
> -        return 0;
> -    }
> +    if (len > 0) {
> +        ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
> +        if (ptr == MAP_FAILED) {
> +            fprintf(stderr, "create_userspace_phys_mem: %s",
> +                strerror(errno));
> +            return 0;
> +        }

You're using 'len == 0' here to change the semantics of the function.  
It would be better to have two different APIs (perhaps sharing some of 
the implementation by calling a helper).

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: KVM: PCIPT: direct mmio
       [not found] <OF02BEEB69.3AC5BC32-ONC225745E.007719ED-C225745E.00772C47@il.ibm.com>
@ 2008-06-04 22:01 ` Ben-Ami Yassour
  2008-06-05 13:20   ` Avi Kivity
  0 siblings, 1 reply; 4+ messages in thread
From: Ben-Ami Yassour @ 2008-06-04 22:01 UTC (permalink / raw)
  To: avi; +Cc: kvm, Muli Ben-Yehuda, Amit Shah


>> +    if (len > 0) {
>> +        ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
>> +        if (ptr == MAP_FAILED) {
>> +            fprintf(stderr, "create_userspace_phys_mem: %s",
>> +                strerror(errno));
>> +            return 0;
>> +        }
>
> You're using 'len == 0' here to change the semantics of the function.
> It would be better to have two different APIs (perhaps sharing some of
> the implementation by calling a helper).
>

Actually, this is a fix of a bug that is probably exposed only by the direct mmio code.
Here is the problem (in the existing code):
kvm_destroy_phys_mem calls kvm_create_phys_mem(kvm, phys_start, 0, 0, 0);
kvm_create_phys_mem calls kvm_create_userspace_phys_mem which is calling

mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
 		if (ptr == MAP_FAILED)

now, if len = 0 it fails.

We could have sent it as a separate patch,
and we could have made more changes to fix it differently,
for example, not to let kvm_destroy_phys_mem call kvm_create_phys_mem
(which seems strange in general...).
We wanted to minimize the amount of changes that we make, so we chose this option.

What do you recommend?

Thanks,
Ben

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: KVM: PCIPT: direct mmio
  2008-06-04 22:01 ` KVM: PCIPT: direct mmio Ben-Ami Yassour
@ 2008-06-05 13:20   ` Avi Kivity
  0 siblings, 0 replies; 4+ messages in thread
From: Avi Kivity @ 2008-06-05 13:20 UTC (permalink / raw)
  To: Ben-Ami Yassour; +Cc: kvm, Muli Ben-Yehuda, Amit Shah

Ben-Ami Yassour wrote:
>
>>> +    if (len > 0) {
>>> +        ptr = mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 
>>> 0);
>>> +        if (ptr == MAP_FAILED) {
>>> +            fprintf(stderr, "create_userspace_phys_mem: %s",
>>> +                strerror(errno));
>>> +            return 0;
>>> +        }
>>
>> You're using 'len == 0' here to change the semantics of the function.
>> It would be better to have two different APIs (perhaps sharing some of
>> the implementation by calling a helper).
>>
>
> Actually, this is a fix of a bug that is probably exposed only by the 
> direct mmio code.
> Here is the problem (in the existing code):
> kvm_destroy_phys_mem calls kvm_create_phys_mem(kvm, phys_start, 0, 0, 0);
> kvm_create_phys_mem calls kvm_create_userspace_phys_mem which is calling
>
> mmap(NULL, len, prot, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
>         if (ptr == MAP_FAILED)
>
> now, if len = 0 it fails.
>

What's the point of a zero length memslot?  I seem to remember this 
conversation in the past.

> We could have sent is as a separate patch,
> and we could have made more changes to fix it differently,
> for example, not to let kvm_destroy_phys_mem call kvm_create_phys_mem
> (which seems strange in general...).

Yeah.  Perhaps a common helper.

> We wanted to minimize the amount of changes that we make, so we choose 
> this option.
>
> What do you recommend?

Send patches that fix the current broken code and prepare the way for 
the real changes.

This way it's easy to review both the fixes to the existing code and the 
new functionality.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2008-06-05 13:20 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <OF02BEEB69.3AC5BC32-ONC225745E.007719ED-C225745E.00772C47@il.ibm.com>
2008-06-04 22:01 ` KVM: PCIPT: direct mmio Ben-Ami Yassour
2008-06-05 13:20   ` Avi Kivity
2008-06-03 11:21 Ben-Ami Yassour
2008-06-04 14:16 ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox