kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Alex Williamson <alex.williamson@redhat.com>
To: kvm@vger.kernel.org
Cc: alex.williamson@redhat.com, ddutile@redhat.com, mst@redhat.com,
	avi@redhat.com, chrisw@redhat.com, jan.kiszka@siemens.com
Subject: [RFC PATCH 2/2] device-assignment: Count required kvm memory slots
Date: Fri, 21 Jan 2011 16:48:42 -0700	[thread overview]
Message-ID: <20110121234831.22262.31177.stgit@s20.home> (raw)
In-Reply-To: <20110121233040.22262.68117.stgit@s20.home>

Each MMIO PCI BAR of an assigned device is directly mapped via a KVM
memory slot to avoid bouncing reads and writes through qemu.  KVM only
provides a (small) fixed number of these slots and attempting to
exceed the unadvertised limit results in an abort.  We can't reserve
slots, but let's at least try to make an attempt to check whether
there are enough available before adding a device.

The non-hotplug case is troublesome here because we have no visibility
as to what else might make use of these slots, but hasn't yet been
mapped.  We used to limit the number of devices that could be specified
on the commandline using the -pcidevice option.  The heuristic here
seems to work and provides a similar limit.

We can also avoid using these memory slots by allowing devices to
bounce mmio access through qemu.  This is trivially accomplished by
adding a force_slow=on option to pci-assign.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 hw/device-assignment.c |   59 +++++++++++++++++++++++++++++++++++++++++++++++-
 hw/device-assignment.h |    3 ++
 2 files changed, 61 insertions(+), 1 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index e97f565..0063a11 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -546,7 +546,9 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
                 ? PCI_BASE_ADDRESS_MEM_PREFETCH
                 : PCI_BASE_ADDRESS_SPACE_MEMORY;
 
-            if (cur_region->size & 0xFFF) {
+            if (pci_dev->features & ASSIGNED_DEVICE_FORCE_SLOW_MASK) {
+                slow_map = 1;
+            } else if (cur_region->size & 0xFFF) {
                 fprintf(stderr, "PCI region %d at address 0x%llx "
                         "has size 0x%x, which is not a multiple of 4K. "
                         "You might experience some performance hit "
@@ -556,6 +558,10 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
                 slow_map = 1;
             }
 
+            if (!slow_map) {
+                pci_dev->slots_needed++;
+            }
+
             /* map physical memory */
             pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
             pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size,
@@ -1666,6 +1672,30 @@ static CPUReadMemoryFunc *msix_mmio_read[] = {
 
 static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
 {
+    int i;
+    PCIRegion *pci_region = dev->real_device.regions;
+
+    /* Determine if the MSI-X table splits a BAR, requiring the use of
+     * two memory slots, one to map each remaining part. */
+    if (!(dev->features & ASSIGNED_DEVICE_FORCE_SLOW_MASK)) {
+        for (i = 0; i < dev->real_device.region_number; i++, pci_region++) {
+            if (!pci_region->valid) {
+                continue;
+            }
+
+            if (ranges_overlap(pci_region->base_addr, pci_region->size,
+                               dev->msix_table_addr, 0x1000)) {
+                target_phys_addr_t offset;
+
+                offset = dev->msix_table_addr - pci_region->base_addr;
+                if (offset && pci_region->size > offset + 0x1000) {
+                    dev->slots_needed++;
+                }
+                break;
+            }
+        }
+    }
+
     dev->msix_table_page = mmap(NULL, 0x1000,
                                 PROT_READ|PROT_WRITE,
                                 MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
@@ -1768,6 +1798,31 @@ static int assigned_initfn(struct PCIDevice *pci_dev)
         if (assigned_dev_register_msix_mmio(dev))
             goto assigned_out;
 
+    if (!(dev->features & ASSIGNED_DEVICE_FORCE_SLOW_MASK)) {
+        int free_slots = kvm_free_slots();
+        int total_slots = dev->slots_needed;
+
+        if (!dev->dev.qdev.hotplugged) {
+            AssignedDevice *adev;
+
+            QLIST_FOREACH(adev, &devs, next) {
+                total_slots += adev->slots_needed;
+            }
+
+            /* This seems to work, but it's completely heuristically
+             * determined.  Any number of things might make use of kvm
+             * memory slots before the guest starts mapping memory BARs.
+             * This is really just a guess. */
+            free_slots -= 13;
+        }
+
+        if (total_slots > free_slots) {
+            error_report("pci-assign: Out of memory slots, need %d, have %d\n",
+                         total_slots, free_slots);
+            goto assigned_out;
+        }
+    }
+
     assigned_dev_load_option_rom(dev);
     QLIST_INSERT_HEAD(&devs, dev, next);
 
@@ -1837,6 +1892,8 @@ static PCIDeviceInfo assign_info = {
                         ASSIGNED_DEVICE_USE_IOMMU_BIT, true),
         DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
                         ASSIGNED_DEVICE_PREFER_MSI_BIT, true),
+        DEFINE_PROP_BIT("force_slow", AssignedDevice, features,
+                        ASSIGNED_DEVICE_FORCE_SLOW_BIT, false),
         DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
         DEFINE_PROP_END_OF_LIST(),
     },
diff --git a/hw/device-assignment.h b/hw/device-assignment.h
index c94a730..ad3cc80 100644
--- a/hw/device-assignment.h
+++ b/hw/device-assignment.h
@@ -76,9 +76,11 @@ typedef struct {
 
 #define ASSIGNED_DEVICE_USE_IOMMU_BIT	0
 #define ASSIGNED_DEVICE_PREFER_MSI_BIT	1
+#define ASSIGNED_DEVICE_FORCE_SLOW_BIT	2
 
 #define ASSIGNED_DEVICE_USE_IOMMU_MASK	(1 << ASSIGNED_DEVICE_USE_IOMMU_BIT)
 #define ASSIGNED_DEVICE_PREFER_MSI_MASK	(1 << ASSIGNED_DEVICE_PREFER_MSI_BIT)
+#define ASSIGNED_DEVICE_FORCE_SLOW_MASK	(1 << ASSIGNED_DEVICE_FORCE_SLOW_BIT)
 
 typedef struct AssignedDevice {
     PCIDevice dev;
@@ -111,6 +113,7 @@ typedef struct AssignedDevice {
     int mmio_index;
     int need_emulate_cmd;
     char *configfd_name;
+    int slots_needed;
     QLIST_ENTRY(AssignedDevice) next;
 } AssignedDevice;
 


  parent reply	other threads:[~2011-01-21 23:48 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-01-21 23:48 [RFC PATCH 0/2] Expose available KVM free memory slot count to help avoid aborts Alex Williamson
2011-01-21 23:48 ` [RFC PATCH 1/2] kvm: Allow querying free slots Alex Williamson
2011-01-21 23:48 ` Alex Williamson [this message]
2011-01-22 22:11 ` [RFC PATCH 0/2] Expose available KVM free memory slot count to help avoid aborts Michael S. Tsirkin
2011-01-24  9:32 ` Marcelo Tosatti
2011-01-24 14:16   ` Jan Kiszka
2011-01-24 15:44     ` Alex Williamson
2011-01-25  5:37       ` Alex Williamson
2011-01-25  7:36         ` Jan Kiszka
2011-01-25 14:41           ` Alex Williamson
2011-01-25 14:45             ` Michael S. Tsirkin
2011-01-25 14:54               ` Avi Kivity
2011-01-25 14:53             ` Avi Kivity
2011-01-25 14:59               ` Michael S. Tsirkin
2011-01-25 17:33                 ` Avi Kivity
2011-01-25 17:58                   ` Michael S. Tsirkin
2011-01-26  9:17                     ` Avi Kivity
2011-01-26  9:20                       ` Michael S. Tsirkin
2011-01-26  9:23                         ` Avi Kivity
2011-01-26  9:39                           ` Michael S. Tsirkin
2011-01-26  9:54                             ` Avi Kivity
2011-01-26 12:08                               ` Michael S. Tsirkin
2011-01-27  9:21                                 ` Avi Kivity
2011-01-27  9:26                                   ` Michael S. Tsirkin
2011-01-27  9:28                                     ` Avi Kivity
2011-01-27  9:29                                       ` Michael S. Tsirkin
2011-01-27  9:51                                         ` Avi Kivity
2011-01-27  9:28                                     ` Michael S. Tsirkin
2011-01-25 16:35               ` Jan Kiszka
2011-01-25 19:13                 ` Alex Williamson
2011-01-26  8:14                   ` Jan Kiszka
2011-01-25 10:23         ` Avi Kivity
2011-01-25 14:57           ` Alex Williamson
2011-01-25 17:11             ` Avi Kivity
2011-01-25 17:43               ` Alex Williamson
2011-01-26  9:22                 ` Avi Kivity
2011-01-31 19:18         ` Marcelo Tosatti
2011-02-23 21:46           ` Alex Williamson
2011-02-24 12:34             ` Avi Kivity
2011-02-24 12:37               ` Avi Kivity
2011-02-24 18:10               ` Alex Williamson
2011-01-25 10:20   ` Avi Kivity
2011-01-25 14:46     ` Alex Williamson
2011-01-25 14:56       ` Avi Kivity
2011-01-25 14:55     ` Michael S. Tsirkin
2011-01-25 14:58       ` Avi Kivity
2011-01-25 15:23         ` Michael S. Tsirkin
2011-01-25 17:34           ` Avi Kivity
2011-01-25 18:00             ` Michael S. Tsirkin
2011-01-26  9:25               ` Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110121234831.22262.31177.stgit@s20.home \
    --to=alex.williamson@redhat.com \
    --cc=avi@redhat.com \
    --cc=chrisw@redhat.com \
    --cc=ddutile@redhat.com \
    --cc=jan.kiszka@siemens.com \
    --cc=kvm@vger.kernel.org \
    --cc=mst@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).