public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
From: Alex Williamson <alex.williamson@redhat.com>
To: kvm@vger.kernel.org
Cc: alex.williamson@redhat.com, ddutile@redhat.com, mst@redhat.com,
	chrisw@redhat.com
Subject: [PATCH 5/5] device-assignment: pass through and stub more PCI caps
Date: Fri, 03 Dec 2010 12:34:46 -0700	[thread overview]
Message-ID: <20101203193418.3579.40222.stgit@s20.home> (raw)
In-Reply-To: <20101203192343.3579.73722.stgit@s20.home>

Some drivers depend on finding capabilities like power management,
PCI express/X, vital product data, or vendor specific fields.  Now
that we have better capability support, we can pass more of these
tables through to the guest.  Note that VPD and VNDR are direct pass
through capabilies, the rest are mostly empty shells with a few
writable bits where necessary.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 hw/device-assignment.c |  263 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 256 insertions(+), 7 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 838bf89..2415e43 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -67,6 +67,9 @@ static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
                                                  uint32_t address,
                                                  uint32_t val, int len);
 
+static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
+                                                    uint32_t address, int len);
+
 static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
                                        uint32_t addr, int len, uint32_t *val)
 {
@@ -370,11 +373,32 @@ static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
     return (uint8_t)assigned_dev_pci_read(d, pos, 1);
 }
 
-static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap)
+static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
+{
+    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
+    ssize_t ret;
+    int fd = pci_dev->real_device.config_fd;
+
+again:
+    ret = pwrite(fd, &val, len, pos);
+    if (ret != len) {
+	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+	    goto again;
+
+	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
+		__func__, ret, errno);
+
+	exit(1);
+    }
+
+    return;
+}
+
+static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start)
 {
     int id;
     int max_cap = 48;
-    int pos = PCI_CAPABILITY_LIST;
+    int pos = start ? start : PCI_CAPABILITY_LIST;
     int status;
 
     status = assigned_dev_pci_read_byte(d, PCI_STATUS);
@@ -453,10 +477,16 @@ static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
     ssize_t ret;
     AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
 
+    if (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address]) {
+        val = assigned_device_pci_cap_read_config(d, address, len);
+        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+        return val;
+    }
+
     if (address < 0x4 || (pci_dev->need_emulate_cmd && address == 0x4) ||
 	(address >= 0x10 && address <= 0x24) || address == 0x30 ||
-        address == 0x34 || address == 0x3c || address == 0x3d ||
-        (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address])) {
+        address == 0x34 || address == 0x3c || address == 0x3d) {
         val = pci_default_read_config(d, address, len);
         DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
               (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
@@ -1251,7 +1281,70 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
 #endif
 #endif
 
-static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, uint32_t address,
+/* There can be multiple VNDR capabilities per device, we need to find the
+ * one that starts closet to the given address without going over. */
+static uint8_t find_vndr_start(PCIDevice *pci_dev, uint32_t address)
+{
+    uint8_t cap, pos;
+
+    for (cap = pos = 0;
+         (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
+         pos += PCI_CAP_LIST_NEXT) {
+        if (pos <= address) {
+            cap = MAX(pos, cap);
+        }
+    }
+    return cap;
+}
+
+/* Merge the bits set in mask from mval into val.  Both val and mval are
+ * at the same addr offset, pos is the starting offset of the mask. */
+static uint32_t merge_bits(uint32_t val, uint32_t mval, uint8_t addr,
+                           int len, uint8_t pos, uint32_t mask)
+{
+    if (!ranges_overlap(addr, len, pos, 4)) {
+        return val;
+    }
+
+    if (addr >= pos) {
+        mask >>= (addr - pos) * 8;
+    } else {
+        mask <<= (pos - addr) * 8;
+    }
+    mask &= 0xffffffffU >> (4 - len) * 8;
+
+    val &= ~mask;
+    val |= (mval & mask);
+
+    return val;
+}
+
+static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
+                                                    uint32_t address, int len)
+{
+    uint8_t cap, cap_id = pci_dev->config_map[address];
+    uint32_t val;
+
+    switch (cap_id) {
+
+    case PCI_CAP_ID_VPD:
+        cap = pci_find_capability(pci_dev, cap_id);
+        val = assigned_dev_pci_read(pci_dev, address, len);
+        return merge_bits(val, pci_get_long(pci_dev->config + address),
+                          address, len, cap + PCI_CAP_LIST_NEXT, 0xff);
+
+    case PCI_CAP_ID_VNDR:
+        cap = find_vndr_start(pci_dev, address);
+        val = assigned_dev_pci_read(pci_dev, address, len);
+        return merge_bits(val, pci_get_long(pci_dev->config + address),
+                          address, len, cap + PCI_CAP_LIST_NEXT, 0xff);
+    }
+
+    return pci_default_read_config(pci_dev, address, len);
+}
+
+static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
+                                                 uint32_t address,
                                                  uint32_t val, int len)
 {
     uint8_t cap_id = pci_dev->config_map[address];
@@ -1281,6 +1374,11 @@ static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, uint32_t ad
 #endif
         break;
 #endif
+
+    case PCI_CAP_ID_VPD:
+    case PCI_CAP_ID_VNDR:
+        assigned_dev_pci_write(pci_dev, address, val, len);
+        break;
     }
 }
 
@@ -1300,7 +1398,7 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
 #ifdef KVM_CAP_DEVICE_MSI
     /* Expose MSI capability
      * MSI capability is the 1st capability in capability config */
-    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI))) {
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0))) {
         dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;
         if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSI, pos,
                                       PCI_CAPABILITY_CONFIG_MSI_LENGTH)) < 0) {
@@ -1323,7 +1421,7 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
 #endif
 #ifdef KVM_CAP_DEVICE_MSIX
     /* Expose MSI-X capability */
-    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX))) {
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0))) {
         int bar_nr;
         uint32_t msix_table_entry;
 
@@ -1349,6 +1447,157 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
 #endif
 #endif
 
+    /* Minimal PM support, nothing writable, device appears to NAK changes */
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0))) {
+        uint16_t pmc;
+        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, pos,
+                                      PCI_PM_SIZEOF)) < 0) {
+            return ret;
+        }
+
+        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
+        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
+        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
+
+        /* assign_device will bring the device up to D0, so we don't need
+         * to worry about doing that ourselves here. */
+        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL,
+                     PCI_PM_CTRL_NO_SOFT_RST);
+
+        pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0);
+        pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0);
+    }
+
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0))) {
+        uint8_t version;
+        uint16_t type, devctl, lnkcap, lnksta;
+        uint32_t devcap;
+        int size = 0x3c; /* version 2 size */
+
+        version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS);
+        version &= PCI_EXP_FLAGS_VERS;
+        if (version == 1) {
+            size = 0x14;
+        } else if (version > 2) {
+            fprintf(stderr, "Unsupported PCI express capability version %d\n",
+                    version);
+            return -EINVAL;
+        }
+
+        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_EXP,
+                                      pos, size)) < 0) {
+            return ret;
+        }
+
+        type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
+        type = (type & PCI_EXP_FLAGS_TYPE) >> 8;
+        if (type != PCI_EXP_TYPE_ENDPOINT &&
+            type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) {
+            fprintf(stderr,
+                    "Device assignment only supports endpoint assignment, "
+                    "device type %d\n", type);
+            return -EINVAL;
+        }
+
+        /* capabilities, pass existing read-only copy
+         * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */
+
+        /* device capabilities: hide FLR */
+        devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP);
+        devcap &= ~PCI_EXP_DEVCAP_FLR;
+        pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap);
+
+        /* device control: clear all error reporting enable bits, leaving
+         *                 leaving only a few host values.  Note, these are
+         *                 all writable, but not passed to hw.
+         */
+        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
+        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
+                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
+        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
+        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
+        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
+
+        /* Clear device status */
+        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0);
+
+        /* Link capabilities, expose links and latencues, clear reporting */
+        lnkcap = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKCAP);
+        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
+                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
+                   PCI_EXP_LNKCAP_L1EL);
+        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
+        pci_set_word(pci_dev->wmask + pos + PCI_EXP_LNKCAP,
+                     PCI_EXP_LNKCTL_ASPMC | PCI_EXP_LNKCTL_RCB |
+                     PCI_EXP_LNKCTL_CCC | PCI_EXP_LNKCTL_ES |
+                     PCI_EXP_LNKCTL_CLKREQ_EN | PCI_EXP_LNKCTL_HAWD);
+
+        /* Link control, pass existing read-only copy.  Should be writable? */
+
+        /* Link status, only expose current speed and width */
+        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
+        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
+        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
+
+        if (version >= 2) {
+            /* Slot capabilities, control, status - not needed for endpoints */
+            pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0);
+            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0);
+            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0);
+
+            /* Root control, capabilities, status - not needed for endpoints */
+            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0);
+            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0);
+            pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0);
+
+            /* Device capabilities/control 2, pass existing read-only copy */
+            /* Link control 2, pass existing read-only copy */
+        }
+    }
+
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0))) {
+        uint16_t cmd;
+        uint32_t status;
+
+        /* Only expose the minimum, 8 byte capability */
+        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_PCIX, pos, 8)) < 0) {
+            return ret;
+        }
+
+        /* Command register, clear upper bits, including extended modes */
+        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
+        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
+                PCI_X_CMD_MAX_SPLIT);
+        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
+
+        /* Status register, update with emulated PCI bus location, clear
+         * error bits, leave the rest. */
+        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
+        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
+        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
+        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
+                    PCI_X_STATUS_SPL_ERR);
+        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
+    }
+
+    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0))) {
+        /* Direct R/W passthrough */
+        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8)) < 0) {
+            return ret;
+        }
+    }
+
+    /* Devices can have multiple vendor capabilities, get them all */
+    for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
+        pos += PCI_CAP_LIST_NEXT) {
+        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
+        /* Direct R/W passthrough */
+        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VNDR,
+                                      pos, len)) < 0) {
+            return ret;
+        }
+    }
+
     return 0;
 }
 


  parent reply	other threads:[~2010-12-03 19:34 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-12-03 19:33 [PATCH 0/5] Extra capabilities for device assignment Alex Williamson
2010-12-03 19:33 ` [PATCH 1/5] device-assignment: Fix off-by-one in header check Alex Williamson
2010-12-03 19:33 ` [PATCH 2/5] pci: MSI-X capability is 12 bytes, not 16, MSI is 10 bytes Alex Williamson
2010-12-03 19:37   ` Chris Wright
2010-12-03 19:48     ` Alex Williamson
2010-12-03 19:54       ` Chris Wright
2010-12-03 19:33 ` [PATCH 3/5] pci: Error on PCI capability collisions Alex Williamson
2010-12-03 19:34 ` [PATCH 4/5] device-assignment: Error checking when adding capabilities Alex Williamson
2010-12-03 19:34 ` Alex Williamson [this message]
2010-12-06 16:21 ` [PATCH v2 0/5] Extra capabilities for device assignment Alex Williamson
2010-12-06 16:22   ` [PATCH v2 1/5] device-assignment: Fix off-by-one in header check Alex Williamson
2010-12-06 16:22   ` [PATCH v2 2/5] pci: Remove PCI_CAPABILITY_CONFIG_* Alex Williamson
2010-12-06 16:22   ` [PATCH v2 3/5] pci: Error on PCI capability collisions Alex Williamson
2010-12-06 16:22   ` [PATCH v2 4/5] device-assignment: Error checking when adding capabilities Alex Williamson
2010-12-06 16:23   ` [PATCH v2 5/5] device-assignment: pass through and stub more PCI caps Alex Williamson
2010-12-06 16:34   ` [PATCH v2 0/5] Extra capabilities for device assignment Avi Kivity
2010-12-06 16:43     ` Alex Williamson
2010-12-06 17:03       ` Avi Kivity
2010-12-09 15:13         ` Markus Armbruster
2010-12-09 15:32           ` Avi Kivity
2010-12-09 16:17             ` Alex Williamson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20101203193418.3579.40222.stgit@s20.home \
    --to=alex.williamson@redhat.com \
    --cc=chrisw@redhat.com \
    --cc=ddutile@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=mst@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox