qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Alex Williamson <alex.williamson@redhat.com>
To: "Michael S. Tsirkin" <mst@redhat.com>
Cc: chrisw@redhat.com, qemu-devel@nongnu.org, kvm@vger.kernel.org
Subject: [Qemu-devel] Re: [RFC PATCH 8/8] device-assignment: pass through and stub more PCI caps
Date: Fri, 12 Nov 2010 08:42:38 -0700	[thread overview]
Message-ID: <1289576558.2805.98.camel@x201> (raw)
In-Reply-To: <20101112091146.GG7631@redhat.com>

On Fri, 2010-11-12 at 11:11 +0200, Michael S. Tsirkin wrote:
> On Thu, Nov 11, 2010 at 11:30:07PM -0700, Alex Williamson wrote:
> > On Fri, 2010-11-12 at 07:36 +0200, Michael S. Tsirkin wrote:
> > > On Thu, Nov 11, 2010 at 07:56:46PM -0700, Alex Williamson wrote:
> > > > Some drivers depend on finding capabilities like power management,
> > > > PCI express/X, vital product data, or vendor specific fields.  Now
> > > > that we have better capability support, we can pass more of these
> > > > tables through to the guest.  Note that VPD and VNDR are direct pass
> > > > through capabilies, the rest are mostly empty shells with a few
> > > > writable bits where necessary.
> > > > 
> > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > > ---
> > > > 
> > > >  hw/device-assignment.c |  160 +++++++++++++++++++++++++++++++++++++++++++++---
> > > >  1 files changed, 149 insertions(+), 11 deletions(-)
> > > > 
> > > > diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> > > > index 179c7dc..1b228ad 100644
> > > > --- a/hw/device-assignment.c
> > > > +++ b/hw/device-assignment.c
> > > > @@ -366,6 +366,27 @@ static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
> > > >      return (uint8_t)assigned_dev_pci_read(d, pos, 1);
> > > >  }
> > > >  
> > > > +static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
> > > > +{
> > > > +    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
> > > > +    ssize_t ret;
> > > > +    int fd = pci_dev->real_device.config_fd;
> > > > +
> > > > +again:
> > > > +    ret = pwrite(fd, &val, len, pos);
> > > > +    if (ret != len) {
> > > > +	if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
> > > > +	    goto again;
> > > 
> > > 
> > > do {} while() ?
> > 
> > Sure, this is just a copy of another place that does something similar.
> > They should either be merged or both converted in a separate patch.
> > 
> > > > +
> > > > +	fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
> > > > +		__func__, ret, errno);
> > > > +
> > > > +	exit(1);
> > > > +    }
> > > > +
> > > > +    return;
> > > > +}
> > > > +
> > > >  static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap)
> > > >  {
> > > >      int id;
> > > > @@ -1244,37 +1265,75 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
> > > >  #endif
> > > >  #endif
> > > >  
> > > > +static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
> > > > +                                                    uint8_t cap_id,
> > > > +                                                    uint32_t address, int len)
> > > > +{
> > > > +    uint8_t cap;
> > > > +
> > > > +    switch (cap_id) {
> > > > +
> > > > +    case PCI_CAP_ID_VPD:
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (address - cap >= PCI_CAP_FLAGS) {
> > > > +            return assigned_dev_pci_read(pci_dev, address, len);
> > > > +        }
> > > > +        break;
> > > > +
> > > > +    case PCI_CAP_ID_VNDR:
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (address - cap > PCI_CAP_FLAGS) {
> > > > +            return assigned_dev_pci_read(pci_dev, address, len);
> > > > +        }
> > > > +        break;
> > > > +    }
> > > > +
> > > > +    return pci_default_cap_read_config(pci_dev, cap_id, address, len);
> > > > +}
> > > > +
> > > >  static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
> > > >                                                   uint8_t cap_id,
> > > >                                                   uint32_t address,
> > > >                                                   uint32_t val, int len)
> > > >  {
> > > > +    uint8_t cap;
> > > > +
> > > >      pci_default_cap_write_config(pci_dev, cap_id, address, val, len);
> > > >  
> > > >      switch (cap_id) {
> > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > >      case PCI_CAP_ID_MSI:
> > > >  #ifdef KVM_CAP_DEVICE_MSI
> > > > -        {
> > > > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > > > -            if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > > > -                assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> > > > -            }
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
> > > > +            assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
> > > >          }
> > > >  #endif
> > > >          break;
> > > >  
> > > >      case PCI_CAP_ID_MSIX:
> > > >  #ifdef KVM_CAP_DEVICE_MSIX
> > > > -        {
> > > > -            uint8_t cap = pci_find_capability(pci_dev, cap_id);
> > > > -            if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > > > -                assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> > > > -            }
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
> > > > +            assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
> > > >          }
> > > >  #endif
> > > >          break;
> > > >  #endif
> > > > +
> > > > +    case PCI_CAP_ID_VPD:
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (address - cap >= PCI_CAP_FLAGS) {
> > > > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > > > +        }
> > > > +        break;
> > > > +
> > > > +    case PCI_CAP_ID_VNDR:
> > > > +        cap = pci_find_capability(pci_dev, cap_id);
> > > > +        if (address - cap > PCI_CAP_FLAGS) {
> > > > +            assigned_dev_pci_write(pci_dev, address, val, len);
> > > > +        }
> > > > +        break;
> > > 
> > > I have a feeling we should use overlap functions instead of
> > > address math. What do you think?
> > 
> > if (!ranges_overlap(address - cap, len, 0, PCI_CAP_FLAGS)) ?
> 
> 	ranges_overlap(address, len, cap, PCI_CAP_FLAGS)
> 
> > Sure, that'd be a nice cleanup.
> > 
> > > Also - put cap offsets in assigned device structure to avoid
> > > find calls?
> > 
> > I suppose there aren't enough capability IDs that it'd take much space
> > to do so, but it doesn't sound like a unique to device assignment issue.
> > Maybe that should live on PCIDevice with an access function.
> 
> Sure, I put all caps that we actually emulate in PCIDevice.
> So that would apply to express, pcix, etc.
> Sticking offsets to caps that core doesn't emulate in PCIDevice
> seems a bit strange. That's why each device has its own device state.

The counter argument is that instead of sprinkling cap_msi, cap_msix,
cap_pcie, cap_foo into PCIDevice as support gets added, it would add a
lot of consistency to have a uint8_t caps[PCI_CAP_ID_MAX], then
pci_find_capability simply becomes return pdev->caps[cap_id], and we can
make more use of it.

> > > >      }
> > > >      return;
> > > >  }
> > > > @@ -1340,6 +1399,84 @@ static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
> > > >  #endif
> > > >  #endif
> > > >  
> > > > +    /* Minimal PM support, make the state bits writable so the guest
> > > > +     * thinks it's doing something. */
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM))) {
> > > > +        uint16_t pmc, pmcsr;
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PM, pos,
> > > > +                                     PCI_PM_SIZEOF);
> > > > +
> > > > +        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
> > > > +        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
> > > > +        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);
> > > > +
> > > > +        pmcsr = pci_get_word(pci_dev->config + pos + PCI_PM_CTRL);
> > > > +        pmcsr &= (PCI_PM_CTRL_STATE_MASK);
> > > > +        pmcsr |= PCI_PM_CTRL_NO_SOFT_RST;
> > > > +        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL, pmcsr);
> > > > +
> > > > +        pci_set_word(pci_dev->wmask + pos + PCI_PM_CTRL,
> > > > +                     PCI_PM_CTRL_STATE_MASK);
> > > > +    }
> > > 
> > > we don't pass anything to device. So - can this be put in pci_pm.c
> > > so that emulated devices can use this too?
> > 
> > That's part of why this one is an RFC, should we allow the guest to do
> > some level of power management to the physical device?  Xen seems to
> > allow D-state manipulation by the guest.
> 
> Hmm, ok. I still hope we can do the emulated one and then
> assigned device would call the relevant function and pass
> info to the backend.

I hope so too, but I don't want generalizing an interface to gate
supporting the capability for assigned devices.

> > > > +
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP))) {
> > > > +        uint16_t devctl, lnkcap, lnksta;
> > > > +
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_EXP, pos, 0x40);
> > > > +
> > > > +        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
> > > > +        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
> > > > +                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
> > > > +        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
> > > > +        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
> > > > +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);
> > > > +
> > > > +        lnkcap = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKCAP);
> > > > +        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
> > > > +                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
> > > > +                   PCI_EXP_LNKCAP_L1EL);
> > > > +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
> > > > +
> > > > +        pci_set_word(pci_dev->wmask + pos + PCI_EXP_LNKCAP,
> > > > +                     PCI_EXP_LNKCTL_ASPMC | PCI_EXP_LNKCTL_RCB |
> > > > +                     PCI_EXP_LNKCTL_CCC | PCI_EXP_LNKCTL_ES |
> > > > +                     PCI_EXP_LNKCTL_CLKREQ_EN | PCI_EXP_LNKCTL_HAWD);
> > > > +        
> > > > +        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
> > > > +        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
> > > > +        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);
> > > > +
> > > > +        pci_set_word(pci_dev->wmask + pos + 0x28, 0x1f);
> > > > +        pci_set_word(pci_dev->wmask + pos + 0x30, 0xfbf);
> > > 
> > > This seems to overlap functionally with the express work upstream.
> > > Can code from there be reused?  I also wonder whether is affects the
> > > guest OS if it finds an express device on a non-express bridge.
> > 
> > Yes, perhaps it can be merged.  I'd like to start with figuring out what
> > we need for device assignment, and merging where it makes sense later
> > than stall out trying to solve the whole problem upfront.
> > 
> > It could certainly be confusing for a driver to find an express device
> > under a PIIX chipset, but I think typically the driver is just looking
> > for link speed info for pretty printks.
> > > > +    }
> > > > +
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD))) {
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VPD, pos, 0x8);
> > > > +    }
> > > > +
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR))) {
> > > > +        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_VNDR, pos, len);
> > > > +    }
> > > > +
> > > > +    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX))) {
> > > > +        uint16_t cmd;
> > > > +        uint32_t status;
> > > > +
> > > > +        pci_add_capability_at_offset(pci_dev, PCI_CAP_ID_PCIX, pos, 0x8);
> > > > +
> > > > +        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
> > > > +        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
> > > > +                PCI_X_CMD_MAX_SPLIT);
> > > > +        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);
> > > > +
> > > > +        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
> > > > +        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
> > > > +        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
> > > > +        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
> > > > +                    PCI_X_STATUS_SPL_ERR);
> > > > +        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
> > > > +    }
> > > 
> > > This will be handy for non-assignment case so
> > > I'd like to see this moved out of device-assignment.c:
> > > we could create pcix.c or just add to pci.c.
> > 
> > Yeah, I'm not sure there's enough we can poke at in a PCIX capability to
> > warrant it's own file.  Here for the same reason as the others, what do
> > we want to expose, what's emulated, what's poke-able.  We also have the
> > benefit here that we get default from hardware.  If it stays this small,
> > I'd just assume leave it here than try to generalize an interface when
> > we're the only user.
> 
> Do you have a pcix device btw? Do we even care?

I don't have one, but I know of VT-d capable systems that support PCI-e
to PCI-X bridges in their topology.

Alex

> > > >      return 0;
> > > >  }
> > > >  
> > > > @@ -1466,7 +1603,8 @@ static int assigned_initfn(struct PCIDevice *pci_dev)
> > > >      dev->h_busnr = dev->host.bus;
> > > >      dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);
> > > >  
> > > > -    pci_register_capability_handlers(pci_dev, NULL,
> > > > +    pci_register_capability_handlers(pci_dev,
> > > > +                                     assigned_device_pci_cap_read_config,
> > > >                                       assigned_device_pci_cap_write_config);
> > > 
> > > Maybe these could go away?
> > 
> > Sure, pass a capability ID to the read/write_config functions and I'd
> > support this going way.  I don't think that necessarily needs to be tied
> > to this series though.  Thanks,
> > 
> > Alex
> 
> As in, can be a separate cleanup later. Yes.
> 
> > > >  
> > > >      if (assigned_device_pci_cap_init(pci_dev) < 0)
> > 
> > 

  reply	other threads:[~2010-11-12 15:42 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-11-12  2:54 [Qemu-devel] [PATCH 0/8] PCI capability and device assignment improvements Alex Williamson
2010-11-12  2:55 ` [Qemu-devel] [PATCH 1/8] pci: pci_default_cap_write_config ignores wmask Alex Williamson
2010-11-12  5:22   ` [Qemu-devel] " Michael S. Tsirkin
2010-11-12  6:03     ` Alex Williamson
2010-11-12  8:48       ` Michael S. Tsirkin
2010-11-12 15:49         ` Alex Williamson
2010-11-16 16:12           ` Michael S. Tsirkin
2010-11-12  2:55 ` [Qemu-devel] [PATCH 2/8] pci: Remove pci_enable_capability_support() Alex Williamson
2010-11-12  2:55 ` [Qemu-devel] [PATCH 3/8] device-assignment: Use PCI capabilities support Alex Williamson
2010-11-12  2:55 ` [Qemu-devel] [PATCH 4/8] pci: Replace used bitmap with capability byte map Alex Williamson
2010-11-12  5:40   ` [Qemu-devel] " Michael S. Tsirkin
2010-11-12  6:07     ` Alex Williamson
2010-11-12  9:02       ` Michael S. Tsirkin
2010-11-12 15:32         ` Alex Williamson
2010-11-12  2:55 ` [Qemu-devel] [PATCH 5/8] pci: Remove cap.length, cap.start, cap.supported Alex Williamson
2010-11-12  2:56 ` [Qemu-devel] [PATCH 6/8] device-assignment: Move PCI capabilities to match physical hardware Alex Williamson
2010-11-12  9:20   ` [Qemu-devel] " Michael S. Tsirkin
2010-11-12 13:53     ` Alex Williamson
2010-11-12  2:56 ` [Qemu-devel] [PATCH 7/8] pci: Pass ID for capability read/write handlers Alex Williamson
2010-11-12  2:56 ` [Qemu-devel] [RFC PATCH 8/8] device-assignment: pass through and stub more PCI caps Alex Williamson
2010-11-12  5:36   ` [Qemu-devel] " Michael S. Tsirkin
2010-11-12  6:30     ` Alex Williamson
2010-11-12  9:11       ` Michael S. Tsirkin
2010-11-12 15:42         ` Alex Williamson [this message]
2010-11-16 16:08           ` Michael S. Tsirkin
2010-11-12  5:39 ` [Qemu-devel] Re: [PATCH 0/8] PCI capability and device assignment improvements Michael S. Tsirkin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1289576558.2805.98.camel@x201 \
    --to=alex.williamson@redhat.com \
    --cc=chrisw@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=mst@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).