From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jan Kiszka Subject: Re: [PATCH 12/13] pci-assign: Generic config space access management Date: Tue, 28 Jun 2011 10:18:20 +0200 Message-ID: <4E098E4C.9080901@web.de> References: <007a0322c5a4af8a621375c4d2951eee01f2aa05.1309198794.git.jan.kiszka@web.de> <20110628081016.GG10881@redhat.com> Mime-Version: 1.0 Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="------------enig47767714B30A7EC7FA65DC48" Cc: Avi Kivity , Marcelo Tosatti , kvm@vger.kernel.org, Alex Williamson To: "Michael S. Tsirkin" Return-path: Received: from fmmailgate01.web.de ([217.72.192.221]:53820 "EHLO fmmailgate01.web.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1757248Ab1F1ISX (ORCPT ); Tue, 28 Jun 2011 04:18:23 -0400 In-Reply-To: <20110628081016.GG10881@redhat.com> Sender: kvm-owner@vger.kernel.org List-ID: This is an OpenPGP/MIME signed message (RFC 2440 and 3156) --------------enig47767714B30A7EC7FA65DC48 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: quoted-printable On 2011-06-28 10:10, Michael S. Tsirkin wrote: > On Mon, Jun 27, 2011 at 08:19:55PM +0200, Jan Kiszka wrote: >> From: Jan Kiszka >> >> This drastically simplifies config space access management: Instead of= >> coding various range checks and merging bits, set up two access contro= l >> bitmaps. One defines, which bits can be directly read from the device,= >> the other allows direct write to the device, also with bit-granularity= =2E >> >> The setup of those bitmaps, specifically the corner cases like the >> capability list status bit, is way more readable than the existing cod= e. >> And the generic config access functions just need one additional logic= >> to catch writes to the MSI/MSI-X control flags and call the related >> update handlers. >> >> Signed-off-by: Jan Kiszka >> --- >> hw/device-assignment.c | 325 ++++++++++++++++-----------------------= -------- >> hw/device-assignment.h | 3 +- >> 2 files changed, 113 insertions(+), 215 deletions(-) >> >> diff --git a/hw/device-assignment.c b/hw/device-assignment.c >> index 6a2ca8c..be199d2 100644 >> --- a/hw/device-assignment.c >> +++ b/hw/device-assignment.c >> @@ -63,35 +63,6 @@ static void assigned_dev_load_option_rom(AssignedDe= vice *dev); >> =20 >> static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev); >> =20 >> -static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, >> - uint32_t address, >> - uint32_t val, int le= n); >> - >> -static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_de= v, >> - uint32_t address,= int len); >> - >> -/* Merge the bits set in mask from mval into val. Both val and mval = are >> - * at the same addr offset, pos is the starting offset of the mask. *= / >> -static uint32_t merge_bits(uint32_t val, uint32_t mval, uint8_t addr,= >> - int len, uint8_t pos, uint32_t mask) >> -{ >> - if (!ranges_overlap(addr, len, pos, 4)) { >> - return val; >> - } >> - >> - if (addr >=3D pos) { >> - mask >>=3D (addr - pos) * 8; >> - } else { >> - mask <<=3D (pos - addr) * 8; >> - } >> - mask &=3D 0xffffffffU >> (4 - len) * 8; >> - >> - val &=3D ~mask; >> - val |=3D (mval & mask); >> - >> - return val; >> -} >> - >> static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,= >> uint32_t addr, int len, uint32= _t *val) >> { >> @@ -404,143 +375,6 @@ static uint8_t pci_find_cap_offset(PCIDevice *d,= uint8_t cap, uint8_t start) >> return 0; >> } >> =20 >> -static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t addr= ess, >> - uint32_t val, int len) >> -{ >> - int fd; >> - ssize_t ret; >> - AssignedDevice *pci_dev =3D DO_UPCAST(AssignedDevice, dev, d); >> - >> - DEBUG("(%x.%x): address=3D%04x val=3D0x%08x len=3D%d\n", >> - ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), >> - (uint16_t) address, val, len); >> - >> - if (address >=3D PCI_CONFIG_HEADER_SIZE && d->config_map[address]= ) { >> - return assigned_device_pci_cap_write_config(d, address, val, = len); >> - } >> - >> - if (ranges_overlap(address, len, PCI_COMMAND, 2)) { >> - pci_default_write_config(d, address, val, len); >> - /* Continue to program the card */ >> - } >> - >> - /* >> - * Catch access to >> - * - base address registers >> - * - ROM base address & capability pointer >> - * - interrupt line & pin >> - */ >> - if (ranges_overlap(address, len, PCI_BASE_ADDRESS_0, 24) || >> - ranges_overlap(address, len, PCI_ROM_ADDRESS, 4)) { >> - pci_default_write_config(d, address, val, len); >> - return; >> - } else if (ranges_overlap(address, len, PCI_CAPABILITY_LIST, 1) |= | >> - ranges_overlap(address, len, PCI_INTERRUPT_LINE, 2)) {= >> - uint32_t real_val; >> - >> - pci_default_write_config(d, address, val, len); >> - >> - /* Ensure that writes to overlapping areas we don't virtualiz= e still >> - * hit the device. */ >> - real_val =3D assigned_dev_pci_read(d, address, len); >> - val =3D merge_bits(val, real_val, address, len, >> - PCI_CAPABILITY_LIST, 0xff); >> - val =3D merge_bits(val, real_val, address, len, >> - PCI_INTERRUPT_LINE, 0xffff); >> - } >> - >> - DEBUG("NON BAR (%x.%x): address=3D%04x val=3D0x%08x len=3D%d\n", >> - ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), >> - (uint16_t) address, val, len); >> - >> - fd =3D pci_dev->real_device.config_fd; >> - >> -again: >> - ret =3D pwrite(fd, &val, len, address); >> - if (ret !=3D len) { >> - if ((ret < 0) && (errno =3D=3D EINTR || errno =3D=3D EAGAIN)) >> - goto again; >> - >> - fprintf(stderr, "%s: pwrite failed, ret =3D %zd errno =3D %d\n", >> - __func__, ret, errno); >> - >> - exit(1); >> - } >> -} >> - >> -static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t a= ddress, >> - int len) >> -{ >> - uint32_t val =3D 0, virt_val; >> - int fd; >> - ssize_t ret; >> - AssignedDevice *pci_dev =3D DO_UPCAST(AssignedDevice, dev, d); >> - >> - if (address >=3D PCI_CONFIG_HEADER_SIZE && d->config_map[address]= ) { >> - val =3D assigned_device_pci_cap_read_config(d, address, len);= >> - DEBUG("(%x.%x): address=3D%04x val=3D0x%08x len=3D%d\n", >> - (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val,= len); >> - return val; >> - } >> - >> - /* >> - * Catch access to >> - * - vendor & device ID >> - * - base address registers >> - * - ROM base address >> - */ >> - if (ranges_overlap(address, len, PCI_VENDOR_ID, 4) || >> - ranges_overlap(address, len, PCI_BASE_ADDRESS_0, 24) || >> - ranges_overlap(address, len, PCI_ROM_ADDRESS, 4)) { >> - val =3D pci_default_read_config(d, address, len); >> - DEBUG("(%x.%x): address=3D%04x val=3D0x%08x len=3D%d\n", >> - (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val,= len); >> - return val; >> - } >> - >> - fd =3D pci_dev->real_device.config_fd; >> - >> -again: >> - ret =3D pread(fd, &val, len, address); >> - if (ret !=3D len) { >> - if ((ret < 0) && (errno =3D=3D EINTR || errno =3D=3D EAGAIN)) >> - goto again; >> - >> - fprintf(stderr, "%s: pread failed, ret =3D %zd errno =3D %d\n", >> - __func__, ret, errno); >> - >> - exit(1); >> - } >> - >> - DEBUG("(%x.%x): address=3D%04x val=3D0x%08x len=3D%d\n", >> - (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len= ); >> - >> - if (pci_dev->emulate_cmd_mask) { >> - val =3D merge_bits(val, pci_default_read_config(d, address, l= en), >> - address, len, PCI_COMMAND, pci_dev->emulate_= cmd_mask); >> - } >> - >> - /* >> - * Merge bits from virtualized >> - * - capability pointer >> - * - interrupt line & pin >> - */ >> - virt_val =3D pci_default_read_config(d, address, len); >> - val =3D merge_bits(val, virt_val, address, len, PCI_CAPABILITY_LI= ST, 0xff); >> - val =3D merge_bits(val, virt_val, address, len, PCI_INTERRUPT_LIN= E, 0xffff); >> - >> - if (!pci_dev->cap.available) { >> - /* kill the special capabilities */ >> - if (address =3D=3D PCI_COMMAND && len =3D=3D 4) { >> - val &=3D ~(PCI_STATUS_CAP_LIST << 16); >> - } else if (address =3D=3D PCI_STATUS) { >> - val &=3D ~PCI_STATUS_CAP_LIST; >> - } >> - } >> - >> - return val; >> -} >> - >> static int assigned_dev_register_regions(PCIRegion *io_regions, >> unsigned long regions_num, >> AssignedDevice *pci_dev) >> @@ -790,7 +624,8 @@ again: >> /* dealing with virtual function device */ >> snprintf(name, sizeof(name), "%sphysfn/", dir); >> if (!stat(name, &statbuf)) { >> - pci_dev->emulate_cmd_mask =3D 0xffff; >> + /* always provide the written value on readout */ >> + memset(pci_dev->direct_config_read + PCI_COMMAND, 0xff, 2); >> } >> =20 >> dev->region_number =3D r; >> @@ -1268,73 +1103,81 @@ static void assigned_dev_update_msix(PCIDevice= *pci_dev) >> } >> } >> =20 >> -/* There can be multiple VNDR capabilities per device, we need to fin= d the >> - * one that starts closet to the given address without going over. */= >> -static uint8_t find_vndr_start(PCIDevice *pci_dev, uint32_t address) >> +static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev, >> + uint32_t address, int le= n) >> { >> - uint8_t cap, pos; >> + AssignedDevice *assigned_dev =3D DO_UPCAST(AssignedDevice, dev, p= ci_dev); >> + uint32_t virt_val =3D pci_default_read_config(pci_dev, address, l= en); >> + uint32_t real_val, direct_mask; >> =20 >> - for (cap =3D pos =3D 0; >> - (pos =3D pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos))= ; >> - pos +=3D PCI_CAP_LIST_NEXT) { >> - if (pos <=3D address) { >> - cap =3D MAX(pos, cap); >> - } >> + switch (len) { >> + case 4: >> + direct_mask =3D >> + pci_get_long(assigned_dev->direct_config_read + address);= >> + break; >> + case 2: >> + direct_mask =3D >> + pci_get_word(assigned_dev->direct_config_read + address);= >> + break; >> + case 1: >> + direct_mask =3D >> + pci_get_byte(assigned_dev->direct_config_read + address);= >> + break; >> + default: >> + abort(); >> } >> - return cap; >> -} >> - >> -static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_de= v, >> - uint32_t address,= int len) >> -{ >> - uint8_t cap, cap_id =3D pci_dev->config_map[address]; >> - uint32_t val; >> - >> - switch (cap_id) { >> - >> - case PCI_CAP_ID_VPD: >> - cap =3D pci_find_capability(pci_dev, cap_id); >> - val =3D assigned_dev_pci_read(pci_dev, address, len); >> - return merge_bits(val, pci_get_long(pci_dev->config + address= ), >> - address, len, cap + PCI_CAP_LIST_NEXT, 0xff= ); >> - >> - case PCI_CAP_ID_VNDR: >> - cap =3D find_vndr_start(pci_dev, address); >> - val =3D assigned_dev_pci_read(pci_dev, address, len); >> - return merge_bits(val, pci_get_long(pci_dev->config + address= ), >> - address, len, cap + PCI_CAP_LIST_NEXT, 0xff= ); >> + if (direct_mask) { >> + real_val =3D assigned_dev_pci_read(pci_dev, address, len); >> + return (virt_val & ~direct_mask) | (real_val & direct_mask); >> + } else { >> + return virt_val; >> } >> - >> - return pci_default_read_config(pci_dev, address, len); >> } >> =20 >> -static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, >> - uint32_t address, >> - uint32_t val, int le= n) >> +static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_= t address, >> + uint32_t val, int len) >> { >> - uint8_t cap_id =3D pci_dev->config_map[address]; >> + AssignedDevice *assigned_dev =3D DO_UPCAST(AssignedDevice, dev, p= ci_dev); >> + uint32_t direct_mask, full_access; >> =20 >> pci_default_write_config(pci_dev, address, val, len); >> - switch (cap_id) { >> - case PCI_CAP_ID_MSI: >> + >> + if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) { >> if (range_covers_byte(address, len, >> pci_dev->msi_cap + PCI_MSI_FLAGS)) { >> assigned_dev_update_msi(pci_dev); >> } >> - break; >> - >> - case PCI_CAP_ID_MSIX: >> + } else if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX= ) { >> if (range_covers_byte(address, len, >> pci_dev->msix_cap + PCI_MSIX_FLAGS + 1)= ) { >> assigned_dev_update_msix(pci_dev); >> } >> - break; >> + } >> =20 >> - case PCI_CAP_ID_VPD: >> - case PCI_CAP_ID_VNDR: >> - assigned_dev_pci_write(pci_dev, address, val, len); >> + switch (len) { >> + case 4: >> + direct_mask =3D >> + pci_get_long(assigned_dev->direct_config_write + address)= ; >> + full_access =3D 0xffffffff; >> + break; >> + case 2: >> + direct_mask =3D >> + pci_get_word(assigned_dev->direct_config_write + address)= ; >> + full_access =3D 0xffff; >> break; >> + case 1: >> + direct_mask =3D >> + pci_get_byte(assigned_dev->direct_config_write + address)= ; >> + full_access =3D 0xff; >> + break; >> + default: >> + abort(); >> + } >> + if (direct_mask !=3D full_access) { >> + val &=3D direct_mask; >> + val |=3D assigned_dev_pci_read(pci_dev, address, len) & ~dire= ct_mask; >> } >> + assigned_dev_pci_write(pci_dev, address, val, len); >=20 > I think we should not write at all for fully virtualized fields > (direct_mask =3D=3D 0). E.g. PCI BARs. Yes, Alex caught that as well. Fixed already. >=20 >> } >> =20 >> static int assigned_device_pci_cap_init(PCIDevice *pci_dev) >> @@ -1404,6 +1247,10 @@ static int assigned_device_pci_cap_init(PCIDevi= ce *pci_dev) >> return ret; >> } >> =20 >> + /* direct read except for next cap pointer */ >> + memset(dev->direct_config_read + pos, 0xff, PCI_PM_SIZEOF); >> + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] =3D 0; >> + >> pmc =3D pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS); >> pmc &=3D (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI); >> pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc); >> @@ -1438,6 +1285,10 @@ static int assigned_device_pci_cap_init(PCIDevi= ce *pci_dev) >> return ret; >> } >> =20 >> + /* direct read except for next cap pointer */ >> + memset(dev->direct_config_read + pos, 0xff, size); >> + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] =3D 0; >> + >> type =3D pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);= >> type =3D (type & PCI_EXP_FLAGS_TYPE) >> 8; >> if (type !=3D PCI_EXP_TYPE_ENDPOINT && >> @@ -1513,6 +1364,10 @@ static int assigned_device_pci_cap_init(PCIDevi= ce *pci_dev) >> return ret; >> } >> =20 >> + /* direct read except for next cap pointer */ >> + memset(dev->direct_config_read + pos, 0xff, 8); >> + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] =3D 0; >> + >> /* Command register, clear upper bits, including extended mod= es */ >> cmd =3D pci_get_word(pci_dev->config + pos + PCI_X_CMD); >> cmd &=3D (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_R= EAD | >> @@ -1534,6 +1389,13 @@ static int assigned_device_pci_cap_init(PCIDevi= ce *pci_dev) >> if ((ret =3D pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos,= 8)) < 0) { >> return ret; >> } >> + >> + /* direct read except for next cap pointer */ >> + memset(dev->direct_config_read + pos, 0xff, 8); >> + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] =3D 0; >> + >> + /* direct write for cap content */ >> + memset(dev->direct_config_write + pos + 2, 0xff, 6); >> } >> =20 >> /* Devices can have multiple vendor capabilities, get them all */= >> @@ -1545,6 +1407,21 @@ static int assigned_device_pci_cap_init(PCIDevi= ce *pci_dev) >> pos, len)) < 0) { >> return ret; >> } >> + >> + /* direct read except for next cap pointer */ >> + memset(dev->direct_config_read + pos, 0xff, len); >> + dev->direct_config_read[pos + PCI_CAP_LIST_NEXT] =3D 0; >=20 > Add a function for the above 3 lines? >=20 >> + >> + /* direct write for cap content */ >> + memset(dev->direct_config_write + pos + 2, 0xff, len - 2); >> + } >> + >> + /* If real and virtual capability list status bits differ, virtua= lize the >> + * access. */ >> + if ((pci_get_word(pci_dev->config + PCI_STATUS) & PCI_STATUS_CAP_= LIST) !=3D >> + (assigned_dev_pci_read_byte(pci_dev, PCI_STATUS) & >> + PCI_STATUS_CAP_LIST)) { >> + dev->direct_config_read[PCI_STATUS] &=3D ~PCI_STATUS_CAP_LIST= ; >> } >=20 > Is this an optimization? What harm does it do to virtualize always? Just replicating the old behavior, but I can virtualize it unconditionall= y. >=20 >> =20 >> return 0; >> @@ -1694,6 +1571,26 @@ static int assigned_initfn(struct PCIDevice *pc= i_dev) >> return -1; >> } >> =20 >> + /* >> + * Set up basic config space access control. Will be further refi= ned during >> + * device initialization. >> + * >> + * Direct read/write access is grangted for: >=20 > typo >=20 >> + * - status & command registers, revision ID & class code, cache= line size, >> + * latency timer, header type, BIST >> + * - cardbus CIS pointer, subsystem vendor & ID >> + * - reserved fields >> + * - Min_Gnt & Max_Lat >> + */ >> + memset(dev->direct_config_read + PCI_COMMAND, 0xff, 12); >> + memset(dev->direct_config_read + PCI_CARDBUS_CIS, 0xff, 8); >> + memset(dev->direct_config_read + PCI_CAPABILITY_LIST + 1, 0xff, 7= ); >> + memset(dev->direct_config_read + PCI_MIN_GNT, 0xff, 2); >> + memset(dev->direct_config_write + PCI_COMMAND, 0xff, 12); >> + memset(dev->direct_config_write + PCI_CARDBUS_CIS, 0xff, 8); >> + memset(dev->direct_config_write + PCI_CAPABILITY_LIST + 1, 0xff, = 7); >> + memset(dev->direct_config_write + PCI_MIN_GNT, 0xff, 2); >> + >> if (get_real_device(dev, dev->host.seg, dev->host.bus, >> dev->host.dev, dev->host.func)) { >> error_report("pci-assign: Error: Couldn't get real device (%s= )!", >> diff --git a/hw/device-assignment.h b/hw/device-assignment.h >> index 3d20207..ed5defb 100644 >> --- a/hw/device-assignment.h >> +++ b/hw/device-assignment.h >> @@ -104,12 +104,13 @@ typedef struct AssignedDevice { >> #define ASSIGNED_DEVICE_MSIX_MASKED (1 << 2) >> uint32_t state; >> } cap; >> + uint8_t direct_config_read[PCI_CONFIG_SPACE_SIZE]; >> + uint8_t direct_config_write[PCI_CONFIG_SPACE_SIZE]; >=20 > What direct means isn't obvious. Add a comment? Ok. >=20 > We could revert the logic, have bits for > emulate_config_read/emulate_config_write. I think that most of the > registers can safely be read/written directly, so protecting relevant > bits will be less work. Is that true? I intentionally chose whitelisting to avoid accidentally granting access to forgotten fields. I don't think it's a good idea to switch to blacklisting. Jan --------------enig47767714B30A7EC7FA65DC48 Content-Type: application/pgp-signature; name="signature.asc" Content-Description: OpenPGP digital signature Content-Disposition: attachment; filename="signature.asc" -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.15 (GNU/Linux) Comment: Using GnuPG with SUSE - http://enigmail.mozdev.org/ iEYEARECAAYFAk4JjkwACgkQitSsb3rl5xS8lACaAztomr7ZchjXxbBW5K9ZT2gX jawAnA8aIku+QN45AAH47g7a9YJnD+bE =aKAF -----END PGP SIGNATURE----- --------------enig47767714B30A7EC7FA65DC48--