* [Qemu-devel] [PATCH qemu v11 01/11] vmstate: Define VARRAY with VMS_ALLOC
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
@ 2015-07-15 9:44 ` Alexey Kardashevskiy
2015-07-15 9:44 ` [Qemu-devel] [PATCH qemu v11 02/11] spapr_pci: Convert finish_realize() to dma_capabilities_update()+dma_init_window() Alexey Kardashevskiy
` (9 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:44 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
This allows dynamic allocation for migrating arrays.
Already existing VMSTATE_VARRAY_UINT32 requires an array to be
pre-allocated, however there are cases when the size is not known in
advance and there is no real need to enforce it.
This defines another variant of VMSTATE_VARRAY_UINT32 with the VMS_ALLOC
flag, which tells the receiving side to allocate memory for the array
before receiving the data.
The first user of it is a dynamic DMA window whose existence and size
are totally dynamic.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Thomas Huth <thuth@redhat.com>
---
include/migration/vmstate.h | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 0695d7c..5881d9f 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -295,6 +295,16 @@ extern const VMStateInfo vmstate_info_bitmap;
.offset = vmstate_offset_pointer(_state, _field, _type), \
}
+#define VMSTATE_VARRAY_UINT32_ALLOC(_field, _state, _field_num, _version, _info, _type) {\
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .num_offset = vmstate_offset_value(_state, _field_num, uint32_t),\
+ .info = &(_info), \
+ .size = sizeof(_type), \
+ .flags = VMS_VARRAY_UINT32|VMS_POINTER|VMS_ALLOC, \
+ .offset = vmstate_offset_pointer(_state, _field, _type), \
+}
+
#define VMSTATE_VARRAY_UINT16_UNSAFE(_field, _state, _field_num, _version, _info, _type) {\
.name = (stringify(_field)), \
.version_id = (_version), \
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 02/11] spapr_pci: Convert finish_realize() to dma_capabilities_update()+dma_init_window()
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
2015-07-15 9:44 ` [Qemu-devel] [PATCH qemu v11 01/11] vmstate: Define VARRAY with VMS_ALLOC Alexey Kardashevskiy
@ 2015-07-15 9:44 ` Alexey Kardashevskiy
2015-07-15 9:44 ` [Qemu-devel] [PATCH qemu v11 03/11] spapr_iommu: Move table allocation to helpers Alexey Kardashevskiy
` (8 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:44 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
This reworks finish_realize() which used to finalize DMA setup with
an assumption that it will not change later.
The new callbacks support various window parameters such as page and
window sizes. The new callbacks return an error code rather than Error**.
This is a mechanical change so no change in behaviour is expected.
This is a part of getting rid of spapr-pci-vfio-host-bridge type.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
---
Changes:
v8:
* moved spapr_phb_dma_capabilities_update() higher to avoid forward
declaration in following patches and keep DMA code together (i.e. next
to spapr_pci_dma_iommu())
---
hw/ppc/spapr_pci.c | 59 ++++++++++++++++++++++++++-------------------
hw/ppc/spapr_pci_vfio.c | 53 ++++++++++++++++------------------------
include/hw/pci-host/spapr.h | 8 +++++-
3 files changed, 62 insertions(+), 58 deletions(-)
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index cfd3b7b..f302e92 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -810,6 +810,28 @@ static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev)
return buf;
}
+static int spapr_phb_dma_capabilities_update(sPAPRPHBState *sphb)
+{
+ sphb->dma32_window_start = 0;
+ sphb->dma32_window_size = SPAPR_PCI_DMA32_SIZE;
+
+ return 0;
+}
+
+static int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
+ uint32_t liobn, uint32_t page_shift,
+ uint64_t window_size)
+{
+ uint64_t bus_offset = sphb->dma32_window_start;
+ sPAPRTCETable *tcet;
+
+ tcet = spapr_tce_new_table(DEVICE(sphb), liobn, bus_offset, page_shift,
+ window_size >> page_shift,
+ false);
+
+ return tcet ? 0 : -1;
+}
+
/* Macros to operate with address in OF binding to PCI */
#define b_x(x, p, l) (((x) & ((1<<(l))-1)) << (p))
#define b_n(x) b_x((x), 31, 1) /* 0 if relocatable */
@@ -1222,6 +1244,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
int i;
PCIBus *bus;
uint64_t msi_window_size = 4096;
+ sPAPRTCETable *tcet;
if (sphb->index != (uint32_t)-1) {
hwaddr windows_base;
@@ -1371,33 +1394,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
}
}
- if (!info->finish_realize) {
- error_setg(errp, "finish_realize not defined");
- return;
- }
-
- info->finish_realize(sphb, errp);
-
- sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
-}
-
-static void spapr_phb_finish_realize(sPAPRPHBState *sphb, Error **errp)
-{
- sPAPRTCETable *tcet;
- uint32_t nb_table;
-
- nb_table = SPAPR_PCI_DMA32_SIZE >> SPAPR_TCE_PAGE_SHIFT;
- tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn,
- 0, SPAPR_TCE_PAGE_SHIFT, nb_table, false);
+ info->dma_capabilities_update(sphb);
+ info->dma_init_window(sphb, sphb->dma_liobn, SPAPR_TCE_PAGE_SHIFT,
+ sphb->dma32_window_size);
+ tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
if (!tcet) {
- error_setg(errp, "Unable to create TCE table for %s",
- sphb->dtbusname);
- return ;
+ error_setg(errp, "failed to create TCE table");
+ return;
}
-
- /* Register default 32bit DMA window */
- memory_region_add_subregion(&sphb->iommu_root, 0,
+ memory_region_add_subregion(&sphb->iommu_root, tcet->bus_offset,
spapr_tce_get_iommu(tcet));
+
+ sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
}
static int spapr_phb_children_reset(Object *child, void *opaque)
@@ -1545,9 +1553,10 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data)
dc->vmsd = &vmstate_spapr_pci;
set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
dc->cannot_instantiate_with_device_add_yet = false;
- spc->finish_realize = spapr_phb_finish_realize;
hp->plug = spapr_phb_hot_plug_child;
hp->unplug = spapr_phb_hot_unplug_child;
+ spc->dma_capabilities_update = spapr_phb_dma_capabilities_update;
+ spc->dma_init_window = spapr_phb_dma_init_window;
}
static const TypeInfo spapr_phb_info = {
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index cca45ed..6e3e17b 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -28,48 +28,36 @@ static Property spapr_phb_vfio_properties[] = {
DEFINE_PROP_END_OF_LIST(),
};
-static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp)
+static int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb)
{
sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
int ret;
- sPAPRTCETable *tcet;
- uint32_t liobn = svphb->phb.dma_liobn;
- if (svphb->iommugroupid == -1) {
- error_setg(errp, "Wrong IOMMU group ID %d", svphb->iommugroupid);
- return;
- }
-
- ret = vfio_container_ioctl(&svphb->phb.iommu_as, svphb->iommugroupid,
- VFIO_CHECK_EXTENSION,
- (void *) VFIO_SPAPR_TCE_IOMMU);
- if (ret != 1) {
- error_setg_errno(errp, -ret,
- "spapr-vfio: SPAPR extension is not supported");
- return;
- }
-
- ret = vfio_container_ioctl(&svphb->phb.iommu_as, svphb->iommugroupid,
+ ret = vfio_container_ioctl(&sphb->iommu_as, svphb->iommugroupid,
VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
if (ret) {
- error_setg_errno(errp, -ret,
- "spapr-vfio: get info from container failed");
- return;
+ return ret;
}
- tcet = spapr_tce_new_table(DEVICE(sphb), liobn, info.dma32_window_start,
- SPAPR_TCE_PAGE_SHIFT,
- info.dma32_window_size >> SPAPR_TCE_PAGE_SHIFT,
+ sphb->dma32_window_start = info.dma32_window_start;
+ sphb->dma32_window_size = info.dma32_window_size;
+
+ return ret;
+}
+
+static int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb,
+ uint32_t liobn, uint32_t page_shift,
+ uint64_t window_size)
+{
+ uint64_t bus_offset = sphb->dma32_window_start;
+ sPAPRTCETable *tcet;
+
+ tcet = spapr_tce_new_table(DEVICE(sphb), liobn, bus_offset, page_shift,
+ window_size >> page_shift,
true);
- if (!tcet) {
- error_setg(errp, "spapr-vfio: failed to create VFIO TCE table");
- return;
- }
- /* Register default 32bit DMA window */
- memory_region_add_subregion(&sphb->iommu_root, tcet->bus_offset,
- spapr_tce_get_iommu(tcet));
+ return tcet ? 0 : -1;
}
static void spapr_phb_vfio_eeh_reenable(sPAPRPHBVFIOState *svphb)
@@ -257,7 +245,8 @@ static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
dc->props = spapr_phb_vfio_properties;
dc->reset = spapr_phb_vfio_reset;
- spc->finish_realize = spapr_phb_vfio_finish_realize;
+ spc->dma_capabilities_update = spapr_phb_vfio_dma_capabilities_update;
+ spc->dma_init_window = spapr_phb_vfio_dma_init_window;
spc->eeh_set_option = spapr_phb_vfio_eeh_set_option;
spc->eeh_get_state = spapr_phb_vfio_eeh_get_state;
spc->eeh_reset = spapr_phb_vfio_eeh_reset;
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 5322b56..b6d5719 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -48,7 +48,10 @@ typedef struct sPAPRPHBVFIOState sPAPRPHBVFIOState;
struct sPAPRPHBClass {
PCIHostBridgeClass parent_class;
- void (*finish_realize)(sPAPRPHBState *sphb, Error **errp);
+ int (*dma_capabilities_update)(sPAPRPHBState *sphb);
+ int (*dma_init_window)(sPAPRPHBState *sphb,
+ uint32_t liobn, uint32_t page_shift,
+ uint64_t window_size);
int (*eeh_set_option)(sPAPRPHBState *sphb, unsigned int addr, int option);
int (*eeh_get_state)(sPAPRPHBState *sphb, int *state);
int (*eeh_reset)(sPAPRPHBState *sphb, int option);
@@ -90,6 +93,9 @@ struct sPAPRPHBState {
int32_t msi_devs_num;
spapr_pci_msi_mig *msi_devs;
+ uint32_t dma32_window_start;
+ uint32_t dma32_window_size;
+
QLIST_ENTRY(sPAPRPHBState) list;
};
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 03/11] spapr_iommu: Move table allocation to helpers
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
2015-07-15 9:44 ` [Qemu-devel] [PATCH qemu v11 01/11] vmstate: Define VARRAY with VMS_ALLOC Alexey Kardashevskiy
2015-07-15 9:44 ` [Qemu-devel] [PATCH qemu v11 02/11] spapr_pci: Convert finish_realize() to dma_capabilities_update()+dma_init_window() Alexey Kardashevskiy
@ 2015-07-15 9:44 ` Alexey Kardashevskiy
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 04/11] spapr_iommu: Introduce "enabled" state for TCE table Alexey Kardashevskiy
` (7 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:44 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
At the moment the presence of vfio-pci devices on a bus affects the way
the guest view table is allocated. If there is no vfio-pci on a PHB
and the host kernel supports KVM acceleration of H_PUT_TCE, a table
is allocated in KVM. However, if there is vfio-pci and we do not yet have
KVM acceleration for these, the table has to be allocated by
the userspace. At the moment the table is allocated once at boot time
but the next patches will reallocate it.
This moves kvmppc_create_spapr_tce/g_malloc0 and their counterparts
to helpers.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
hw/ppc/spapr_iommu.c | 58 +++++++++++++++++++++++++++++++++++-----------------
trace-events | 2 +-
2 files changed, 40 insertions(+), 20 deletions(-)
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index f61504e..0cf5010 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -74,6 +74,37 @@ static IOMMUAccessFlags spapr_tce_iommu_access_flags(uint64_t tce)
}
}
+static uint64_t *spapr_tce_alloc_table(uint32_t liobn,
+ uint32_t nb_table,
+ uint32_t page_shift,
+ int *fd,
+ bool vfio_accel)
+{
+ uint64_t *table = NULL;
+ uint64_t window_size = (uint64_t)nb_table << page_shift;
+
+ if (kvm_enabled() && !(window_size >> 32)) {
+ table = kvmppc_create_spapr_tce(liobn, window_size, fd, vfio_accel);
+ }
+
+ if (!table) {
+ *fd = -1;
+ table = g_malloc0(nb_table * sizeof(uint64_t));
+ }
+
+ trace_spapr_iommu_alloc_table(liobn, table, *fd);
+
+ return table;
+}
+
+static void spapr_tce_free_table(uint64_t *table, int fd, uint32_t nb_table)
+{
+ if (!kvm_enabled() ||
+ (kvmppc_remove_spapr_tce(table, fd, nb_table) != 0)) {
+ g_free(table);
+ }
+}
+
/* Called from RCU critical section */
static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr,
bool is_write)
@@ -140,21 +171,13 @@ static MemoryRegionIOMMUOps spapr_iommu_ops = {
static int spapr_tce_table_realize(DeviceState *dev)
{
sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev);
- uint64_t window_size = (uint64_t)tcet->nb_table << tcet->page_shift;
- if (kvm_enabled() && !(window_size >> 32)) {
- tcet->table = kvmppc_create_spapr_tce(tcet->liobn,
- window_size,
- &tcet->fd,
- tcet->vfio_accel);
- }
-
- if (!tcet->table) {
- size_t table_size = tcet->nb_table * sizeof(uint64_t);
- tcet->table = g_malloc0(table_size);
- }
-
- trace_spapr_iommu_new_table(tcet->liobn, tcet, tcet->table, tcet->fd);
+ tcet->fd = -1;
+ tcet->table = spapr_tce_alloc_table(tcet->liobn,
+ tcet->nb_table,
+ tcet->page_shift,
+ &tcet->fd,
+ tcet->vfio_accel);
memory_region_init_iommu(&tcet->iommu, OBJECT(dev), &spapr_iommu_ops,
"iommu-spapr",
@@ -208,11 +231,8 @@ static void spapr_tce_table_unrealize(DeviceState *dev, Error **errp)
QLIST_REMOVE(tcet, list);
- if (!kvm_enabled() ||
- (kvmppc_remove_spapr_tce(tcet->table, tcet->fd,
- tcet->nb_table) != 0)) {
- g_free(tcet->table);
- }
+ spapr_tce_free_table(tcet->table, tcet->fd, tcet->nb_table);
+ tcet->fd = -1;
}
MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet)
diff --git a/trace-events b/trace-events
index d24d80a..f2e2cc0 100644
--- a/trace-events
+++ b/trace-events
@@ -1362,7 +1362,7 @@ spapr_iommu_pci_get(uint64_t liobn, uint64_t ioba, uint64_t ret, uint64_t tce) "
spapr_iommu_pci_indirect(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t iobaN, uint64_t tceN, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tcelist=0x%"PRIx64" iobaN=0x%"PRIx64" tceN=0x%"PRIx64" ret=%"PRId64
spapr_iommu_pci_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce_value, uint64_t npages, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tcevalue=0x%"PRIx64" npages=%"PRId64" ret=%"PRId64
spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
-spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) "liobn=%"PRIx64" tcet=%p table=%p fd=%d"
+spapr_iommu_alloc_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d"
# hw/ppc/ppc.c
ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 04/11] spapr_iommu: Introduce "enabled" state for TCE table
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
` (2 preceding siblings ...)
2015-07-15 9:44 ` [Qemu-devel] [PATCH qemu v11 03/11] spapr_iommu: Move table allocation to helpers Alexey Kardashevskiy
@ 2015-07-15 9:45 ` Alexey Kardashevskiy
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 05/11] spapr_iommu: Remove vfio_accel flag from sPAPRTCETable Alexey Kardashevskiy
` (6 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:45 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
Currently TCE tables are created once at start and their size never
changes. We are going to change that by introducing Dynamic DMA windows
support, where the DMA configuration may change during guest execution.
This changes spapr_tce_new_table() to create an empty stub object. Only
LIOBN is assigned by the time of creation. It still will be called once
at the owner object (VIO or PHB) creation.
This introduces an "enabled" state for TCE table objects with two
helper functions - spapr_tce_table_enable()/spapr_tce_table_disable().
spapr_tce_table_enable() receives TCE table parameters and allocates
a guest view of the TCE table (in the user space or KVM).
spapr_tce_table_disable() disposes the table.
Follow up patches will disable+enable tables on reset (system reset
or DDW reset).
No visible change in behaviour is expected except the actual table
will be reallocated every reset. We might optimize this later.
The other way to implement this would be to dynamically create/remove
the TCE table QOM objects but this would make migration impossible
as migration expects all QOM objects to exist at the receiver,
so we have to have TCE table objects created when migration begins.
spapr_tce_table_do_enable() is separated from spapr_tce_table_enable()
as later it will be called at the sPAPRTCETable post-migration stage when
it has all the properties set after the migration.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v10:
* check if zero size window was requested and return an error; coming
DDW patch will check for this
v9 (no changes really):
* IOMMU regions are referenced by their parent which is the PHB root region,
there is no need for explicit unparenting so ignore first note from v8 changelog.
v8:
* add missing unparent_object() to spapr_tce_table_unrealize() (parenting
is made by memory_region_init_iommu)
* tcet->iommu is alive as long as sPAPRTCETable is,
memory_region_set_size() is used to enable/disable MR
v7:
* s'tmp[64]'tmp[32]' as we need less than 64 bytes and more than 16 bytes
and 32 is the closest power-of-two (it just looks nicer to have power-of-two
values)
* updated commit log about having spapr_tce_table_do_enable() split
from spapr_tce_table_enable()
v6:
* got rid of set_props()
---
hw/ppc/spapr_iommu.c | 79 +++++++++++++++++++++++++++++++++++--------------
hw/ppc/spapr_pci.c | 19 ++++++++----
hw/ppc/spapr_pci_vfio.c | 10 +++----
hw/ppc/spapr_vio.c | 9 +++---
include/hw/ppc/spapr.h | 11 +++----
5 files changed, 85 insertions(+), 43 deletions(-)
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 0cf5010..fbca136 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -173,15 +173,9 @@ static int spapr_tce_table_realize(DeviceState *dev)
sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev);
tcet->fd = -1;
- tcet->table = spapr_tce_alloc_table(tcet->liobn,
- tcet->nb_table,
- tcet->page_shift,
- &tcet->fd,
- tcet->vfio_accel);
memory_region_init_iommu(&tcet->iommu, OBJECT(dev), &spapr_iommu_ops,
- "iommu-spapr",
- (uint64_t)tcet->nb_table << tcet->page_shift);
+ "iommu-spapr", 0);
QLIST_INSERT_HEAD(&spapr_tce_tables, tcet, list);
@@ -191,14 +185,10 @@ static int spapr_tce_table_realize(DeviceState *dev)
return 0;
}
-sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn,
- uint64_t bus_offset,
- uint32_t page_shift,
- uint32_t nb_table,
- bool vfio_accel)
+sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn)
{
sPAPRTCETable *tcet;
- char tmp[64];
+ char tmp[32];
if (spapr_tce_find_by_liobn(liobn)) {
fprintf(stderr, "Attempted to create TCE table with duplicate"
@@ -206,16 +196,8 @@ sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn,
return NULL;
}
- if (!nb_table) {
- return NULL;
- }
-
tcet = SPAPR_TCE_TABLE(object_new(TYPE_SPAPR_TCE_TABLE));
tcet->liobn = liobn;
- tcet->bus_offset = bus_offset;
- tcet->page_shift = page_shift;
- tcet->nb_table = nb_table;
- tcet->vfio_accel = vfio_accel;
snprintf(tmp, sizeof(tmp), "tce-table-%x", liobn);
object_property_add_child(OBJECT(owner), tmp, OBJECT(tcet), NULL);
@@ -225,14 +207,65 @@ sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn,
return tcet;
}
+static void spapr_tce_table_do_enable(sPAPRTCETable *tcet)
+{
+ if (!tcet->nb_table) {
+ return;
+ }
+
+ tcet->table = spapr_tce_alloc_table(tcet->liobn,
+ tcet->nb_table,
+ tcet->page_shift,
+ &tcet->fd,
+ tcet->vfio_accel);
+
+ memory_region_set_size(&tcet->iommu,
+ (uint64_t)tcet->nb_table << tcet->page_shift);
+
+ tcet->enabled = true;
+}
+
+void spapr_tce_table_enable(sPAPRTCETable *tcet,
+ uint64_t bus_offset, uint32_t page_shift,
+ uint32_t nb_table, bool vfio_accel)
+{
+ if (tcet->enabled) {
+ return;
+ }
+
+ tcet->bus_offset = bus_offset;
+ tcet->page_shift = page_shift;
+ tcet->nb_table = nb_table;
+ tcet->vfio_accel = vfio_accel;
+
+ spapr_tce_table_do_enable(tcet);
+}
+
+void spapr_tce_table_disable(sPAPRTCETable *tcet)
+{
+ if (!tcet->enabled) {
+ return;
+ }
+
+ memory_region_set_size(&tcet->iommu, 0);
+
+ spapr_tce_free_table(tcet->table, tcet->fd, tcet->nb_table);
+ tcet->fd = -1;
+ tcet->table = NULL;
+ tcet->enabled = false;
+ tcet->bus_offset = 0;
+ tcet->page_shift = 0;
+ tcet->nb_table = 0;
+ tcet->vfio_accel = false;
+}
+
static void spapr_tce_table_unrealize(DeviceState *dev, Error **errp)
{
sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev);
QLIST_REMOVE(tcet, list);
- spapr_tce_free_table(tcet->table, tcet->fd, tcet->nb_table);
- tcet->fd = -1;
+ spapr_tce_table_disable(tcet);
}
MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet)
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index f302e92..4b1bde5 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -823,13 +823,16 @@ static int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
uint64_t window_size)
{
uint64_t bus_offset = sphb->dma32_window_start;
- sPAPRTCETable *tcet;
+ sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
+ uint32_t nb_table = window_size >> page_shift;
- tcet = spapr_tce_new_table(DEVICE(sphb), liobn, bus_offset, page_shift,
- window_size >> page_shift,
- false);
+ if (!nb_table) {
+ return -1;
+ }
- return tcet ? 0 : -1;
+ spapr_tce_table_enable(tcet, bus_offset, page_shift, nb_table, false);
+
+ return 0;
}
/* Macros to operate with address in OF binding to PCI */
@@ -1394,6 +1397,12 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
}
}
+ tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
+ if (!tcet) {
+ error_setg(errp, "failed to create TCE table");
+ return;
+ }
+
info->dma_capabilities_update(sphb);
info->dma_init_window(sphb, sphb->dma_liobn, SPAPR_TCE_PAGE_SHIFT,
sphb->dma32_window_size);
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index 6e3e17b..69d85ab 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -51,13 +51,13 @@ static int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb,
uint64_t window_size)
{
uint64_t bus_offset = sphb->dma32_window_start;
- sPAPRTCETable *tcet;
+ sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
- tcet = spapr_tce_new_table(DEVICE(sphb), liobn, bus_offset, page_shift,
- window_size >> page_shift,
- true);
+ spapr_tce_table_enable(tcet, bus_offset, page_shift,
+ window_size >> page_shift,
+ true);
- return tcet ? 0 : -1;
+ return 0;
}
static void spapr_phb_vfio_eeh_reenable(sPAPRPHBVFIOState *svphb)
diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c
index c51eb8e..912fa06 100644
--- a/hw/ppc/spapr_vio.c
+++ b/hw/ppc/spapr_vio.c
@@ -479,11 +479,10 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp)
memory_region_add_subregion_overlap(&dev->mrroot, 0, &dev->mrbypass, 1);
address_space_init(&dev->as, &dev->mrroot, qdev->id);
- dev->tcet = spapr_tce_new_table(qdev, liobn,
- 0,
- SPAPR_TCE_PAGE_SHIFT,
- pc->rtce_window_size >>
- SPAPR_TCE_PAGE_SHIFT, false);
+ dev->tcet = spapr_tce_new_table(qdev, liobn);
+ spapr_tce_table_enable(dev->tcet, 0, SPAPR_TCE_PAGE_SHIFT,
+ pc->rtce_window_size >> SPAPR_TCE_PAGE_SHIFT,
+ false);
dev->tcet->vdev = dev;
memory_region_add_subregion_overlap(&dev->mrroot, 0,
spapr_tce_get_iommu(dev->tcet), 2);
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 154b853..46d18e5 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -555,6 +555,7 @@ typedef struct sPAPRTCETable sPAPRTCETable;
struct sPAPRTCETable {
DeviceState parent;
+ bool enabled;
uint32_t liobn;
uint32_t nb_table;
uint64_t bus_offset;
@@ -582,11 +583,11 @@ void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq);
int spapr_h_cas_compose_response(sPAPRMachineState *sm,
target_ulong addr, target_ulong size,
bool cpu_update, bool memory_update);
-sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn,
- uint64_t bus_offset,
- uint32_t page_shift,
- uint32_t nb_table,
- bool vfio_accel);
+sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn);
+void spapr_tce_table_enable(sPAPRTCETable *tcet,
+ uint64_t bus_offset, uint32_t page_shift,
+ uint32_t nb_table, bool vfio_accel);
+void spapr_tce_table_disable(sPAPRTCETable *tcet);
MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet);
int spapr_dma_dt(void *fdt, int node_off, const char *propname,
uint32_t liobn, uint64_t window, uint32_t size);
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 05/11] spapr_iommu: Remove vfio_accel flag from sPAPRTCETable
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
` (3 preceding siblings ...)
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 04/11] spapr_iommu: Introduce "enabled" state for TCE table Alexey Kardashevskiy
@ 2015-07-15 9:45 ` Alexey Kardashevskiy
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 06/11] spapr_iommu: Add root memory region Alexey Kardashevskiy
` (5 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:45 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
sPAPRTCETable has a vfio_accel flag which is passed to
kvmppc_create_spapr_tce() and controls whether to create a guest view
table in KVM as this depends on the host kernel ability to accelerate
H_PUT_TCE for VFIO devices. We would set this flag at the moment
when sPAPRTCETable is created in spapr_tce_new_table() and
use it when the table is allocated in spapr_tce_table_realize().
Now we explicitly enable/disable DMA windows via spapr_tce_table_enable()
and spapr_tce_table_disable() and can pass this flag directly without
caching it in sPAPRTCETable.
This removes the flag. This should cause no behavioural change.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
---
Changes:
v8:
* new to patchset, this is cleanup
---
hw/ppc/spapr_iommu.c | 8 +++-----
include/hw/ppc/spapr.h | 1 -
2 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index fbca136..1378a7a 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -207,7 +207,7 @@ sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn)
return tcet;
}
-static void spapr_tce_table_do_enable(sPAPRTCETable *tcet)
+static void spapr_tce_table_do_enable(sPAPRTCETable *tcet, bool vfio_accel)
{
if (!tcet->nb_table) {
return;
@@ -217,7 +217,7 @@ static void spapr_tce_table_do_enable(sPAPRTCETable *tcet)
tcet->nb_table,
tcet->page_shift,
&tcet->fd,
- tcet->vfio_accel);
+ vfio_accel);
memory_region_set_size(&tcet->iommu,
(uint64_t)tcet->nb_table << tcet->page_shift);
@@ -236,9 +236,8 @@ void spapr_tce_table_enable(sPAPRTCETable *tcet,
tcet->bus_offset = bus_offset;
tcet->page_shift = page_shift;
tcet->nb_table = nb_table;
- tcet->vfio_accel = vfio_accel;
- spapr_tce_table_do_enable(tcet);
+ spapr_tce_table_do_enable(tcet, vfio_accel);
}
void spapr_tce_table_disable(sPAPRTCETable *tcet)
@@ -256,7 +255,6 @@ void spapr_tce_table_disable(sPAPRTCETable *tcet)
tcet->bus_offset = 0;
tcet->page_shift = 0;
tcet->nb_table = 0;
- tcet->vfio_accel = false;
}
static void spapr_tce_table_unrealize(DeviceState *dev, Error **errp)
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 46d18e5..7f76fb8 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -562,7 +562,6 @@ struct sPAPRTCETable {
uint32_t page_shift;
uint64_t *table;
bool bypass;
- bool vfio_accel;
int fd;
MemoryRegion iommu;
struct VIOsPAPRDevice *vdev; /* for @bypass migration compatibility only */
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 06/11] spapr_iommu: Add root memory region
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
` (4 preceding siblings ...)
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 05/11] spapr_iommu: Remove vfio_accel flag from sPAPRTCETable Alexey Kardashevskiy
@ 2015-07-15 9:45 ` Alexey Kardashevskiy
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 07/11] spapr_pci: Do complete reset of DMA config when resetting PHB Alexey Kardashevskiy
` (4 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:45 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
We are going to have multiple DMA windows at different offsets on
a PCI bus. For the sake of migration, we will pre-create as many TCE table
objects as there are windows supported.
So we need a way to map windows dynamically onto a PCI bus
when migration of a table is completed. However, at this stage a TCE table
object does not have access to a PHB to ask it to map a DMA window
backed by the just-migrated TCE table.
This adds a "root" memory region (UINT64_MAX long) to the TCE object.
This new region is mapped on a PCI bus with overlapping enabled as
there will be one root MR per TCE table, each of them mapped at 0.
The actual IOMMU memory region is a subregion of the root region and
a TCE table enables/disables this subregion and maps it at
the specific offset inside the root MR which is 1:1 mapping of
a PCI address space.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Thomas Huth <thuth@redhat.com>
---
hw/ppc/spapr_iommu.c | 13 ++++++++++---
hw/ppc/spapr_pci.c | 2 +-
include/hw/ppc/spapr.h | 2 +-
3 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 1378a7a..45c00d8 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -171,11 +171,16 @@ static MemoryRegionIOMMUOps spapr_iommu_ops = {
static int spapr_tce_table_realize(DeviceState *dev)
{
sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev);
+ Object *tcetobj = OBJECT(tcet);
+ char tmp[32];
tcet->fd = -1;
- memory_region_init_iommu(&tcet->iommu, OBJECT(dev), &spapr_iommu_ops,
- "iommu-spapr", 0);
+ snprintf(tmp, sizeof(tmp), "tce-root-%x", tcet->liobn);
+ memory_region_init(&tcet->root, tcetobj, tmp, UINT64_MAX);
+
+ snprintf(tmp, sizeof(tmp), "tce-iommu-%x", tcet->liobn);
+ memory_region_init_iommu(&tcet->iommu, tcetobj, &spapr_iommu_ops, tmp, 0);
QLIST_INSERT_HEAD(&spapr_tce_tables, tcet, list);
@@ -221,6 +226,7 @@ static void spapr_tce_table_do_enable(sPAPRTCETable *tcet, bool vfio_accel)
memory_region_set_size(&tcet->iommu,
(uint64_t)tcet->nb_table << tcet->page_shift);
+ memory_region_add_subregion(&tcet->root, tcet->bus_offset, &tcet->iommu);
tcet->enabled = true;
}
@@ -246,6 +252,7 @@ void spapr_tce_table_disable(sPAPRTCETable *tcet)
return;
}
+ memory_region_del_subregion(&tcet->root, &tcet->iommu);
memory_region_set_size(&tcet->iommu, 0);
spapr_tce_free_table(tcet->table, tcet->fd, tcet->nb_table);
@@ -268,7 +275,7 @@ static void spapr_tce_table_unrealize(DeviceState *dev, Error **errp)
MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet)
{
- return &tcet->iommu;
+ return &tcet->root;
}
static void spapr_tce_reset(DeviceState *dev)
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 4b1bde5..6fe00d4 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1411,7 +1411,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
error_setg(errp, "failed to create TCE table");
return;
}
- memory_region_add_subregion(&sphb->iommu_root, tcet->bus_offset,
+ memory_region_add_subregion(&sphb->iommu_root, 0,
spapr_tce_get_iommu(tcet));
sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 7f76fb8..d4b3d3a 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -563,7 +563,7 @@ struct sPAPRTCETable {
uint64_t *table;
bool bypass;
int fd;
- MemoryRegion iommu;
+ MemoryRegion root, iommu;
struct VIOsPAPRDevice *vdev; /* for @bypass migration compatibility only */
QLIST_ENTRY(sPAPRTCETable) list;
};
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 07/11] spapr_pci: Do complete reset of DMA config when resetting PHB
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
` (5 preceding siblings ...)
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 06/11] spapr_iommu: Add root memory region Alexey Kardashevskiy
@ 2015-07-15 9:45 ` Alexey Kardashevskiy
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 08/11] spapr_vfio_pci: Remove redundant spapr-pci-vfio-host-bridge Alexey Kardashevskiy
` (3 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:45 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
On a system reset, DMA configuration has to be reset too. At the moment
it clears the table content. This is enough for the single table case
but with DDW, we will also have to disable all DMA windows except
the default one. Furthermore according to sPAPR, if the guest removed
the default window and created a huge one at the same zero offset on
a PCI bus, the reset handler has to recreate the default window with
the default properties (2GB big, 4K pages).
This reworks SPAPR PHB code to disable the existing DMA window on reset
and then configure and enable the default window.
Without DDW that means that the same window will be disabled and then
enabled with no other change in behaviour.
This changes the table creation to do it in one place in PHB (VFIO PHB
just inherits the behaviour from PHB). The actual table allocation is
done from the reset handler and this is where dma_init_window() is called.
This disables all DMA windows on a PHB reset. It does not make any
difference now as there is just one DMA window but it will later with DDW
patches.
This makes spapr_phb_dma_reset() and spapr_phb_dma_remove_window() public
as these will be used in DDW RTAS "ibm,reset-pe-dma-window" and
"ibm,remove-pe-dma-window" handlers later; the handlers will reside in
hw/ppc/spapr_rtas_ddw.c.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
v9:
* as spapr_phb_vfio_reset() became not empty, this does not remove it but
adds spapr_phb_dma_reset() call
* added SPAPR_PCI_DMA_MAX_WINDOWS (was in
"spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)")
* object_child_foreach() is replaced with explicit loop over DMA windows
as later in the patchset we will be doing same loop and there the order
will matter (small windows should be enumerated first)
v7:
* s'finish_realize'dma_init_window' in the commit log
* added details (initial clause about reuse was there :) )
why exactly spapr_phb_dma_remove_window is public
---
hw/ppc/spapr_pci.c | 42 +++++++++++++++++++++++++++++++++---------
hw/ppc/spapr_pci_vfio.c | 4 ++++
include/hw/pci-host/spapr.h | 5 +++++
3 files changed, 42 insertions(+), 9 deletions(-)
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 6fe00d4..6df3a46 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -835,6 +835,35 @@ static int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
return 0;
}
+int spapr_phb_dma_remove_window(sPAPRPHBState *sphb,
+ sPAPRTCETable *tcet)
+{
+ spapr_tce_table_disable(tcet);
+
+ return 0;
+}
+
+int spapr_phb_dma_reset(sPAPRPHBState *sphb)
+{
+ int i;
+ sPAPRTCETable *tcet;
+ sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb);
+
+ spc->dma_capabilities_update(sphb); /* Refresh @has_vfio status */
+
+ for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+ tcet = spapr_tce_find_by_liobn(SPAPR_PCI_LIOBN(sphb->index, i));
+ if (tcet) {
+ spapr_phb_dma_remove_window(sphb, tcet);
+ }
+ }
+
+ spc->dma_init_window(sphb, SPAPR_PCI_LIOBN(sphb->index, 0),
+ SPAPR_TCE_PAGE_SHIFT, sphb->dma32_window_size);
+
+ return 0;
+}
+
/* Macros to operate with address in OF binding to PCI */
#define b_x(x, p, l) (((x) & ((1<<(l))-1)) << (p))
#define b_n(x) b_x((x), 31, 1) /* 0 if relocatable */
@@ -1242,7 +1271,6 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
SysBusDevice *s = SYS_BUS_DEVICE(dev);
sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
PCIHostState *phb = PCI_HOST_BRIDGE(s);
- sPAPRPHBClass *info = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(s);
char *namebuf;
int i;
PCIBus *bus;
@@ -1403,14 +1431,6 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
return;
}
- info->dma_capabilities_update(sphb);
- info->dma_init_window(sphb, sphb->dma_liobn, SPAPR_TCE_PAGE_SHIFT,
- sphb->dma32_window_size);
- tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
- if (!tcet) {
- error_setg(errp, "failed to create TCE table");
- return;
- }
memory_region_add_subregion(&sphb->iommu_root, 0,
spapr_tce_get_iommu(tcet));
@@ -1430,6 +1450,10 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
static void spapr_phb_reset(DeviceState *qdev)
{
+ sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(qdev);
+
+ spapr_phb_dma_reset(sphb);
+
/* Reset the IOMMU state */
object_child_foreach(OBJECT(qdev), spapr_phb_children_reset, NULL);
}
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index 69d85ab..cf5483a 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -73,6 +73,10 @@ static void spapr_phb_vfio_eeh_reenable(sPAPRPHBVFIOState *svphb)
static void spapr_phb_vfio_reset(DeviceState *qdev)
{
+ sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(qdev);
+
+ spapr_phb_dma_reset(sphb);
+
/*
* The PE might be in frozen state. To reenable the EEH
* functionality on it will clean the frozen state, which
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index b6d5719..fff868e 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -123,6 +123,8 @@ struct sPAPRPHBVFIOState {
#define SPAPR_PCI_DMA32_SIZE 0x40000000
+#define SPAPR_PCI_DMA_MAX_WINDOWS 1
+
static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin)
{
sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
@@ -143,5 +145,8 @@ void spapr_pci_rtas_init(void);
sPAPRPHBState *spapr_pci_find_phb(sPAPRMachineState *spapr, uint64_t buid);
PCIDevice *spapr_pci_find_dev(sPAPRMachineState *spapr, uint64_t buid,
uint32_t config_addr);
+int spapr_phb_dma_remove_window(sPAPRPHBState *sphb,
+ sPAPRTCETable *tcet);
+int spapr_phb_dma_reset(sPAPRPHBState *sphb);
#endif /* __HW_SPAPR_PCI_H__ */
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 08/11] spapr_vfio_pci: Remove redundant spapr-pci-vfio-host-bridge
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
` (6 preceding siblings ...)
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 07/11] spapr_pci: Do complete reset of DMA config when resetting PHB Alexey Kardashevskiy
@ 2015-07-15 9:45 ` Alexey Kardashevskiy
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 09/11] spapr_pci: Enable vfio-pci hotplug Alexey Kardashevskiy
` (2 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:45 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
sPAPRTCETable is handling 2 TCE tables already:
1) guest view of the TCE table - emulated devices use only this table;
2) hardware IOMMU table - VFIO PCI devices use it for actual work but
it does not replace 1) and it is not visible to the guest.
The initialization of this table is driven by vfio-pci device,
DMA map/unmap requests are handled via MemoryListener so there is very
little to do in spapr-pci-vfio-host-bridge.
This moves VFIO bits to the generic spapr-pci-host-bridge which allows
putting emulated and VFIO devices on the same PHB. It is still possible
to create multiple PHBs and avoid sharing PHB resources for emulated and
VFIO devices.
If there is no VFIO-PCI device attached, no special ioctls will be called.
If there are some VFIO-PCI devices attached, PHB may refuse to attach
another VFIO-PCI device if a VFIO container on the host kernel side
does not support container sharing.
This changes spapr-pci-host-bridge to support properties of
spapr-pci-vfio-host-bridge. This makes spapr-pci-vfio-host-bridge type
equal to spapr-pci-host-bridge except it has an additional "iommu"
property for backward compatibility reasons.
This moves PCI device lookup from spapr_phb_vfio_eeh_set_option() to
rtas_ibm_set_eeh_option() as we need to know if the device is "vfio-pci"
and decide whether to call spapr_phb_vfio_eeh_set_option() or not.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v11:
* fixed compilation for non-linux case: spapr_pci_vfio.c now always
compiles and has stubs when no CONFIG_LINUX
* replaced @has_vfio flag with a number of vfio-pci devices
v9:
* s'iommugroupid shall not be used'iommugroupid is deprecated and will be ignored'
in error log
v8:
* call spapr_phb_vfio_eeh_set_option() on vfio-pci devices only (reported by Gavin)
---
hw/ppc/Makefile.objs | 5 +-
hw/ppc/spapr_pci.c | 85 +++++++++++-------------------
hw/ppc/spapr_pci_vfio.c | 122 +++++++++++++++++++-------------------------
include/hw/pci-host/spapr.h | 25 ++++-----
4 files changed, 95 insertions(+), 142 deletions(-)
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index c8ab06e..6c06fcf 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -3,10 +3,7 @@ obj-y += ppc.o ppc_booke.o
# IBM pSeries (sPAPR)
obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
-obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o
-ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
-obj-y += spapr_pci_vfio.o
-endif
+obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_pci_vfio.o spapr_rtc.o spapr_drc.o
# PowerPC 4xx boards
obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
obj-y += ppc4xx_pci.o
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 6df3a46..25ee7d2 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -430,7 +430,6 @@ static void rtas_ibm_set_eeh_option(PowerPCCPU *cpu,
target_ulong rets)
{
sPAPRPHBState *sphb;
- sPAPRPHBClass *spc;
PCIDevice *pdev;
uint32_t addr, option;
uint64_t buid;
@@ -445,7 +444,7 @@ static void rtas_ibm_set_eeh_option(PowerPCCPU *cpu,
option = rtas_ld(args, 3);
sphb = spapr_pci_find_phb(spapr, buid);
- if (!sphb) {
+ if (!sphb || (sphb->vfio_num == 0)) {
goto param_error_exit;
}
@@ -455,12 +454,7 @@ static void rtas_ibm_set_eeh_option(PowerPCCPU *cpu,
goto param_error_exit;
}
- spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb);
- if (!spc->eeh_set_option) {
- goto param_error_exit;
- }
-
- ret = spc->eeh_set_option(sphb, addr, option);
+ ret = spapr_phb_vfio_eeh_set_option(sphb, pdev, option);
rtas_st(rets, 0, ret);
return;
@@ -475,7 +469,6 @@ static void rtas_ibm_get_config_addr_info2(PowerPCCPU *cpu,
target_ulong rets)
{
sPAPRPHBState *sphb;
- sPAPRPHBClass *spc;
PCIDevice *pdev;
uint32_t addr, option;
uint64_t buid;
@@ -486,12 +479,7 @@ static void rtas_ibm_get_config_addr_info2(PowerPCCPU *cpu,
buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
sphb = spapr_pci_find_phb(spapr, buid);
- if (!sphb) {
- goto param_error_exit;
- }
-
- spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb);
- if (!spc->eeh_set_option) {
+ if (!sphb || (sphb->vfio_num == 0)) {
goto param_error_exit;
}
@@ -531,7 +519,6 @@ static void rtas_ibm_read_slot_reset_state2(PowerPCCPU *cpu,
target_ulong rets)
{
sPAPRPHBState *sphb;
- sPAPRPHBClass *spc;
uint64_t buid;
int state, ret;
@@ -541,16 +528,11 @@ static void rtas_ibm_read_slot_reset_state2(PowerPCCPU *cpu,
buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
sphb = spapr_pci_find_phb(spapr, buid);
- if (!sphb) {
+ if (!sphb || (sphb->vfio_num == 0)) {
goto param_error_exit;
}
- spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb);
- if (!spc->eeh_get_state) {
- goto param_error_exit;
- }
-
- ret = spc->eeh_get_state(sphb, &state);
+ ret = spapr_phb_vfio_eeh_get_state(sphb, &state);
rtas_st(rets, 0, ret);
if (ret != RTAS_OUT_SUCCESS) {
return;
@@ -575,7 +557,6 @@ static void rtas_ibm_set_slot_reset(PowerPCCPU *cpu,
target_ulong rets)
{
sPAPRPHBState *sphb;
- sPAPRPHBClass *spc;
uint32_t option;
uint64_t buid;
int ret;
@@ -587,16 +568,11 @@ static void rtas_ibm_set_slot_reset(PowerPCCPU *cpu,
buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
option = rtas_ld(args, 3);
sphb = spapr_pci_find_phb(spapr, buid);
- if (!sphb) {
+ if (!sphb || (sphb->vfio_num == 0)) {
goto param_error_exit;
}
- spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb);
- if (!spc->eeh_reset) {
- goto param_error_exit;
- }
-
- ret = spc->eeh_reset(sphb, option);
+ ret = spapr_phb_vfio_eeh_reset(sphb, option);
rtas_st(rets, 0, ret);
return;
@@ -611,7 +587,6 @@ static void rtas_ibm_configure_pe(PowerPCCPU *cpu,
target_ulong rets)
{
sPAPRPHBState *sphb;
- sPAPRPHBClass *spc;
uint64_t buid;
int ret;
@@ -621,16 +596,11 @@ static void rtas_ibm_configure_pe(PowerPCCPU *cpu,
buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
sphb = spapr_pci_find_phb(spapr, buid);
- if (!sphb) {
+ if (!sphb || (sphb->vfio_num == 0)) {
goto param_error_exit;
}
- spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb);
- if (!spc->eeh_configure) {
- goto param_error_exit;
- }
-
- ret = spc->eeh_configure(sphb);
+ ret = spapr_phb_vfio_eeh_configure(sphb);
rtas_st(rets, 0, ret);
return;
@@ -646,7 +616,6 @@ static void rtas_ibm_slot_error_detail(PowerPCCPU *cpu,
target_ulong rets)
{
sPAPRPHBState *sphb;
- sPAPRPHBClass *spc;
int option;
uint64_t buid;
@@ -656,12 +625,7 @@ static void rtas_ibm_slot_error_detail(PowerPCCPU *cpu,
buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
sphb = spapr_pci_find_phb(spapr, buid);
- if (!sphb) {
- goto param_error_exit;
- }
-
- spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb);
- if (!spc->eeh_set_option) {
+ if (!sphb || (sphb->vfio_num == 0)) {
goto param_error_exit;
}
@@ -815,6 +779,10 @@ static int spapr_phb_dma_capabilities_update(sPAPRPHBState *sphb)
sphb->dma32_window_start = 0;
sphb->dma32_window_size = SPAPR_PCI_DMA32_SIZE;
+ if (sphb->vfio_num > 0) {
+ spapr_phb_vfio_dma_capabilities_update(sphb);
+ }
+
return 0;
}
@@ -830,7 +798,8 @@ static int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
return -1;
}
- spapr_tce_table_enable(tcet, bus_offset, page_shift, nb_table, false);
+ spapr_tce_table_enable(tcet, bus_offset, page_shift, nb_table,
+ sphb->vfio_num > 0);
return 0;
}
@@ -847,9 +816,8 @@ int spapr_phb_dma_reset(sPAPRPHBState *sphb)
{
int i;
sPAPRTCETable *tcet;
- sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb);
- spc->dma_capabilities_update(sphb); /* Refresh @has_vfio status */
+ spapr_phb_dma_capabilities_update(sphb);
for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
tcet = spapr_tce_find_by_liobn(SPAPR_PCI_LIOBN(sphb->index, i));
@@ -858,8 +826,8 @@ int spapr_phb_dma_reset(sPAPRPHBState *sphb)
}
}
- spc->dma_init_window(sphb, SPAPR_PCI_LIOBN(sphb->index, 0),
- SPAPR_TCE_PAGE_SHIFT, sphb->dma32_window_size);
+ spapr_phb_dma_init_window(sphb, SPAPR_PCI_LIOBN(sphb->index, 0),
+ SPAPR_TCE_PAGE_SHIFT, sphb->dma32_window_size);
return 0;
}
@@ -1277,6 +1245,11 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
uint64_t msi_window_size = 4096;
sPAPRTCETable *tcet;
+ if ((sphb->iommugroupid != -1) &&
+ object_dynamic_cast(OBJECT(sphb), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE)) {
+ error_report("Warning: iommugroupid is deprecated and will be ignored");
+ }
+
if (sphb->index != (uint32_t)-1) {
hwaddr windows_base;
@@ -1452,6 +1425,9 @@ static void spapr_phb_reset(DeviceState *qdev)
{
sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(qdev);
+ if (sphb->vfio_num > 0) {
+ spapr_phb_vfio_eeh_reenable(sphb);
+ }
spapr_phb_dma_reset(sphb);
/* Reset the IOMMU state */
@@ -1576,7 +1552,6 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data)
{
PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
DeviceClass *dc = DEVICE_CLASS(klass);
- sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
HotplugHandlerClass *hp = HOTPLUG_HANDLER_CLASS(klass);
hc->root_bus_path = spapr_phb_root_bus_path;
@@ -1588,8 +1563,6 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data)
dc->cannot_instantiate_with_device_add_yet = false;
hp->plug = spapr_phb_hot_plug_child;
hp->unplug = spapr_phb_hot_unplug_child;
- spc->dma_capabilities_update = spapr_phb_dma_capabilities_update;
- spc->dma_init_window = spapr_phb_dma_init_window;
}
static const TypeInfo spapr_phb_info = {
@@ -1635,6 +1608,10 @@ static void spapr_populate_pci_devices_dt(PCIBus *bus, PCIDevice *pdev,
return;
}
+ if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) {
+ ++p->sphb->vfio_num;
+ }
+
if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
PCI_HEADER_TYPE_BRIDGE)) {
return;
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index cf5483a..999ac3a 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -20,21 +20,22 @@
#include "hw/ppc/spapr.h"
#include "hw/pci-host/spapr.h"
#include "hw/pci/msix.h"
-#include "linux/vfio.h"
#include "hw/vfio/vfio.h"
+#ifdef CONFIG_LINUX
+#include "linux/vfio.h"
+
static Property spapr_phb_vfio_properties[] = {
- DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1),
+ DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
DEFINE_PROP_END_OF_LIST(),
};
-static int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb)
+int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb)
{
- sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
int ret;
- ret = vfio_container_ioctl(&sphb->iommu_as, svphb->iommugroupid,
+ ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
if (ret) {
return ret;
@@ -46,50 +47,27 @@ static int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb)
return ret;
}
-static int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb,
- uint32_t liobn, uint32_t page_shift,
- uint64_t window_size)
-{
- uint64_t bus_offset = sphb->dma32_window_start;
- sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
- spapr_tce_table_enable(tcet, bus_offset, page_shift,
- window_size >> page_shift,
- true);
-
- return 0;
-}
-
-static void spapr_phb_vfio_eeh_reenable(sPAPRPHBVFIOState *svphb)
+void spapr_phb_vfio_eeh_reenable(sPAPRPHBState *sphb)
{
struct vfio_eeh_pe_op op = {
.argsz = sizeof(op),
.op = VFIO_EEH_PE_ENABLE
};
- vfio_container_ioctl(&svphb->phb.iommu_as,
- svphb->iommugroupid, VFIO_EEH_PE_OP, &op);
-}
-
-static void spapr_phb_vfio_reset(DeviceState *qdev)
-{
- sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(qdev);
-
- spapr_phb_dma_reset(sphb);
-
/*
* The PE might be in frozen state. To reenable the EEH
* functionality on it will clean the frozen state, which
* ensures that the contained PCI devices will work properly
* after reboot.
*/
- spapr_phb_vfio_eeh_reenable(SPAPR_PCI_VFIO_HOST_BRIDGE(qdev));
+ vfio_container_ioctl(&sphb->iommu_as,
+ sphb->iommugroupid, VFIO_EEH_PE_OP, &op);
}
-static int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
- unsigned int addr, int option)
+int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
+ PCIDevice *pdev, int option)
{
- sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
struct vfio_eeh_pe_op op = { .argsz = sizeof(op) };
int ret;
@@ -97,25 +75,9 @@ static int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
case RTAS_EEH_DISABLE:
op.op = VFIO_EEH_PE_DISABLE;
break;
- case RTAS_EEH_ENABLE: {
- PCIHostState *phb;
- PCIDevice *pdev;
-
- /*
- * The EEH functionality is enabled on basis of PCI device,
- * instead of PE. We need check the validity of the PCI
- * device address.
- */
- phb = PCI_HOST_BRIDGE(sphb);
- pdev = pci_find_device(phb->bus,
- (addr >> 16) & 0xFF, (addr >> 8) & 0xFF);
- if (!pdev) {
- return RTAS_OUT_PARAM_ERROR;
- }
-
+ case RTAS_EEH_ENABLE:
op.op = VFIO_EEH_PE_ENABLE;
break;
- }
case RTAS_EEH_THAW_IO:
op.op = VFIO_EEH_PE_UNFREEZE_IO;
break;
@@ -126,7 +88,7 @@ static int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
return RTAS_OUT_PARAM_ERROR;
}
- ret = vfio_container_ioctl(&svphb->phb.iommu_as, svphb->iommugroupid,
+ ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
VFIO_EEH_PE_OP, &op);
if (ret < 0) {
return RTAS_OUT_HW_ERROR;
@@ -135,14 +97,13 @@ static int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
return RTAS_OUT_SUCCESS;
}
-static int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state)
+int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state)
{
- sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
struct vfio_eeh_pe_op op = { .argsz = sizeof(op) };
int ret;
op.op = VFIO_EEH_PE_GET_STATE;
- ret = vfio_container_ioctl(&svphb->phb.iommu_as, svphb->iommugroupid,
+ ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
VFIO_EEH_PE_OP, &op);
if (ret < 0) {
return RTAS_OUT_PARAM_ERROR;
@@ -195,9 +156,8 @@ static void spapr_phb_vfio_eeh_pre_reset(sPAPRPHBState *sphb)
pci_for_each_bus(phb->bus, spapr_phb_vfio_eeh_clear_bus_msix, NULL);
}
-static int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option)
+int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option)
{
- sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
struct vfio_eeh_pe_op op = { .argsz = sizeof(op) };
int ret;
@@ -217,7 +177,7 @@ static int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option)
return RTAS_OUT_PARAM_ERROR;
}
- ret = vfio_container_ioctl(&svphb->phb.iommu_as, svphb->iommugroupid,
+ ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
VFIO_EEH_PE_OP, &op);
if (ret < 0) {
return RTAS_OUT_HW_ERROR;
@@ -226,14 +186,13 @@ static int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option)
return RTAS_OUT_SUCCESS;
}
-static int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb)
+int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb)
{
- sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb);
struct vfio_eeh_pe_op op = { .argsz = sizeof(op) };
int ret;
op.op = VFIO_EEH_PE_CONFIGURE;
- ret = vfio_container_ioctl(&svphb->phb.iommu_as, svphb->iommugroupid,
+ ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
VFIO_EEH_PE_OP, &op);
if (ret < 0) {
return RTAS_OUT_PARAM_ERROR;
@@ -245,22 +204,14 @@ static int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb)
static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
- sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass);
dc->props = spapr_phb_vfio_properties;
- dc->reset = spapr_phb_vfio_reset;
- spc->dma_capabilities_update = spapr_phb_vfio_dma_capabilities_update;
- spc->dma_init_window = spapr_phb_vfio_dma_init_window;
- spc->eeh_set_option = spapr_phb_vfio_eeh_set_option;
- spc->eeh_get_state = spapr_phb_vfio_eeh_get_state;
- spc->eeh_reset = spapr_phb_vfio_eeh_reset;
- spc->eeh_configure = spapr_phb_vfio_eeh_configure;
}
static const TypeInfo spapr_phb_vfio_info = {
.name = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE,
.parent = TYPE_SPAPR_PCI_HOST_BRIDGE,
- .instance_size = sizeof(sPAPRPHBVFIOState),
+ .instance_size = sizeof(sPAPRPHBState),
.class_init = spapr_phb_vfio_class_init,
.class_size = sizeof(sPAPRPHBClass),
};
@@ -271,3 +222,36 @@ static void spapr_pci_vfio_register_types(void)
}
type_init(spapr_pci_vfio_register_types)
+
+#else /* !CONFIG_LINUX */
+
+int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb)
+{
+ return -1;
+}
+
+int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
+ PCIDevice *pdev, int option)
+{
+ return RTAS_OUT_HW_ERROR;
+}
+
+int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state)
+{
+ return RTAS_OUT_HW_ERROR;
+}
+
+int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option)
+{
+ return RTAS_OUT_HW_ERROR;
+}
+
+int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb)
+{
+ return RTAS_OUT_HW_ERROR;
+}
+
+void spapr_phb_vfio_eeh_reenable(sPAPRPHBState *sphb)
+{
+}
+#endif
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index fff868e..2819e96 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -47,15 +47,6 @@ typedef struct sPAPRPHBVFIOState sPAPRPHBVFIOState;
struct sPAPRPHBClass {
PCIHostBridgeClass parent_class;
-
- int (*dma_capabilities_update)(sPAPRPHBState *sphb);
- int (*dma_init_window)(sPAPRPHBState *sphb,
- uint32_t liobn, uint32_t page_shift,
- uint64_t window_size);
- int (*eeh_set_option)(sPAPRPHBState *sphb, unsigned int addr, int option);
- int (*eeh_get_state)(sPAPRPHBState *sphb, int *state);
- int (*eeh_reset)(sPAPRPHBState *sphb, int option);
- int (*eeh_configure)(sPAPRPHBState *sphb);
};
typedef struct spapr_pci_msi {
@@ -95,16 +86,12 @@ struct sPAPRPHBState {
uint32_t dma32_window_start;
uint32_t dma32_window_size;
+ unsigned vfio_num;
+ int32_t iommugroupid; /* obsolete */
QLIST_ENTRY(sPAPRPHBState) list;
};
-struct sPAPRPHBVFIOState {
- sPAPRPHBState phb;
-
- int32_t iommugroupid;
-};
-
#define SPAPR_PCI_MAX_INDEX 255
#define SPAPR_PCI_BASE_BUID 0x800000020000000ULL
@@ -149,4 +136,12 @@ int spapr_phb_dma_remove_window(sPAPRPHBState *sphb,
sPAPRTCETable *tcet);
int spapr_phb_dma_reset(sPAPRPHBState *sphb);
+int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb);
+int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
+ PCIDevice *pdev, int option);
+int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state);
+int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option);
+int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb);
+void spapr_phb_vfio_eeh_reenable(sPAPRPHBState *sphb);
+
#endif /* __HW_SPAPR_PCI_H__ */
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 09/11] spapr_pci: Enable vfio-pci hotplug
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
` (7 preceding siblings ...)
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 08/11] spapr_vfio_pci: Remove redundant spapr-pci-vfio-host-bridge Alexey Kardashevskiy
@ 2015-07-15 9:45 ` Alexey Kardashevskiy
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 10/11] spapr_pci_vfio: Enable multiple groups per container Alexey Kardashevskiy
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 11/11] spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW) Alexey Kardashevskiy
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:45 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
sPAPR IOMMU is managing two copies of a TCE table:
1) a guest view of the table - this is what emulated devices use and
this is where H_GET_TCE reads from;
2) a hardware TCE table - only present if there is at least one vfio-pci
device on a PHB; it is updated via a memory listener on a PHB address
space which forwards map/unmap requests to vfio-pci IOMMU host driver.
At the moment the presence of vfio-pci devices on a bus affects the way
the guest view table is allocated. If there is no vfio-pci on a PHB
and the host kernel supports KVM acceleration of H_PUT_TCE, a table
is allocated in KVM. However, if there is vfio-pci and we do not yet
support KVM acceleration for these, the table has to be allocated
by the userspace.
When vfio-pci device is hotplugged and there were no vfio-pci devices
already, the guest view table could have been allocated by KVM which
means that H_PUT_TCE is handled by the host kernel and since we
do not support vfio-pci in KVM, the hardware table will not be updated.
This reallocates the guest view table in QEMU if the first vfio-pci
device has just been plugged. spapr_tce_realloc() handles this. When
the last vfio-pci device is unplugged, this tries reallocating the TCE table
in KVM.
This replays all the mappings to make sure that the tables are in sync.
This will not have a visible effect though as for a new device
the guest kernel will allocate-and-map new addresses and therefore
existing mappings from emulated devices will not be used by vfio-pci
devices.
This adds calls to spapr_phb_dma_capabilities_update() in PCI hotplug
hooks.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v11:
* removed one level of indent in spapr_phb_hotplug_dma_sync
* removed RCU handler and used @vfio_num counter instead as the actual
presence of vfio-pci devices matters, not containers
* s/spapr_tce_realloc_userspace/spapr_tce_realloc/ and support reallocation
to KVM space in order to reenable acceleration; this is also necessary
for in-kernel acceleration for VFIO - we will drop IOMMU group references
by reallocating the KVM table
v10:
* removed unnecessary memory_region_del_subregion() and
memory_region_add_subregion() as
"vfio: Unregister IOMMU notifiers when container is destroyed" removes
notifiers in a more correct way
v9:
* spapr_phb_hotplug_dma_sync() enumerates TCE tables explicitly rather than
via object_child_foreach()
* spapr_phb_hotplug_dma_sync() does memory_region_del_subregion() +
memory_region_add_subregion() as otherwise vfio_listener_region_del() is not
called and we end up with vfio_iommu_map_notify registered twice (comments welcome!)
if we do hotplug+hotunplug+hotplug of the same device.
* moved spapr_phb_hotplug_dma_sync() on unplug event to rcu as before calling
spapr_phb_hotplug_dma_sync(), we need VFIO to release the container, otherwise
spapr_phb_dma_capabilities_update() will decide that the PHB still has VFIO device.
Actual VFIO PCI device release happens from rcu and since we add ours later,
it gets executed later and we are good.
---
hw/ppc/spapr_iommu.c | 71 +++++++++++++++++++++++++++++++++++++++++++--
hw/ppc/spapr_pci.c | 60 ++++++++++++++++++++++++++++++++++++++
include/hw/pci-host/spapr.h | 1 +
include/hw/ppc/spapr.h | 3 ++
trace-events | 2 ++
5 files changed, 134 insertions(+), 3 deletions(-)
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 45c00d8..54ab727 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -78,12 +78,13 @@ static uint64_t *spapr_tce_alloc_table(uint32_t liobn,
uint32_t nb_table,
uint32_t page_shift,
int *fd,
- bool vfio_accel)
+ bool vfio_accel,
+ bool force_userspace)
{
uint64_t *table = NULL;
uint64_t window_size = (uint64_t)nb_table << page_shift;
- if (kvm_enabled() && !(window_size >> 32)) {
+ if (kvm_enabled() && !force_userspace && !(window_size >> 32)) {
table = kvmppc_create_spapr_tce(liobn, window_size, fd, vfio_accel);
}
@@ -222,7 +223,8 @@ static void spapr_tce_table_do_enable(sPAPRTCETable *tcet, bool vfio_accel)
tcet->nb_table,
tcet->page_shift,
&tcet->fd,
- vfio_accel);
+ vfio_accel,
+ false);
memory_region_set_size(&tcet->iommu,
(uint64_t)tcet->nb_table << tcet->page_shift);
@@ -495,6 +497,69 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
return 0;
}
+static int spapr_tce_do_replay(sPAPRTCETable *tcet, uint64_t *table)
+{
+ target_ulong ioba = tcet->bus_offset, pgsz = (1ULL << tcet->page_shift);
+ long i, ret = 0;
+
+ for (i = 0; i < tcet->nb_table; ++i, ioba += pgsz) {
+ ret = put_tce_emu(tcet, ioba, table[i]);
+ if (ret) {
+ break;
+ }
+ }
+
+ return ret;
+}
+
+int spapr_tce_replay(sPAPRTCETable *tcet)
+{
+ return spapr_tce_do_replay(tcet, tcet->table);
+}
+
+int spapr_tce_realloc(sPAPRTCETable *tcet, bool vfio_accel,
+ bool force_userspace)
+{
+ int ret, oldfd;
+ uint64_t *oldtable;
+
+ if (force_userspace) {
+ oldtable = tcet->table;
+ oldfd = tcet->fd;
+ } else {
+ unsigned long cb = tcet->nb_table * sizeof(uint64_t);
+ /*
+ * We might be trying to reallocate KVM table.
+ * KVM_CREATE_SPAPR_TCE handler checks for LIOBN and fails if
+ * is registered. Store KVM table locally and destroy the KVM table.
+ */
+ oldtable = g_malloc0(cb);
+ oldfd = -1;
+ memcpy(oldtable, tcet->table, cb);
+ spapr_tce_free_table(tcet->table, tcet->fd, tcet->nb_table);
+ }
+
+ tcet->table = spapr_tce_alloc_table(tcet->liobn,
+ tcet->nb_table,
+ tcet->page_shift,
+ &tcet->fd,
+ vfio_accel,
+ force_userspace);
+ if (!tcet->table) {
+ return -ENOMEM;
+ }
+
+ ret = spapr_tce_do_replay(tcet, oldtable);
+
+ if (force_userspace) {
+ spapr_tce_free_table(oldtable, oldfd, tcet->nb_table);
+ } else {
+ g_free(oldtable);
+ }
+
+ return ret;
+}
+
int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
sPAPRTCETable *tcet)
{
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 25ee7d2..98d93fa 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -832,6 +832,43 @@ int spapr_phb_dma_reset(sPAPRPHBState *sphb)
return 0;
}
+static int spapr_phb_hotplug_dma_sync(sPAPRPHBState *sphb)
+{
+ int ret = 0, i;
+ sPAPRTCETable *tcet;
+
+ spapr_phb_dma_capabilities_update(sphb);
+
+ for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+ tcet = spapr_tce_find_by_liobn(SPAPR_PCI_LIOBN(sphb->index, i));
+ if (!tcet || !tcet->enabled) {
+ continue;
+ }
+ if ((tcet->fd >= 0) && (sphb->vfio_num > 0)) {
+ /*
+ * We got first vfio-pci device on accelerated table.
+ * VFIO acceleration is not possible.
+ * Reallocate table in userspace and replay mappings.
+ */
+ ret = spapr_tce_realloc(tcet, true, true);
+ trace_spapr_pci_dma_realloc_update(tcet->liobn, ret);
+ } else if ((tcet->fd < 0) && (sphb->vfio_num > 0)) {
+ /* There was no acceleration, so just replay mappings. */
+ ret = spapr_tce_replay(tcet);
+ trace_spapr_pci_dma_update(tcet->liobn, ret);
+ } else if ((tcet->fd < 0) && (sphb->vfio_num == 0)) {
+ /* Last vfio-pci device is gone, try enabling in-kernel table */
+ ret = spapr_tce_realloc(tcet, false, false);
+ trace_spapr_pci_dma_update(tcet->liobn, ret);
+ }
+ if (ret) {
+ break;
+ }
+ }
+
+ return ret;
+}
+
/* Macros to operate with address in OF binding to PCI */
#define b_x(x, p, l) (((x) & ((1<<(l))-1)) << (p))
#define b_n(x) b_x((x), 31, 1) /* 0 if relocatable */
@@ -1111,6 +1148,14 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc,
error_setg(errp, "Failed to create pci child device tree node");
goto out;
}
+ if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) {
+ unsigned vfio_num = phb->vfio_num;
+
+ ++phb->vfio_num;
+ if (vfio_num == 0) {
+ spapr_phb_hotplug_dma_sync(phb);
+ }
+ }
}
drck->attach(drc, DEVICE(pdev),
@@ -1123,6 +1168,9 @@ out:
static void spapr_phb_remove_pci_device_cb(DeviceState *dev, void *opaque)
{
+ bool do_sync = false;
+ sPAPRPHBState *phb = opaque;
+
/* some version guests do not wait for completion of a device
* cleanup (generally done asynchronously by the kernel) before
* signaling to QEMU that the device is safe, but instead sleep
@@ -1134,7 +1182,19 @@ static void spapr_phb_remove_pci_device_cb(DeviceState *dev, void *opaque)
* an 'idle' state, as the device cleanup code expects.
*/
pci_device_reset(PCI_DEVICE(dev));
+
+ /* Check if it the last vfio-pci device on a PHB while it is still alive */
+ if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
+ --phb->vfio_num;
+ do_sync = phb->vfio_num == 0;
+ }
+
object_unparent(OBJECT(dev));
+
+ /* Update DMA config, the last vfio-pci might or might not be gone by now */
+ if (do_sync) {
+ spapr_phb_hotplug_dma_sync(phb);
+ }
}
static void spapr_phb_remove_pci_device(sPAPRDRConnector *drc,
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 2819e96..02708d9 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -61,6 +61,7 @@ typedef struct spapr_pci_msi_mig {
struct sPAPRPHBState {
PCIHostState parent_obj;
+ struct rcu_head rcu;
uint32_t index;
uint64_t buid;
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index d4b3d3a..8553592 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -592,6 +592,9 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
uint32_t liobn, uint64_t window, uint32_t size);
int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
sPAPRTCETable *tcet);
+int spapr_tce_replay(sPAPRTCETable *tcet);
+int spapr_tce_realloc(sPAPRTCETable *tcet, bool vfio_accel,
+ bool force_userspace);
void spapr_pci_switch_vga(bool big_endian);
void spapr_hotplug_req_add_event(sPAPRDRConnector *drc);
void spapr_hotplug_req_remove_event(sPAPRDRConnector *drc);
diff --git a/trace-events b/trace-events
index f2e2cc0..9d133d1 100644
--- a/trace-events
+++ b/trace-events
@@ -1300,6 +1300,8 @@ spapr_pci_rtas_ibm_query_interrupt_source_number(unsigned ioa, unsigned intr) "q
spapr_pci_msi_write(uint64_t addr, uint64_t data, uint32_t dt_irq) "@%"PRIx64"<=%"PRIx64" IRQ %u"
spapr_pci_lsi_set(const char *busname, int pin, uint32_t irq) "%s PIN%d IRQ %u"
spapr_pci_msi_retry(unsigned config_addr, unsigned req_num, unsigned max_irqs) "Guest device at %x asked %u, have only %u"
+spapr_pci_dma_update(uint64_t liobn, long ret) "liobn=%"PRIx64" ret=%ld"
+spapr_pci_dma_realloc_update(uint64_t liobn, long ret) "liobn=%"PRIx64" tcet=%ld"
# hw/pci/pci.c
pci_update_mappings_del(void *d, uint32_t bus, uint32_t func, uint32_t slot, int bar, uint64_t addr, uint64_t size) "d=%p %02x:%02x.%x %d,%#"PRIx64"+%#"PRIx64
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 10/11] spapr_pci_vfio: Enable multiple groups per container
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
` (8 preceding siblings ...)
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 09/11] spapr_pci: Enable vfio-pci hotplug Alexey Kardashevskiy
@ 2015-07-15 9:45 ` Alexey Kardashevskiy
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 11/11] spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW) Alexey Kardashevskiy
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:45 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
This enables multiple IOMMU groups in one VFIO container which means
that multiple devices from different groups can share the same IOMMU
table (or tables if DDW).
This removes a group id from vfio_container_ioctl(). The kernel support
is required for this; if the host kernel does not have the support,
it will allow only one group per container. The PHB's "iommuid" property
is ignored. The ioctl is called for every container attached to
the address space. At the moment there is just one container anyway.
If there is no container attached to the address space,
vfio_container_do_ioctl() returns -1.
This removes casts to sPAPRPHBVFIOState as none of sPAPRPHBVFIOState
members is accessed here.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Thomas Huth <thuth@redhat.com>
---
hw/ppc/spapr_pci_vfio.c | 17 ++++++-----------
hw/vfio/common.c | 21 ++++++---------------
include/hw/vfio/vfio.h | 3 +--
3 files changed, 13 insertions(+), 28 deletions(-)
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index 999ac3a..b744c65 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -35,7 +35,7 @@ int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb)
struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
int ret;
- ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
+ ret = vfio_container_ioctl(&sphb->iommu_as,
VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
if (ret) {
return ret;
@@ -61,8 +61,7 @@ void spapr_phb_vfio_eeh_reenable(sPAPRPHBState *sphb)
* ensures that the contained PCI devices will work properly
* after reboot.
*/
- vfio_container_ioctl(&sphb->iommu_as,
- sphb->iommugroupid, VFIO_EEH_PE_OP, &op);
+ vfio_container_ioctl(&sphb->iommu_as, VFIO_EEH_PE_OP, &op);
}
int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
@@ -88,8 +87,7 @@ int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
return RTAS_OUT_PARAM_ERROR;
}
- ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
- VFIO_EEH_PE_OP, &op);
+ ret = vfio_container_ioctl(&sphb->iommu_as, VFIO_EEH_PE_OP, &op);
if (ret < 0) {
return RTAS_OUT_HW_ERROR;
}
@@ -103,8 +101,7 @@ int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state)
int ret;
op.op = VFIO_EEH_PE_GET_STATE;
- ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
- VFIO_EEH_PE_OP, &op);
+ ret = vfio_container_ioctl(&sphb->iommu_as, VFIO_EEH_PE_OP, &op);
if (ret < 0) {
return RTAS_OUT_PARAM_ERROR;
}
@@ -177,8 +174,7 @@ int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option)
return RTAS_OUT_PARAM_ERROR;
}
- ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
- VFIO_EEH_PE_OP, &op);
+ ret = vfio_container_ioctl(&sphb->iommu_as, VFIO_EEH_PE_OP, &op);
if (ret < 0) {
return RTAS_OUT_HW_ERROR;
}
@@ -192,8 +188,7 @@ int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb)
int ret;
op.op = VFIO_EEH_PE_CONFIGURE;
- ret = vfio_container_ioctl(&sphb->iommu_as, sphb->iommugroupid,
- VFIO_EEH_PE_OP, &op);
+ ret = vfio_container_ioctl(&sphb->iommu_as, VFIO_EEH_PE_OP, &op);
if (ret < 0) {
return RTAS_OUT_PARAM_ERROR;
}
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 85ee9b0..64e0a54 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -926,35 +926,26 @@ void vfio_put_base_device(VFIODevice *vbasedev)
close(vbasedev->fd);
}
-static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid,
+static int vfio_container_do_ioctl(AddressSpace *as,
int req, void *param)
{
- VFIOGroup *group;
VFIOContainer *container;
int ret = -1;
+ VFIOAddressSpace *space = vfio_get_address_space(as);
- group = vfio_get_group(groupid, as);
- if (!group) {
- error_report("vfio: group %d not registered", groupid);
- return ret;
- }
-
- container = group->container;
- if (group->container) {
+ QLIST_FOREACH(container, &space->containers, next) {
ret = ioctl(container->fd, req, param);
if (ret < 0) {
error_report("vfio: failed to ioctl %d to container: ret=%d, %s",
_IOC_NR(req) - VFIO_BASE, ret, strerror(errno));
+ return -errno;
}
}
- vfio_put_group(group);
-
return ret;
}
-int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
- int req, void *param)
+int vfio_container_ioctl(AddressSpace *as, int req, void *param)
{
/* We allow only certain ioctls to the container */
switch (req) {
@@ -968,5 +959,5 @@ int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
return -1;
}
- return vfio_container_do_ioctl(as, groupid, req, param);
+ return vfio_container_do_ioctl(as, req, param);
}
diff --git a/include/hw/vfio/vfio.h b/include/hw/vfio/vfio.h
index 0b26cd8..e076c14 100644
--- a/include/hw/vfio/vfio.h
+++ b/include/hw/vfio/vfio.h
@@ -3,7 +3,6 @@
#include "qemu/typedefs.h"
-extern int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
- int req, void *param);
+extern int vfio_container_ioctl(AddressSpace *as, int req, void *param);
#endif
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [Qemu-devel] [PATCH qemu v11 11/11] spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)
2015-07-15 9:44 [Qemu-devel] [PATCH qemu v11 00/11] spapr: vfio: Enable Dynamic DMA windows (DDW) Alexey Kardashevskiy
` (9 preceding siblings ...)
2015-07-15 9:45 ` [Qemu-devel] [PATCH qemu v11 10/11] spapr_pci_vfio: Enable multiple groups per container Alexey Kardashevskiy
@ 2015-07-15 9:45 ` Alexey Kardashevskiy
10 siblings, 0 replies; 12+ messages in thread
From: Alexey Kardashevskiy @ 2015-07-15 9:45 UTC (permalink / raw)
To: qemu-devel
Cc: Laurent Vivier, Thomas Huth, Michael Roth, Alexey Kardashevskiy,
Alexander Graf, Gavin Shan, Alex Williamson, qemu-ppc,
David Gibson
This adds support for the Dynamic DMA Windows (DDW) option defined by
the SPAPR specification, which allows having additional DMA window(s).
This implements DDW for emulated and VFIO devices. As all TCE root regions
are mapped at 0 and 64bit long (and actual tables are child regions),
this replaces memory_region_add_subregion() with _overlap() to make
QEMU memory API happy.
This reserves RTAS token numbers for DDW calls.
This implements helpers to interact with VFIO kernel interface.
This changes the TCE table migration descriptor to support dynamic
tables as from now on, PHB will create as many stub TCE table objects
as PHB can possibly support but not all of them might be initialized at
the time of migration because DDW might or might not be requested by
the guest.
The "ddw" property is enabled by default on a PHB but for compatibility
the pseries-2.3 machine and older disable it.
This implements DDW for VFIO. The host kernel support is required.
This adds a "levels" property to PHB to control the number of levels
in the actual TCE table allocated by the host kernel, 0 is the default
value to tell QEMU to calculate the correct value. Current hardware
supports up to 5 levels.
Existing Linux guests try to create one additional huge DMA window
with 64K or 16MB pages and map the entire guest RAM into it. If this succeeds,
the guest switches to dma_direct_ops and never calls TCE hypercalls
(H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
and not waste time on map/unmap later. This adds a "dma64_win_addr"
property which is a bus address for the 64bit window and by default
set to 0x800.0000.0000.0000 as this is what the modern POWER8 hardware
uses and this allows having emulated and VFIO devices on the same bus.
This adds 4 RTAS handlers:
* ibm,query-pe-dma-window
* ibm,create-pe-dma-window
* ibm,remove-pe-dma-window
* ibm,reset-pe-dma-window
These are registered from type_init() callback.
These RTAS handlers are implemented in a separate file to avoid polluting
spapr_iommu.c with PCI.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v11:
* added a comment about windows removal in spapr_phb_hotplug_dma_sync
* do not remove windows on unplug - the host kernel takes care of this anyway
* s/unsigned long long/uint64_t/ in trace-events
* added sanity check for window size not being smaller than page size
(i.e. actual window size is not zero)
* fixed compile for non-linux environment (mingw32)
v10:
* added dma64_win_addr property to PHB
* removed redundant check for "!migtable" in spapr_tce_table_post_load()
v9:
* fixed default 64bit window start (from mdroth)
* fixed type cast in dma window update code (from mdroth)
* spapr_phb_dma_update() now can fail and cause hotplug failure if
hardware TCE table cannot be mapped to the same bus address as the emulated one
v7:
* fixed uninitialized variables
v6:
* rework as there is no more special device for VFIO PHB
v5:
* total rework
* enabled for machines >2.3
* fixed migration
* merged rtas handlers here
v4:
* reset handler is back in generalized form
v3:
* removed reset
* windows_num is now 1 or bigger rather than 0-based value and it is only
changed in PHB code, not in RTAS
* added page mask check in create()
* added SPAPR_PCI_DDW_MAX_WINDOWS to track how many windows are already
created
v2:
* tested on hacked emulated E1000
* implemented DDW reset on the PHB reset
* spapr_pci_ddw_remove/spapr_pci_ddw_reset are public for reuse by VFIO
---
hw/ppc/Makefile.objs | 1 +
hw/ppc/spapr.c | 5 +
hw/ppc/spapr_iommu.c | 32 ++++-
hw/ppc/spapr_pci.c | 107 ++++++++++++++--
hw/ppc/spapr_pci_vfio.c | 102 +++++++++++++++
hw/ppc/spapr_rtas_ddw.c | 304 ++++++++++++++++++++++++++++++++++++++++++++
hw/vfio/common.c | 2 +
include/hw/pci-host/spapr.h | 21 ++-
include/hw/ppc/spapr.h | 17 ++-
trace-events | 6 +
10 files changed, 581 insertions(+), 16 deletions(-)
create mode 100644 hw/ppc/spapr_rtas_ddw.c
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index 6c06fcf..475d016 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -4,6 +4,7 @@ obj-y += ppc.o ppc_booke.o
obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_pci_vfio.o spapr_rtc.o spapr_drc.o
+obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
# PowerPC 4xx boards
obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
obj-y += ppc4xx_pci.o
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 4a648af..713a61b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2237,6 +2237,11 @@ static const TypeInfo spapr_machine_info = {
.driver = "spapr-pci-host-bridge",\
.property = "dynamic-reconfiguration",\
.value = "off",\
+ },\
+ {\
+ .driver = TYPE_SPAPR_PCI_HOST_BRIDGE,\
+ .property = "ddw",\
+ .value = stringify(off),\
},
#define SPAPR_COMPAT_2_2 \
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 54ab727..0509e50 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -136,6 +136,15 @@ static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr,
return ret;
}
+static void spapr_tce_table_pre_save(void *opaque)
+{
+ sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
+
+ tcet->migtable = tcet->table;
+}
+
+static void spapr_tce_table_do_enable(sPAPRTCETable *tcet, bool vfio_accel);
+
static int spapr_tce_table_post_load(void *opaque, int version_id)
{
sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
@@ -144,22 +153,39 @@ static int spapr_tce_table_post_load(void *opaque, int version_id)
spapr_vio_set_bypass(tcet->vdev, tcet->bypass);
}
+ if (tcet->enabled) {
+ if (!tcet->table) {
+ tcet->enabled = false;
+ /* VFIO does not migrate so pass vfio_accel == false */
+ spapr_tce_table_do_enable(tcet, false);
+ }
+ memcpy(tcet->table, tcet->migtable,
+ tcet->nb_table * sizeof(tcet->table[0]));
+ free(tcet->migtable);
+ tcet->migtable = NULL;
+ }
+
return 0;
}
static const VMStateDescription vmstate_spapr_tce_table = {
.name = "spapr_iommu",
- .version_id = 2,
+ .version_id = 3,
.minimum_version_id = 2,
+ .pre_save = spapr_tce_table_pre_save,
.post_load = spapr_tce_table_post_load,
.fields = (VMStateField []) {
/* Sanity check */
VMSTATE_UINT32_EQUAL(liobn, sPAPRTCETable),
- VMSTATE_UINT32_EQUAL(nb_table, sPAPRTCETable),
/* IOMMU state */
+ VMSTATE_BOOL_V(enabled, sPAPRTCETable, 3),
+ VMSTATE_UINT64_V(bus_offset, sPAPRTCETable, 3),
+ VMSTATE_UINT32_V(page_shift, sPAPRTCETable, 3),
+ VMSTATE_UINT32(nb_table, sPAPRTCETable),
VMSTATE_BOOL(bypass, sPAPRTCETable),
- VMSTATE_VARRAY_UINT32(table, sPAPRTCETable, nb_table, 0, vmstate_info_uint64, uint64_t),
+ VMSTATE_VARRAY_UINT32_ALLOC(migtable, sPAPRTCETable, nb_table, 0,
+ vmstate_info_uint64, uint64_t),
VMSTATE_END_OF_LIST()
},
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 98d93fa..e5c91c9 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -778,6 +778,9 @@ static int spapr_phb_dma_capabilities_update(sPAPRPHBState *sphb)
{
sphb->dma32_window_start = 0;
sphb->dma32_window_size = SPAPR_PCI_DMA32_SIZE;
+ sphb->windows_supported = SPAPR_PCI_DMA_MAX_WINDOWS;
+ sphb->page_size_mask = (1ULL << 12) | (1ULL << 16) | (1ULL << 24);
+ sphb->dma64_window_size = pow2ceil(ram_size);
if (sphb->vfio_num > 0) {
spapr_phb_vfio_dma_capabilities_update(sphb);
@@ -786,18 +789,40 @@ static int spapr_phb_dma_capabilities_update(sPAPRPHBState *sphb)
return 0;
}
-static int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
- uint32_t liobn, uint32_t page_shift,
- uint64_t window_size)
+int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
+ uint32_t liobn, uint32_t page_shift,
+ uint64_t window_size)
{
uint64_t bus_offset = sphb->dma32_window_start;
sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn);
uint32_t nb_table = window_size >> page_shift;
+ int ret;
if (!nb_table) {
return -1;
}
+ if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) {
+ return -1;
+ }
+
+ if (sphb->ddw_enabled) {
+ if (sphb->vfio_num > 0) {
+ ret = spapr_phb_vfio_dma_init_window(sphb, page_shift, window_size,
+ &bus_offset);
+ if (ret) {
+ return ret;
+ }
+ } else if (SPAPR_PCI_DMA_WINDOW_NUM(liobn)) {
+ /*
+ * There is no VFIO so we choose a huge window address.
+ * If VFIO is added later, spapr_phb_dma_update() will fail
+ * and cause hotplug failure.
+ */
+ bus_offset = sphb->dma64_window_start;
+ }
+ }
+
spapr_tce_table_enable(tcet, bus_offset, page_shift, nb_table,
sphb->vfio_num > 0);
@@ -807,9 +832,14 @@ static int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
int spapr_phb_dma_remove_window(sPAPRPHBState *sphb,
sPAPRTCETable *tcet)
{
+ int ret = 0;
+
+ if ((sphb->vfio_num > 0) && sphb->ddw_enabled) {
+ ret = spapr_phb_vfio_dma_remove_window(sphb, tcet);
+ }
spapr_tce_table_disable(tcet);
- return 0;
+ return ret;
}
int spapr_phb_dma_reset(sPAPRPHBState *sphb)
@@ -836,15 +866,44 @@ static int spapr_phb_hotplug_dma_sync(sPAPRPHBState *sphb)
{
int ret = 0, i;
sPAPRTCETable *tcet;
+ uint64_t bus_offset = 0;
spapr_phb_dma_capabilities_update(sphb);
+ if (sphb->vfio_num > 0) {
+ /*
+ * First vfio-pci device besides in a container with a default 32bit
+ * window. However the PHB might have removed a 32bit window and have
+ * created a 64bit window instead (not in addition) so vfio's window
+ * needs to be removed.
+ */
+ for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+ tcet = spapr_tce_find_by_liobn(SPAPR_PCI_LIOBN(sphb->index, i));
+ if (!tcet) {
+ continue;
+ }
+ spapr_phb_vfio_dma_remove_window(sphb, tcet);
+ }
+ }
+
for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
tcet = spapr_tce_find_by_liobn(SPAPR_PCI_LIOBN(sphb->index, i));
if (!tcet || !tcet->enabled) {
continue;
}
if ((tcet->fd >= 0) && (sphb->vfio_num > 0)) {
+ ret = spapr_phb_vfio_dma_init_window(sphb,
+ tcet->page_shift,
+ (uint64_t)tcet->nb_table <<
+ tcet->page_shift,
+ &bus_offset);
+ if (ret) {
+ break;
+ }
+ if (bus_offset != tcet->bus_offset) {
+ ret = -EFAULT;
+ break;
+ }
/*
* We got first vfio-pci device on accelerated table.
* VFIO acceleration is not possible.
@@ -1153,7 +1212,10 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc,
++phb->vfio_num;
if (vfio_num == 0) {
- spapr_phb_hotplug_dma_sync(phb);
+ if (spapr_phb_hotplug_dma_sync(phb)) {
+ error_setg(errp, "Failed to create DMA window(s)");
+ goto out;
+ }
}
}
}
@@ -1458,15 +1520,17 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
}
}
- tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
- if (!tcet) {
- error_setg(errp, "failed to create TCE table");
+ for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+ tcet = spapr_tce_new_table(DEVICE(sphb),
+ SPAPR_PCI_LIOBN(sphb->index, i));
+ if (!tcet) {
+ error_setg(errp, "spapr_tce_new_table failed");
return;
+ }
+ memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
+ spapr_tce_get_iommu(tcet), 0);
}
- memory_region_add_subregion(&sphb->iommu_root, 0,
- spapr_tce_get_iommu(tcet));
-
sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
}
@@ -1504,8 +1568,12 @@ static Property spapr_phb_properties[] = {
DEFINE_PROP_UINT64("io_win_addr", sPAPRPHBState, io_win_addr, -1),
DEFINE_PROP_UINT64("io_win_size", sPAPRPHBState, io_win_size,
SPAPR_PCI_IO_WIN_SIZE),
+ DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_window_start,
+ SPAPR_PCI_DMA64_START),
DEFINE_PROP_BOOL("dynamic-reconfiguration", sPAPRPHBState, dr_enabled,
true),
+ DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
+ DEFINE_PROP_UINT8("levels", sPAPRPHBState, levels, 0),
DEFINE_PROP_END_OF_LIST(),
};
@@ -1768,6 +1836,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
uint32_t interrupt_map_mask[] = {
cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
+ uint32_t ddw_applicable[] = {
+ cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
+ cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
+ cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
+ };
+ uint32_t ddw_extensions[] = {
+ cpu_to_be32(1),
+ cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
+ };
sPAPRTCETable *tcet;
PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
sPAPRFDT s_fdt;
@@ -1792,6 +1869,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
_FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
_FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS));
+ /* Dynamic DMA window */
+ if (phb->ddw_enabled) {
+ _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
+ sizeof(ddw_applicable)));
+ _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
+ &ddw_extensions, sizeof(ddw_extensions)));
+ }
+
/* Build the interrupt-map, this must matches what is done
* in pci_spapr_map_irq
*/
diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c
index b744c65..c214c32 100644
--- a/hw/ppc/spapr_pci_vfio.c
+++ b/hw/ppc/spapr_pci_vfio.c
@@ -24,6 +24,7 @@
#ifdef CONFIG_LINUX
#include "linux/vfio.h"
+#include "trace.h"
static Property spapr_phb_vfio_properties[] = {
DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
@@ -44,6 +45,93 @@ int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb)
sphb->dma32_window_start = info.dma32_window_start;
sphb->dma32_window_size = info.dma32_window_size;
+ if (sphb->ddw_enabled && (info.flags & VFIO_IOMMU_SPAPR_INFO_DDW)) {
+ sphb->windows_supported = info.ddw.max_dynamic_windows_supported;
+ sphb->page_size_mask = info.ddw.pgsizes;
+ sphb->dma64_window_size = pow2ceil(ram_size);
+ sphb->max_levels = info.ddw.levels;
+ } else {
+ /* If VFIO_IOMMU_INFO_DDW is not set, disable DDW */
+ sphb->ddw_enabled = false;
+ }
+
+ return ret;
+}
+
+static int spapr_phb_vfio_levels(uint32_t entries)
+{
+ unsigned pages = (entries * sizeof(uint64_t)) / getpagesize();
+ int levels;
+
+ if (pages <= 64) {
+ levels = 1;
+ } else if (pages <= 64*64) {
+ levels = 2;
+ } else if (pages <= 64*64*64) {
+ levels = 3;
+ } else {
+ levels = 4;
+ }
+
+ return levels;
+}
+
+int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb,
+ uint32_t page_shift,
+ uint64_t window_size,
+ uint64_t *bus_offset)
+{
+ int ret;
+ struct vfio_iommu_spapr_tce_create create = {
+ .argsz = sizeof(create),
+ .page_shift = page_shift,
+ .window_size = window_size,
+ .levels = sphb->levels,
+ .start_addr = 0,
+ };
+
+ /*
+ * Dynamic windows are supported, that means that there is no
+ * pre-created window and we have to create one.
+ */
+ if (!create.levels) {
+ create.levels = spapr_phb_vfio_levels(create.window_size >>
+ page_shift);
+ }
+
+ if (create.levels > sphb->max_levels) {
+ return -EINVAL;
+ }
+
+ ret = vfio_container_ioctl(&sphb->iommu_as,
+ VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ if (ret) {
+ return ret;
+ }
+ *bus_offset = create.start_addr;
+
+ trace_spapr_pci_vfio_init_window(page_shift, window_size, *bus_offset);
+
+ return 0;
+}
+
+int spapr_phb_vfio_dma_remove_window(sPAPRPHBState *sphb,
+ sPAPRTCETable *tcet)
+{
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove),
+ .start_addr = tcet->bus_offset
+ };
+ int ret;
+
+ ret = vfio_container_ioctl(&sphb->iommu_as,
+ VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret) {
+ return ret;
+ }
+
+ trace_spapr_pci_vfio_remove_window(tcet->bus_offset);
+
return ret;
}
@@ -246,6 +334,20 @@ int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb)
return RTAS_OUT_HW_ERROR;
}
+int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb,
+ uint32_t page_shift,
+ uint64_t window_size,
+ uint64_t *bus_offset)
+{
+ return -1;
+}
+
+int spapr_phb_vfio_dma_remove_window(sPAPRPHBState *sphb,
+ sPAPRTCETable *tcet)
+{
+ return -1;
+}
+
void spapr_phb_vfio_eeh_reenable(sPAPRPHBState *sphb)
{
}
diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
new file mode 100644
index 0000000..0fba9ac
--- /dev/null
+++ b/hw/ppc/spapr_rtas_ddw.c
@@ -0,0 +1,304 @@
+/*
+ * QEMU sPAPR Dynamic DMA windows support
+ *
+ * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/error-report.h"
+#include "hw/ppc/spapr.h"
+#include "hw/pci-host/spapr.h"
+#include "trace.h"
+
+/*
+ * object_child_foreach() callback: increments the unsigned counter behind
+ * @opaque when @child is an enabled sPAPRTCETable.  Always returns 0.
+ */
+static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
+{
+    unsigned *count = opaque;
+    sPAPRTCETable *tcet =
+        (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+
+    if (!tcet || !tcet->enabled) {
+        return 0;
+    }
+
+    (*count)++;
+    return 0;
+}
+
+/*
+ * Walk the children of @sphb with spapr_phb_get_active_win_num_cb() and
+ * return the resulting count, starting from 0.
+ */
+static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
+{
+    unsigned active = 0;
+
+    object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb,
+                         &active);
+    return active;
+}
+
+/*
+ * object_child_foreach() callback: when @child is a sPAPRTCETable that is
+ * not enabled, store its LIOBN into the uint32_t behind @opaque and return
+ * 1; return 0 for any other child.
+ */
+static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
+{
+    uint32_t *liobn = opaque;
+    sPAPRTCETable *tcet =
+        (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+
+    if (!tcet || tcet->enabled) {
+        return 0;
+    }
+
+    *liobn = tcet->liobn;
+    return 1;
+}
+
+/*
+ * Walk the children of @sphb with spapr_phb_get_free_liobn_cb() and return
+ * the LIOBN it stored, or 0 when the callback stored nothing.
+ */
+static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
+{
+    uint32_t free_liobn = 0;
+
+    object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb,
+                         &free_liobn);
+    return free_liobn;
+}
+
+/*
+ * Build a mask of RTAS_DDW_PGSIZE_* bits.  A bit is set when its page shift
+ * is set in @page_mask and also matches the page_shift of one of the first
+ * PPC_PAGE_SIZES_MAX_SZ entries of @sps.
+ */
+static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps,
+                                 uint64_t page_mask)
+{
+    const struct { int shift; uint32_t mask; } masks[] = {
+        { 12, RTAS_DDW_PGSIZE_4K },
+        { 16, RTAS_DDW_PGSIZE_64K },
+        { 24, RTAS_DDW_PGSIZE_16M },
+        { 25, RTAS_DDW_PGSIZE_32M },
+        { 26, RTAS_DDW_PGSIZE_64M },
+        { 27, RTAS_DDW_PGSIZE_128M },
+        { 28, RTAS_DDW_PGSIZE_256M },
+        { 34, RTAS_DDW_PGSIZE_16G },
+    };
+    uint32_t result = 0;
+    int m, s;
+
+    for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+        /* Skip sizes that @page_mask does not allow. */
+        if (!(page_mask & (1ULL << masks[m].shift))) {
+            continue;
+        }
+
+        /* One matching @sps entry is enough to set the bit. */
+        for (s = 0; s < PPC_PAGE_SIZES_MAX_SZ; s++) {
+            if (sps[s].page_shift == masks[m].shift) {
+                result |= masks[m].mask;
+                break;
+            }
+        }
+    }
+
+    return result;
+}
+
+/*
+ * RTAS ibm,query-pe-dma-window handler.
+ *
+ * Arguments: config address, BUID high word, BUID low word.
+ * Returns: status, number of windows still available, sphb->dma64_window_size
+ * shifted right by SPAPR_TCE_PAGE_SHIFT, supported page size mask, and a
+ * zero migration mask.
+ *
+ * Status is RTAS_OUT_PARAM_ERROR when the argument/return counts are not
+ * 3/5, the PHB is not found, or the PHB has DDW disabled.
+ */
+static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
+                                         sPAPRMachineState *spapr,
+                                         uint32_t token, uint32_t nargs,
+                                         target_ulong args,
+                                         uint32_t nret, target_ulong rets)
+{
+    sPAPRPHBState *sphb;
+    uint64_t buid;
+    uint32_t addr, pgmask, avail = 0;
+    unsigned active;
+
+    if ((nargs != 3) || (nret != 5)) {
+        goto param_error_exit;
+    }
+
+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    addr = rtas_ld(args, 0);
+    sphb = spapr_pci_find_phb(spapr, buid);
+    if (!(sphb && sphb->ddw_enabled)) {
+        goto param_error_exit;
+    }
+
+    /* Windows left = supported minus active, never below zero. */
+    active = spapr_phb_get_active_win_num(sphb);
+    if (sphb->windows_supported > active) {
+        avail = sphb->windows_supported - active;
+    }
+
+    /* Work out supported page masks */
+    pgmask = spapr_query_mask(cpu->env.sps.sps, sphb->page_size_mask);
+
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    rtas_st(rets, 1, avail);
+
+    /*
+     * This is "Largest contiguous block of TCEs allocated specifically
+     * for (that is, are reserved for) this PE".
+     * Return the maximum number, as if all RAM were mapped in 4K pages.
+     */
+    rtas_st(rets, 2, sphb->dma64_window_size >> SPAPR_TCE_PAGE_SHIFT);
+    rtas_st(rets, 3, pgmask);
+    rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
+
+    trace_spapr_iommu_ddw_query(buid, addr, avail, sphb->dma64_window_size,
+                                pgmask);
+    return;
+
+param_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+/*
+ * RTAS ibm,create-pe-dma-window handler.
+ *
+ * Arguments: config address, BUID high word, BUID low word, page shift,
+ * window shift.
+ * Returns: status, LIOBN of the new window, bus offset high word, bus
+ * offset low word.
+ *
+ * Status is RTAS_OUT_PARAM_ERROR when the argument/return counts are not
+ * 5/4, the PHB is not found or has DDW disabled, either shift is 64 or
+ * more, or the window is smaller than one page.  It is RTAS_OUT_HW_ERROR
+ * when no free LIOBN is found, the page size is not in
+ * sphb->page_size_mask, or creating the window fails.
+ */
+static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
+                                          sPAPRMachineState *spapr,
+                                          uint32_t token, uint32_t nargs,
+                                          target_ulong args,
+                                          uint32_t nret, target_ulong rets)
+{
+    sPAPRPHBState *sphb;
+    sPAPRTCETable *tcet = NULL;
+    uint32_t addr, page_shift, window_shift, liobn;
+    uint64_t buid;
+    long ret;
+
+    if ((nargs != 5) || (nret != 4)) {
+        goto param_error_exit;
+    }
+
+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    addr = rtas_ld(args, 0);
+    sphb = spapr_pci_find_phb(spapr, buid);
+    if (!sphb || !sphb->ddw_enabled) {
+        goto param_error_exit;
+    }
+
+    page_shift = rtas_ld(args, 3);
+    window_shift = rtas_ld(args, 4);
+    liobn = spapr_phb_get_free_liobn(sphb);
+
+    /*
+     * Both shifts come straight from the guest and are used below as shift
+     * counts on 64-bit values; a count of 64 or more is undefined behaviour
+     * in C, so refuse it before any shift is evaluated.
+     */
+    if ((page_shift >= 64) || (window_shift >= 64)) {
+        goto param_error_exit;
+    }
+
+    if (!liobn || !(sphb->page_size_mask & (1ULL << page_shift))) {
+        goto hw_error_exit;
+    }
+
+    if (window_shift < page_shift) {
+        goto param_error_exit;
+    }
+
+    ret = spapr_phb_dma_init_window(sphb, liobn, page_shift,
+                                    1ULL << window_shift);
+    tcet = spapr_tce_find_by_liobn(liobn);
+    /* 0xbaadf00d is only a trace placeholder for "no table found". */
+    trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift,
+                                 1ULL << window_shift,
+                                 tcet ? tcet->bus_offset : 0xbaadf00d,
+                                 liobn, ret);
+    if (ret || !tcet) {
+        goto hw_error_exit;
+    }
+
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    rtas_st(rets, 1, liobn);
+    rtas_st(rets, 2, tcet->bus_offset >> 32);
+    rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1));
+
+    return;
+
+hw_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+    return;
+
+param_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+/*
+ * RTAS ibm,remove-pe-dma-window handler.
+ *
+ * Arguments: LIOBN of the window to remove.
+ * Returns: status only.
+ *
+ * Status is RTAS_OUT_PARAM_ERROR when the argument/return counts are not
+ * 1/1, no TCE table has that LIOBN, the table's parent is not a PHB, or the
+ * PHB has DDW disabled.  It is RTAS_OUT_HW_ERROR when removing the window
+ * fails.
+ */
+static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu,
+                                          sPAPRMachineState *spapr,
+                                          uint32_t token, uint32_t nargs,
+                                          target_ulong args,
+                                          uint32_t nret, target_ulong rets)
+{
+    sPAPRPHBState *sphb;
+    sPAPRTCETable *tcet;
+    uint32_t liobn;
+    long ret;
+
+    if ((nargs != 1) || (nret != 1)) {
+        goto param_error_exit;
+    }
+
+    liobn = rtas_ld(args, 0);
+    tcet = spapr_tce_find_by_liobn(liobn);
+    if (!tcet) {
+        goto param_error_exit;
+    }
+
+    /*
+     * The LIOBN is guest-supplied, so the table it names is not known to
+     * hang off a PHB.  Use the checked dynamic cast, which yields NULL on a
+     * type mismatch, rather than the asserting SPAPR_PCI_HOST_BRIDGE()
+     * macro, so that a foreign LIOBN ends in a parameter error.
+     */
+    sphb = (sPAPRPHBState *) object_dynamic_cast(OBJECT(tcet)->parent,
+                                                 TYPE_SPAPR_PCI_HOST_BRIDGE);
+    if (!sphb || !sphb->ddw_enabled) {
+        goto param_error_exit;
+    }
+
+    ret = spapr_phb_dma_remove_window(sphb, tcet);
+    trace_spapr_iommu_ddw_remove(liobn, ret);
+    if (ret) {
+        goto hw_error_exit;
+    }
+
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    return;
+
+hw_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+    return;
+
+param_error_exit:
+    rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+/*
+ * RTAS ibm,reset-pe-dma-window handler.
+ *
+ * Arguments: config address, BUID high word, BUID low word.
+ * Returns: status only.
+ *
+ * Status is RTAS_OUT_PARAM_ERROR when the argument/return counts are not
+ * 3/1, the PHB is not found, or the PHB has DDW disabled.  It is
+ * RTAS_OUT_HW_ERROR when spapr_phb_dma_reset() returns non-zero.
+ */
+static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu,
+                                         sPAPRMachineState *spapr,
+                                         uint32_t token, uint32_t nargs,
+                                         target_ulong args,
+                                         uint32_t nret, target_ulong rets)
+{
+    sPAPRPHBState *sphb;
+    uint64_t buid;
+    uint32_t addr;
+    long ret;
+    int status = RTAS_OUT_PARAM_ERROR;
+
+    if ((nargs != 3) || (nret != 1)) {
+        goto out;
+    }
+
+    buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+    addr = rtas_ld(args, 0);
+    sphb = spapr_pci_find_phb(spapr, buid);
+    if (!(sphb && sphb->ddw_enabled)) {
+        goto out;
+    }
+
+    ret = spapr_phb_dma_reset(sphb);
+    trace_spapr_iommu_ddw_reset(buid, addr, ret);
+    status = ret ? RTAS_OUT_HW_ERROR : RTAS_OUT_SUCCESS;
+
+out:
+    /* Single exit: every path stores exactly one status word. */
+    rtas_st(rets, 0, status);
+}
+
+static void spapr_rtas_ddw_init(void)
+{ /* Registers the query/create/remove/reset DDW handlers, each under its RTAS token and "ibm,*-pe-dma-window" name. */
+ spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW,
+ "ibm,query-pe-dma-window",
+ rtas_ibm_query_pe_dma_window);
+ spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW,
+ "ibm,create-pe-dma-window",
+ rtas_ibm_create_pe_dma_window);
+ spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW,
+ "ibm,remove-pe-dma-window",
+ rtas_ibm_remove_pe_dma_window);
+ spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW,
+ "ibm,reset-pe-dma-window",
+ rtas_ibm_reset_pe_dma_window);
+}
+
+type_init(spapr_rtas_ddw_init)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 64e0a54..f8ac42f 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -952,6 +952,8 @@ int vfio_container_ioctl(AddressSpace *as, int req, void *param)
case VFIO_CHECK_EXTENSION:
case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
case VFIO_EEH_PE_OP:
+ case VFIO_IOMMU_SPAPR_TCE_CREATE:
+ case VFIO_IOMMU_SPAPR_TCE_REMOVE:
break;
default:
/* Return an error on unknown requests */
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 02708d9..859a85e 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -89,6 +89,13 @@ struct sPAPRPHBState {
uint32_t dma32_window_size;
unsigned vfio_num;
int32_t iommugroupid; /* obsolete */
+ bool ddw_enabled;
+ uint32_t windows_supported;
+ uint64_t page_size_mask;
+ uint64_t dma64_window_start;
+ uint64_t dma64_window_size;
+ uint8_t max_levels;
+ uint8_t levels;
QLIST_ENTRY(sPAPRPHBState) list;
};
@@ -111,7 +118,10 @@ struct sPAPRPHBState {
#define SPAPR_PCI_DMA32_SIZE 0x40000000
-#define SPAPR_PCI_DMA_MAX_WINDOWS 1
+#define SPAPR_PCI_DMA_MAX_WINDOWS 2
+
+/* Default 64bit dynamic window offset */
+#define SPAPR_PCI_DMA64_START 0x800000000000000ULL
static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin)
{
@@ -133,11 +143,20 @@ void spapr_pci_rtas_init(void);
sPAPRPHBState *spapr_pci_find_phb(sPAPRMachineState *spapr, uint64_t buid);
PCIDevice *spapr_pci_find_dev(sPAPRMachineState *spapr, uint64_t buid,
uint32_t config_addr);
+int spapr_phb_dma_init_window(sPAPRPHBState *sphb,
+ uint32_t liobn, uint32_t page_shift,
+ uint64_t window_size);
int spapr_phb_dma_remove_window(sPAPRPHBState *sphb,
sPAPRTCETable *tcet);
int spapr_phb_dma_reset(sPAPRPHBState *sphb);
int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb);
+int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb,
+ uint32_t page_shift,
+ uint64_t window_size,
+ uint64_t *bus_offset);
+int spapr_phb_vfio_dma_remove_window(sPAPRPHBState *sphb,
+ sPAPRTCETable *tcet);
int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb,
PCIDevice *pdev, int option);
int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state);
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 8553592..8ff5c1e 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -419,6 +419,16 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
#define RTAS_OUT_NOT_SUPPORTED -3
#define RTAS_OUT_NOT_AUTHORIZED -9002
+/* DDW pagesize mask values from ibm,query-pe-dma-window */
+#define RTAS_DDW_PGSIZE_4K 0x01
+#define RTAS_DDW_PGSIZE_64K 0x02
+#define RTAS_DDW_PGSIZE_16M 0x04
+#define RTAS_DDW_PGSIZE_32M 0x08
+#define RTAS_DDW_PGSIZE_64M 0x10
+#define RTAS_DDW_PGSIZE_128M 0x20
+#define RTAS_DDW_PGSIZE_256M 0x40
+#define RTAS_DDW_PGSIZE_16G 0x80
+
/* RTAS tokens */
#define RTAS_TOKEN_BASE 0x2000
@@ -460,8 +470,12 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
#define RTAS_IBM_SET_SLOT_RESET (RTAS_TOKEN_BASE + 0x23)
#define RTAS_IBM_CONFIGURE_PE (RTAS_TOKEN_BASE + 0x24)
#define RTAS_IBM_SLOT_ERROR_DETAIL (RTAS_TOKEN_BASE + 0x25)
+#define RTAS_IBM_QUERY_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x26)
+#define RTAS_IBM_CREATE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x27)
+#define RTAS_IBM_REMOVE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x28)
+#define RTAS_IBM_RESET_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x29)
-#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x26)
+#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x2A)
/* RTAS ibm,get-system-parameter token values */
#define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS 20
@@ -561,6 +575,7 @@ struct sPAPRTCETable {
uint64_t bus_offset;
uint32_t page_shift;
uint64_t *table;
+ uint64_t *migtable;
bool bypass;
int fd;
MemoryRegion root, iommu;
diff --git a/trace-events b/trace-events
index 9d133d1..c3408bf 100644
--- a/trace-events
+++ b/trace-events
@@ -1302,6 +1302,8 @@ spapr_pci_lsi_set(const char *busname, int pin, uint32_t irq) "%s PIN%d IRQ %u"
spapr_pci_msi_retry(unsigned config_addr, unsigned req_num, unsigned max_irqs) "Guest device at %x asked %u, have only %u"
spapr_pci_dma_update(uint64_t liobn, long ret) "liobn=%"PRIx64" ret=%ld"
spapr_pci_dma_realloc_update(uint64_t liobn, long ret) "liobn=%"PRIx64" tcet=%ld"
+spapr_pci_vfio_init_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
+spapr_pci_vfio_remove_window(uint64_t off) "offset=%"PRIx64
# hw/pci/pci.c
pci_update_mappings_del(void *d, uint32_t bus, uint32_t func, uint32_t slot, int bar, uint64_t addr, uint64_t size) "d=%p %02x:%02x.%x %d,%#"PRIx64"+%#"PRIx64
@@ -1365,6 +1367,10 @@ spapr_iommu_pci_indirect(uint64_t liobn, uint64_t ioba, uint64_t tce, uint64_t i
spapr_iommu_pci_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce_value, uint64_t npages, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" tcevalue=0x%"PRIx64" npages=%"PRId64" ret=%"PRId64
spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
spapr_iommu_alloc_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d"
+spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows available, max window size=%"PRIx64", mask=%"PRIx32
+spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, uint64_t pg_size, uint64_t req_size, uint64_t start, uint32_t liobn, long ret) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%"PRIx64", requested=0x%"PRIx64", start addr=%"PRIx64", liobn=%"PRIx32", ret = %ld"
+spapr_iommu_ddw_remove(uint32_t liobn, long ret) "liobn=%"PRIx32", ret = %ld"
+spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr, long ret) "buid=%"PRIx64" addr=%"PRIx32", ret = %ld"
# hw/ppc/ppc.c
ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
--
2.4.0.rc3.8.gfb3e7d5
^ permalink raw reply related [flat|nested] 12+ messages in thread