All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
To: Alex Williamson <alex.williamson@redhat.com>
Cc: Zhengxiao.zx@alibaba-inc.com, kevin.tian@intel.com,
	yi.l.liu@intel.com, cjia@nvidia.com, eskultet@redhat.com,
	ziye.yang@intel.com, cohuck@redhat.com,
	shuangtai.tst@alibaba-inc.com, qemu-devel@nongnu.org,
	zhi.a.wang@intel.com, mlevitsk@redhat.com, pasic@linux.ibm.com,
	aik@ozlabs.ru, Kirti Wankhede <kwankhede@nvidia.com>,
	eauger@redhat.com, felipe@nutanix.com,
	jonathan.davies@nutanix.com, yan.y.zhao@intel.com,
	changpeng.liu@intel.com, Ken.Xue@amd.com
Subject: Re: [PATCH v16 QEMU 14/16] vfio: Add vfio_listener_log_sync to mark dirty pages
Date: Wed, 1 Apr 2020 20:08:08 +0100	[thread overview]
Message-ID: <20200401190808.GK52559@work-vm> (raw)
In-Reply-To: <20200326134650.0c7245e9@x1.home>

* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Wed, 25 Mar 2020 02:39:12 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > vfio_listener_log_sync gets list of dirty pages from container using
> > VFIO_IOMMU_GET_DIRTY_BITMAP ioctl and mark those pages dirty when all
> > devices are stopped and saving state.
> > Return early for the RAM block section of mapped MMIO region.
> > 
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > ---
> >  hw/vfio/common.c     | 200 +++++++++++++++++++++++++++++++++++++++++++++++++--
> >  hw/vfio/trace-events |   1 +
> >  2 files changed, 196 insertions(+), 5 deletions(-)
> > 
> > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > index 4a2f0d6a2233..6d41e1ac5c2f 100644
> > --- a/hw/vfio/common.c
> > +++ b/hw/vfio/common.c
> > @@ -29,6 +29,7 @@
> >  #include "hw/vfio/vfio.h"
> >  #include "exec/address-spaces.h"
> >  #include "exec/memory.h"
> > +#include "exec/ram_addr.h"
> >  #include "hw/hw.h"
> >  #include "qemu/error-report.h"
> >  #include "qemu/main-loop.h"
> > @@ -38,6 +39,7 @@
> >  #include "sysemu/reset.h"
> >  #include "trace.h"
> >  #include "qapi/error.h"
> > +#include "migration/migration.h"
> >  
> >  VFIOGroupList vfio_group_list =
> >      QLIST_HEAD_INITIALIZER(vfio_group_list);
> > @@ -288,6 +290,28 @@ const MemoryRegionOps vfio_region_ops = {
> >  };
> >  
> >  /*
> > + * Device state interfaces
> > + */
> > +
> > +static bool vfio_devices_are_stopped_and_saving(void)
> > +{
> > +    VFIOGroup *group;
> > +    VFIODevice *vbasedev;
> > +
> > +    QLIST_FOREACH(group, &vfio_group_list, next) {
> > +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> > +            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
> > +                !(vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
> > +                continue;
> > +            } else {
> > +                return false;
> > +            }
> > +        }
> > +    }
> > +    return true;
> > +}
> > +
> > +/*
> >   * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> >   */
> >  static int vfio_dma_unmap(VFIOContainer *container,
> > @@ -408,8 +432,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
> >  }
> >  
> >  /* Called with rcu_read_lock held.  */
> > -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
> > -                           bool *read_only)
> > +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> > +                               ram_addr_t *ram_addr, bool *read_only)
> >  {
> >      MemoryRegion *mr;
> >      hwaddr xlat;
> > @@ -440,9 +464,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
> >          return false;
> >      }
> >  
> > -    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> > -    *read_only = !writable || mr->readonly;
> > +    if (vaddr) {
> > +        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> > +    }
> >  
> > +    if (ram_addr) {
> > +        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
> > +    }
> > +
> > +    if (read_only) {
> > +        *read_only = !writable || mr->readonly;
> > +    }
> >      return true;
> >  }
> >  
> > @@ -467,7 +499,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> >      rcu_read_lock();
> >  
> >      if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> > -        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
> > +        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
> >              goto out;
> >          }
> >          /*
> > @@ -813,9 +845,167 @@ static void vfio_listener_region_del(MemoryListener *listener,
> >      }
> >  }
> >  
> > +static int vfio_get_dirty_bitmap(MemoryListener *listener,
> > +                                 MemoryRegionSection *section)
> > +{
> > +    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
> > +    VFIOGuestIOMMU *giommu;
> > +    IOMMUTLBEntry iotlb;
> > +    hwaddr granularity, address_limit, iova;
> > +    int ret;
> > +
> > +    if (memory_region_is_iommu(section->mr)) {
> > +        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
> > +            if (MEMORY_REGION(giommu->iommu) == section->mr &&
> > +                giommu->n.start == section->offset_within_region) {
> > +                break;
> > +            }
> > +        }
> > +
> > +        if (!giommu) {
> > +            return -EINVAL;
> > +        }
> > +    }
> > +
> > +    if (memory_region_is_iommu(section->mr)) {
> > +        granularity = memory_region_iommu_get_min_page_size(giommu->iommu);
> > +
> > +        address_limit = MIN(int128_get64(section->size),
> > +                            memory_region_iommu_get_address_limit(giommu->iommu,
> > +                                                 int128_get64(section->size)));
> > +    } else {
> > +        granularity = memory_region_size(section->mr);
> > +        address_limit = int128_get64(section->size);
> > +    }
> > +
> > +    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> > +
> > +    RCU_READ_LOCK_GUARD();
> > +
> > +    while (iova < address_limit) {
> > +        struct vfio_iommu_type1_dirty_bitmap *dbitmap;
> > +        struct vfio_iommu_type1_dirty_bitmap_get *range;
> > +        ram_addr_t start, pages;
> > +        uint64_t iova_xlat, size;
> > +
> > +        if (memory_region_is_iommu(section->mr)) {
> > +            iotlb = address_space_get_iotlb_entry(container->space->as, iova,
> > +                                                 true, MEMTXATTRS_UNSPECIFIED);
> > +            if ((iotlb.target_as == NULL) || (iotlb.addr_mask == 0)) {
> > +                if ((iova + granularity) < iova) {
> > +                    break;
> > +                }
> > +                iova += granularity;
> > +                continue;
> > +            }
> > +            iova_xlat = iotlb.iova + giommu->iommu_offset;
> > +            size = iotlb.addr_mask + 1;
> > +        } else {
> > +            iova_xlat = iova;
> > +            size = address_limit;
> > +        }
> > +
> > +        dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
> > +        if (!dbitmap) {
> 
> AIUI, QEMU aborts if this fails, so no need to check.  Because reasons.

It does if you use g_malloc0; however if the data is large you can
use g_try_malloc0 and then it will return NULL and you can fail the
migration rather than nuking the VM.
(We often argue whether it's worth testing or not, I think generally 
if the size is 'large' or user defined then use try, if it's small
then assume it works.  We've never defined small or large; but somewhere
around a few pages is about right.

Dave

> > +            return -ENOMEM;
> > +        }
> > +
> > +        dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
> > +        dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
> > +        range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
> > +        range->iova = iova_xlat;
> > +        range->size = size;
> > +
> > +        /*
> > +         * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
> > +         * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to
> > +         * TARGET_PAGE_SIZE.
> > +         */
> > +        range->bitmap.pgsize = TARGET_PAGE_SIZE;
> > +
> > +        /*
> > +         * Comment from kvm_physical_sync_dirty_bitmap() since same applies here
> > +         * XXX bad kernel interface alert
> > +         * For dirty bitmap, kernel allocates array of size aligned to
> > +         * bits-per-long.  But for case when the kernel is 64bits and
> > +         * the userspace is 32bits, userspace can't align to the same
> > +         * bits-per-long, since sizeof(long) is different between kernel
> > +         * and user space.  This way, userspace will provide buffer which
> > +         * may be 4 bytes less than the kernel will use, resulting in
> > +         * userspace memory corruption (which is not detectable by valgrind
> > +         * too, in most cases).
> > +         * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
> > +         * a hope that sizeof(long) won't become >8 any time soon.
> > +         */
> 
> This seems like the problem we've avoided by defining our bitmap as an
> array of u64s rather than an array of longs.  Does this comment really
> still apply?
> 
> > +
> > +        pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS;
> > +        range->bitmap.size = ROUND_UP(pages, 64) / 8;
> 
> ROUND_UP(npages/8, sizeof(u64))?
> 
> > +        range->bitmap.data = g_malloc0(range->bitmap.size);
> 
> We don't require this to be pre-zero'd currently.
> 
> > +        if (range->bitmap.data == NULL) {
> 
> Same as above.  Seems strange to me too.
> 
> > +            error_report("Error allocating bitmap of size 0x%llx",
> > +                         range->bitmap.size);
> > +            ret = -ENOMEM;
> > +            goto err_out;
> > +        }
> > +
> > +        ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
> > +        if (ret) {
> > +            error_report("Failed to get dirty bitmap for iova: 0x%llx "
> > +                         "size: 0x%llx err: %d",
> > +                         range->iova, range->size, errno);
> > +            goto err_out;
> > +        }
> > +
> > +        if (memory_region_is_iommu(section->mr)) {
> > +            if (!vfio_get_xlat_addr(&iotlb, NULL, &start, NULL)) {
> > +                ret = -EINVAL;
> > +                goto err_out;
> > +            }
> > +        } else {
> > +            start = memory_region_get_ram_addr(section->mr) +
> > +                    section->offset_within_region + iova -
> > +                    TARGET_PAGE_ALIGN(section->offset_within_address_space);
> > +        }
> > +
> > +        cpu_physical_memory_set_dirty_lebitmap((uint64_t *)range->bitmap.data,
> > +                                               start, pages);
> > +
> > +        trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
> > +                                    range->bitmap.size, start);
> > +err_out:
> > +        g_free(range->bitmap.data);
> > +        g_free(dbitmap);
> > +
> > +        if (ret) {
> > +            return ret;
> > +        }
> > +
> > +        if ((iova + size) < iova) {
> > +            break;
> > +        }
> > +
> > +        iova += size;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static void vfio_listerner_log_sync(MemoryListener *listener,
> > +        MemoryRegionSection *section)
> > +{
> > +    if (vfio_listener_skipped_section(section)) {
> > +        return;
> > +    }
> > +
> > +    if (vfio_devices_are_stopped_and_saving()) {
> > +        vfio_get_dirty_bitmap(listener, section);
> > +    }
> > +}
> > +
> >  static const MemoryListener vfio_memory_listener = {
> >      .region_add = vfio_listener_region_add,
> >      .region_del = vfio_listener_region_del,
> > +    .log_sync = vfio_listerner_log_sync,
> >  };
> >  
> >  static void vfio_listener_release(VFIOContainer *container)
> > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> > index ac065b559f4e..bc8f35ee9356 100644
> > --- a/hw/vfio/trace-events
> > +++ b/hw/vfio/trace-events
> > @@ -160,3 +160,4 @@ vfio_save_complete_precopy(char *name) " (%s)"
> >  vfio_load_device_config_state(char *name) " (%s)"
> >  vfio_load_state(char *name, uint64_t data) " (%s) data 0x%"PRIx64
> >  vfio_load_state_device_data(char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
> > +vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK



  reply	other threads:[~2020-04-01 19:09 UTC|newest]

Thread overview: 74+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-24 21:08 [PATCH v16 QEMU 00/16] Add migration support for VFIO devices Kirti Wankhede
2020-03-24 21:08 ` [PATCH v16 QEMU 01/16] vfio: KABI for migration interface - Kernel header placeholder Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 02/16] vfio: Add function to unmap VFIO region Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 03/16] vfio: Add vfio_get_object callback to VFIODeviceOps Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices Kirti Wankhede
2020-03-25 19:56   ` Alex Williamson
2020-03-26 17:29     ` Dr. David Alan Gilbert
2020-03-26 17:38       ` Alex Williamson
2020-05-04 23:18     ` Kirti Wankhede
2020-05-05  4:37       ` Alex Williamson
2020-05-06  6:11         ` Yan Zhao
2020-05-06 19:48           ` Kirti Wankhede
2020-05-06 20:03             ` Alex Williamson
2020-05-07  5:40               ` Kirti Wankhede
2020-05-07 18:14                 ` Alex Williamson
2020-03-26 17:46   ` Dr. David Alan Gilbert
2020-05-04 23:19     ` Kirti Wankhede
2020-04-07  4:10   ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
2020-05-04 23:21     ` Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 05/16] vfio: Add migration region initialization and finalize function Kirti Wankhede
2020-03-26 17:52   ` Dr. David Alan Gilbert
2020-05-04 23:19     ` Kirti Wankhede
2020-05-19 19:32       ` Dr. David Alan Gilbert
2020-03-24 21:09 ` [PATCH v16 QEMU 06/16] vfio: Add VM state change handler to know state of VM Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 07/16] vfio: Add migration state change notifier Kirti Wankhede
2020-04-01 11:27   ` Dr. David Alan Gilbert
2020-05-04 23:20     ` Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 08/16] vfio: Register SaveVMHandlers for VFIO device Kirti Wankhede
2020-03-25 21:02   ` Alex Williamson
2020-05-04 23:19     ` Kirti Wankhede
2020-05-05  4:37       ` Alex Williamson
2020-05-06  6:38         ` Yan Zhao
2020-05-06  9:58           ` Cornelia Huck
2020-05-06 16:53             ` Dr. David Alan Gilbert
2020-05-06 19:30               ` Kirti Wankhede
2020-05-07  6:37                 ` Cornelia Huck
2020-05-07 20:29                 ` Alex Williamson
2020-04-01 17:36   ` Dr. David Alan Gilbert
2020-05-04 23:20     ` Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 09/16] vfio: Add save state functions to SaveVMHandlers Kirti Wankhede
2020-03-25 22:03   ` Alex Williamson
2020-05-04 23:18     ` Kirti Wankhede
2020-05-05  4:37       ` Alex Williamson
2020-05-11  9:53         ` Kirti Wankhede
2020-05-11 15:59           ` Alex Williamson
2020-05-12  2:06           ` Yan Zhao
2020-05-09  5:31   ` Yan Zhao
2020-05-11 10:22     ` Kirti Wankhede
2020-05-12  0:50       ` Yan Zhao
2020-03-24 21:09 ` [PATCH v16 QEMU 10/16] vfio: Add load " Kirti Wankhede
2020-03-25 22:36   ` Alex Williamson
2020-04-01 18:58   ` Dr. David Alan Gilbert
2020-05-04 23:20     ` Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 11/16] iommu: add callback to get address limit IOMMU supports Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 12/16] memory: Set DIRTY_MEMORY_MIGRATION when IOMMU is enabled Kirti Wankhede
2020-04-01 19:00   ` Dr. David Alan Gilbert
2020-04-01 19:42     ` Alex Williamson
2020-03-24 21:09 ` [PATCH v16 QEMU 13/16] vfio: Add function to start and stop dirty pages tracking Kirti Wankhede
2020-03-26 19:10   ` Alex Williamson
2020-05-04 23:20     ` Kirti Wankhede
2020-04-01 19:03   ` Dr. David Alan Gilbert
2020-05-04 23:21     ` Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 14/16] vfio: Add vfio_listener_log_sync to mark dirty pages Kirti Wankhede
2020-03-25  2:19   ` Yan Zhao
2020-03-26 19:46   ` Alex Williamson
2020-04-01 19:08     ` Dr. David Alan Gilbert [this message]
2020-04-01  5:50   ` Yan Zhao
2020-04-03 20:11     ` Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 15/16] vfio: Add ioctl to get dirty pages bitmap during dma unmap Kirti Wankhede
2020-03-24 21:09 ` [PATCH v16 QEMU 16/16] vfio: Make vfio-pci device migration capable Kirti Wankhede
2020-03-24 23:36 ` [PATCH v16 QEMU 00/16] Add migration support for VFIO devices no-reply
2020-03-31 18:34 ` Alex Williamson
2020-04-01  6:41   ` Yan Zhao
2020-04-01 18:34     ` Alex Williamson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200401190808.GK52559@work-vm \
    --to=dgilbert@redhat.com \
    --cc=Ken.Xue@amd.com \
    --cc=Zhengxiao.zx@alibaba-inc.com \
    --cc=aik@ozlabs.ru \
    --cc=alex.williamson@redhat.com \
    --cc=changpeng.liu@intel.com \
    --cc=cjia@nvidia.com \
    --cc=cohuck@redhat.com \
    --cc=eauger@redhat.com \
    --cc=eskultet@redhat.com \
    --cc=felipe@nutanix.com \
    --cc=jonathan.davies@nutanix.com \
    --cc=kevin.tian@intel.com \
    --cc=kwankhede@nvidia.com \
    --cc=mlevitsk@redhat.com \
    --cc=pasic@linux.ibm.com \
    --cc=qemu-devel@nongnu.org \
    --cc=shuangtai.tst@alibaba-inc.com \
    --cc=yan.y.zhao@intel.com \
    --cc=yi.l.liu@intel.com \
    --cc=zhi.a.wang@intel.com \
    --cc=ziye.yang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.