Linux virtualization list
 help / color / mirror / Atom feed
* Re: [PATCH 13/15] drm/vboxvideo: Convert vboxvideo driver to Simple TTM
From: Hans de Goede @ 2019-04-09  7:09 UTC (permalink / raw)
  To: Thomas Zimmermann, daniel, airlied, kraxel, christian.koenig,
	ray.huang, Jerry.Zhang, z.liuxinliang, zourongrong,
	kong.kongxinwei, puck.chen
  Cc: dri-devel, virtualization
In-Reply-To: <20190408092144.4548-14-tzimmermann@suse.de>

Hi,

On 08-04-19 11:21, Thomas Zimmermann wrote:
> Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>

Patch looks good to me (although perhaps it needs a commit msg):

Reviewed-by: Hans de Goede <hdegoede@redhat.com>

Regards,

Hans



> ---
>   drivers/gpu/drm/vboxvideo/Kconfig    |   1 +
>   drivers/gpu/drm/vboxvideo/vbox_drv.h |   6 +-
>   drivers/gpu/drm/vboxvideo/vbox_ttm.c | 123 ++-------------------------
>   3 files changed, 12 insertions(+), 118 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vboxvideo/Kconfig b/drivers/gpu/drm/vboxvideo/Kconfig
> index c1ca87df81df..e29df360978d 100644
> --- a/drivers/gpu/drm/vboxvideo/Kconfig
> +++ b/drivers/gpu/drm/vboxvideo/Kconfig
> @@ -4,6 +4,7 @@ config DRM_VBOXVIDEO
>   	select DRM_KMS_HELPER
>   	select DRM_TTM
>   	select DRM_GEM_TTM_HELPER
> +	select DRM_SIMPLE_TTM_HELPER
>   	select GENERIC_ALLOCATOR
>   	help
>   	  This is a KMS driver for the virtual Graphics Card used in
> diff --git a/drivers/gpu/drm/vboxvideo/vbox_drv.h b/drivers/gpu/drm/vboxvideo/vbox_drv.h
> index 7db4e961805d..d4cfcc6339ef 100644
> --- a/drivers/gpu/drm/vboxvideo/vbox_drv.h
> +++ b/drivers/gpu/drm/vboxvideo/vbox_drv.h
> @@ -20,6 +20,8 @@
>   #include <drm/drm_gem.h>
>   #include <drm/drm_gem_ttm_helper.h>
>   
> +#include <drm/drm_simple_ttm_helper.h>
> +
>   #include <drm/ttm/ttm_bo_api.h>
>   #include <drm/ttm/ttm_bo_driver.h>
>   #include <drm/ttm/ttm_placement.h>
> @@ -78,9 +80,7 @@ struct vbox_private {
>   
>   	int fb_mtrr;
>   
> -	struct {
> -		struct ttm_bo_device bdev;
> -	} ttm;
> +	struct drm_simple_ttm ttm;
>   
>   	struct mutex hw_mutex; /* protects modeset and accel/vbva accesses */
>   	struct work_struct hotplug_work;
> diff --git a/drivers/gpu/drm/vboxvideo/vbox_ttm.c b/drivers/gpu/drm/vboxvideo/vbox_ttm.c
> index a1d64e1ea90c..115ec44636ab 100644
> --- a/drivers/gpu/drm/vboxvideo/vbox_ttm.c
> +++ b/drivers/gpu/drm/vboxvideo/vbox_ttm.c
> @@ -11,128 +11,25 @@
>   #include <drm/ttm/ttm_page_alloc.h>
>   #include "vbox_drv.h"
>   
> -static inline struct vbox_private *vbox_bdev(struct ttm_bo_device *bd)
> -{
> -	return container_of(bd, struct vbox_private, ttm.bdev);
> -}
> -
> -static int
> -vbox_bo_init_mem_type(struct ttm_bo_device *bdev, u32 type,
> -		      struct ttm_mem_type_manager *man)
> -{
> -	switch (type) {
> -	case TTM_PL_SYSTEM:
> -		man->flags = TTM_MEMTYPE_FLAG_MAPPABLE;
> -		man->available_caching = TTM_PL_MASK_CACHING;
> -		man->default_caching = TTM_PL_FLAG_CACHED;
> -		break;
> -	case TTM_PL_VRAM:
> -		man->func = &ttm_bo_manager_func;
> -		man->flags = TTM_MEMTYPE_FLAG_FIXED | TTM_MEMTYPE_FLAG_MAPPABLE;
> -		man->available_caching = TTM_PL_FLAG_UNCACHED | TTM_PL_FLAG_WC;
> -		man->default_caching = TTM_PL_FLAG_WC;
> -		break;
> -	default:
> -		DRM_ERROR("Unsupported memory type %u\n", (unsigned int)type);
> -		return -EINVAL;
> -	}
> -
> -	return 0;
> -}
> -
> -static int vbox_ttm_io_mem_reserve(struct ttm_bo_device *bdev,
> -				   struct ttm_mem_reg *mem)
> -{
> -	struct ttm_mem_type_manager *man = &bdev->man[mem->mem_type];
> -	struct vbox_private *vbox = vbox_bdev(bdev);
> -
> -	mem->bus.addr = NULL;
> -	mem->bus.offset = 0;
> -	mem->bus.size = mem->num_pages << PAGE_SHIFT;
> -	mem->bus.base = 0;
> -	mem->bus.is_iomem = false;
> -	if (!(man->flags & TTM_MEMTYPE_FLAG_MAPPABLE))
> -		return -EINVAL;
> -	switch (mem->mem_type) {
> -	case TTM_PL_SYSTEM:
> -		/* system memory */
> -		return 0;
> -	case TTM_PL_VRAM:
> -		mem->bus.offset = mem->start << PAGE_SHIFT;
> -		mem->bus.base = pci_resource_start(vbox->ddev.pdev, 0);
> -		mem->bus.is_iomem = true;
> -		break;
> -	default:
> -		return -EINVAL;
> -	}
> -	return 0;
> -}
> -
> -static void vbox_ttm_io_mem_free(struct ttm_bo_device *bdev,
> -				 struct ttm_mem_reg *mem)
> -{
> -}
> -
> -static void vbox_ttm_backend_destroy(struct ttm_tt *tt)
> -{
> -	ttm_tt_fini(tt);
> -	kfree(tt);
> -}
> -
> -static struct ttm_backend_func vbox_tt_backend_func = {
> -	.destroy = &vbox_ttm_backend_destroy,
> -};
> -
> -static struct ttm_tt *vbox_ttm_tt_create(struct ttm_buffer_object *bo,
> -					 u32 page_flags)
> -{
> -	struct ttm_tt *tt;
> -
> -	tt = kzalloc(sizeof(*tt), GFP_KERNEL);
> -	if (!tt)
> -		return NULL;
> -
> -	tt->func = &vbox_tt_backend_func;
> -	if (ttm_tt_init(tt, bo, page_flags)) {
> -		kfree(tt);
> -		return NULL;
> -	}
> -
> -	return tt;
> -}
> -
> -static struct ttm_bo_driver vbox_bo_driver = {
> -	.ttm_tt_create = vbox_ttm_tt_create,
> -	.init_mem_type = vbox_bo_init_mem_type,
> -	.eviction_valuable = ttm_bo_eviction_valuable,
> +static const struct drm_simple_ttm_funcs vbox_simple_ttm_funcs = {
>   	.evict_flags = drm_gem_ttm_bo_driver_evict_flags,
> -	.verify_access = drm_gem_ttm_bo_driver_verify_access,
> -	.io_mem_reserve = &vbox_ttm_io_mem_reserve,
> -	.io_mem_free = &vbox_ttm_io_mem_free,
> +	.verify_access = drm_gem_ttm_bo_driver_verify_access
>   };
>   
>   int vbox_mm_init(struct vbox_private *vbox)
>   {
>   	int ret;
>   	struct drm_device *dev = &vbox->ddev;
> -	struct ttm_bo_device *bdev = &vbox->ttm.bdev;
>   
> -	ret = ttm_bo_device_init(&vbox->ttm.bdev,
> -				 &vbox_bo_driver,
> -				 dev->anon_inode->i_mapping,
> -				 true);
> +	ret = drm_simple_ttm_init(&vbox->ttm, dev,
> +				  pci_resource_start(dev->pdev, 0),
> +				  vbox->available_vram_size,
> +				  &vbox_simple_ttm_funcs);
>   	if (ret) {
> -		DRM_ERROR("Error initialising bo driver; %d\n", ret);
> +		DRM_ERROR("Error initializing Simple TTM; %d\n", ret);
>   		return ret;
>   	}
>   
> -	ret = ttm_bo_init_mm(bdev, TTM_PL_VRAM,
> -			     vbox->available_vram_size >> PAGE_SHIFT);
> -	if (ret) {
> -		DRM_ERROR("Failed ttm VRAM init: %d\n", ret);
> -		goto err_device_release;
> -	}
> -
>   #ifdef DRM_MTRR_WC
>   	vbox->fb_mtrr = drm_mtrr_add(pci_resource_start(dev->pdev, 0),
>   				     pci_resource_len(dev->pdev, 0),
> @@ -142,10 +39,6 @@ int vbox_mm_init(struct vbox_private *vbox)
>   					 pci_resource_len(dev->pdev, 0));
>   #endif
>   	return 0;
> -
> -err_device_release:
> -	ttm_bo_device_release(&vbox->ttm.bdev);
> -	return ret;
>   }
>   
>   void vbox_mm_fini(struct vbox_private *vbox)
> @@ -157,7 +50,7 @@ void vbox_mm_fini(struct vbox_private *vbox)
>   #else
>   	arch_phys_wc_del(vbox->fb_mtrr);
>   #endif
> -	ttm_bo_device_release(&vbox->ttm.bdev);
> +	drm_simple_ttm_cleanup(&vbox->ttm);
>   }
>   
>   int vbox_mmap(struct file *filp, struct vm_area_struct *vma)
> 

^ permalink raw reply

* Re: [PATCH 00/15] Share TTM code among framebuffer drivers
From: kraxel @ 2019-04-09  7:12 UTC (permalink / raw)
  To: Thomas Zimmermann
  Cc: airlied@linux.ie, puck.chen@hisilicon.com,
	dri-devel@lists.freedesktop.org,
	virtualization@lists.linux-foundation.org,
	z.liuxinliang@hisilicon.com, hdegoede@redhat.com,
	kong.kongxinwei@hisilicon.com, Huang, Ray, zourongrong@gmail.com,
	daniel@ffwll.ch, Zhang, Jerry, Koenig, Christian
In-Reply-To: <096a70a7-ed24-2161-29e9-1907221b8a64@suse.de>

  Hi,

> If not for TTM, what would be the alternative? One VMA manager per
> memory region per device?

Depends pretty much on the device.

The cirrus is a display device with only 4 MB of vram.  You can't fit
much in there.  A single 1024x768 @ 24bpp framebuffer needs more 50%
of the video memory already.  Which is why the cirrus driver (before the
rewrite) had to migrate buffers from/to vram on every page flip[1].  Which
is one[2] of the reasons why cirrus (after rewrite) doesn't ttm-manage the
vram any more.  gem objects are managed with the shmem helpers instead
and the active framebuffer is blitted to vram.

The qemu stdvga (bochs driver) has 16 MB vram by default and can be
configured to have up to 256 MB.  Plenty of room even for multiple 4k
framebuffers if needed.  So for the bochs driver all the ttm bo
migration logic is not needed, it could just store everything in vram.

A set of drm_gem_vram_* helpers would do the job for bochs.

I'd expect the same applies to the vbox driver.

Dunno about the other drm drivers and the fbdev drivers you plan to
migrate over.

cheers,
  Gerd

[1] Note: The page-flip migration logic is present in some of the other
    drivers too, not sure whenever they actually need that due to being low
    on vram too or whenever they just copied the old cirrus code ...

[2] The other reason is that this allow to convert formats at blit time,
    which helps to deal with some cirrus hardware limitations.

^ permalink raw reply

* Re: [PATCH 00/15] Share TTM code among framebuffer drivers
From: Dave Airlie @ 2019-04-09  7:42 UTC (permalink / raw)
  To: kraxel@redhat.com
  Cc: airlied@linux.ie, puck.chen@hisilicon.com, Zhang, Jerry,
	dri-devel@lists.freedesktop.org,
	virtualization@lists.linux-foundation.org,
	z.liuxinliang@hisilicon.com, hdegoede@redhat.com,
	kong.kongxinwei@hisilicon.com, Huang, Ray, Thomas Zimmermann,
	zourongrong@gmail.com, Koenig, Christian
In-Reply-To: <20190409071249.ngzarvjlztx4jwbh@sirius.home.kraxel.org>

On Tue, 9 Apr 2019 at 17:12, kraxel@redhat.com <kraxel@redhat.com> wrote:
>
>   Hi,
>
> > If not for TTM, what would be the alternative? One VMA manager per
> > memory region per device?
>
> Depends pretty much on the device.
>
> The cirrus is a display device with only 4 MB of vram.  You can't fit
> much in there.  A single 1024x768 @ 24bpp framebuffer needs more 50%
> of the video memory already.  Which is why the cirrus driver (before the
> rewrite) had to migrate buffers from/to vram on every page flip[1].  Which
> is one[2] of the reasons why cirrus (after rewrite) doesn't ttm-manage the
> vram any more.  gem objects are managed with the shmem helpers instead
> and the active framebuffer is blitted to vram.
>
> The qemu stdvga (bochs driver) has 16 MB vram by default and can be
> configured to have up to 256 MB.  Plenty of room even for multiple 4k
> framebuffers if needed.  So for the bochs driver all the ttm bo
> migration logic is not needed, it could just store everything in vram.

To clarify I assume you mean it doesn't need the migrate each bo
logic, but it still needs the when VRAM fills up migrate stuff logic.

Dave.

^ permalink raw reply

* Re: [PATCH 00/15] Share TTM code among framebuffer drivers
From: kraxel @ 2019-04-09  8:29 UTC (permalink / raw)
  To: Dave Airlie
  Cc: airlied@linux.ie, puck.chen@hisilicon.com, Zhang, Jerry,
	dri-devel@lists.freedesktop.org,
	virtualization@lists.linux-foundation.org,
	z.liuxinliang@hisilicon.com, hdegoede@redhat.com,
	kong.kongxinwei@hisilicon.com, Huang, Ray, Thomas Zimmermann,
	zourongrong@gmail.com, Koenig, Christian
In-Reply-To: <CAPM=9tzg08MU0n7EhV=FVSF0FWN3qway76P5=JX1DegYHrcwvg@mail.gmail.com>

  Hi,

> > The qemu stdvga (bochs driver) has 16 MB vram by default and can be
> > configured to have up to 256 MB.  Plenty of room even for multiple 4k
> > framebuffers if needed.  So for the bochs driver all the ttm bo
> > migration logic is not needed, it could just store everything in vram.
> 
> To clarify I assume you mean it doesn't need the migrate each bo
> logic, but it still needs the when VRAM fills up migrate stuff logic.

I think even the "when vram fills up" logic isn't that important.  The
driver has no acceleration so there is nothing to store beside dumb
framebuffers, and the vram size can easily be increased if needed.

cheers,
  Gerd

^ permalink raw reply

* Re: [PATCH RFC 0/4] vsock/virtio: optimizations to increase the throughput
From: Jason Wang @ 2019-04-09  8:36 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: kvm, Michael S. Tsirkin, netdev, linux-kernel, virtualization,
	David S. Miller, Stefano Garzarella
In-Reply-To: <20190408094416.GQ15001@stefanha-x1.localdomain>


On 2019/4/8 下午5:44, Stefan Hajnoczi wrote:
> On Mon, Apr 08, 2019 at 02:43:28PM +0800, Jason Wang wrote:
>> Another thing that may help is to implement sendpage(), which will greatly
>> improve the performance.
> I can't find documentation for ->sendpage().  Is the idea that you get a
> struct page for the payload and can do zero-copy tx?


Yes.


>   (And can userspace
> still write to the page, invalidating checksums in the header?)
>
> Stefan


Userspace can still write to the page, but for correctness (e.g in the 
case of SPLICE_F_GIFT describe by vmsplice(2)), it should not do this. 
For vmsplice, it may hard to detect the time to reuse the page. Maybe we 
MSG_ZEROCOPY[1] is better.

Anyway, sendpage() could be still useful for sendfile() or splice().

Thanks

[1] https://netdevconf.org/2.1/papers/netdev.pdf

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH] drm/qxl: drop prime import/export callbacks
From: Gerd Hoffmann @ 2019-04-09  8:47 UTC (permalink / raw)
  To: David Airlie
  Cc: David Airlie, open list:DRM DRIVER FOR QXL VIRTUAL GPU, open list,
	dri-devel, open list:DRM DRIVER FOR QXL VIRTUAL GPU
In-Reply-To: <CAMwc25q-7rfR=31R=FEVC47z2HvDK5xN+aOKqq00yoXSpudusQ@mail.gmail.com>

  Hi,

> > Should we add something like DRM_PRIME_CAP_SAME_DEVICE?
> 
> Yeah I expect we need some sort of same device only capability, so
> that dri3 userspace can work.
> 
> If we just fail importing in these cases what happens? userspace just
> gets confused, I know we used to print a backtrace if we hit the mmap
> path, but if we didn't do that what happens?

Well, we printed a backtrace and returned -EINVAL.  So it looked a bit
scary due to the backtrace which is usually printed for more serious
problems.  But we also returned a proper error code.

Userspace was not happy.  It was wayland (gnome-shell) which ran into it
it, and it didn't came up with a working display.

cheers,
  Gerd

^ permalink raw reply

* Re: [PATCH] virtio: Honour 'may_reduce_num' in vring_create_virtqueue
From: Jason Wang @ 2019-04-09  9:00 UTC (permalink / raw)
  To: Cornelia Huck, Michael S . Tsirkin
  Cc: Halil Pasic, stable, linux-kernel, kvm, virtualization
In-Reply-To: <20190408123322.24086-1-cohuck@redhat.com>


On 2019/4/8 下午8:33, Cornelia Huck wrote:
> vring_create_virtqueue() allows the caller to specify via the
> may_reduce_num parameter whether the vring code is allowed to
> allocate a smaller ring than specified.
>
> However, the split ring allocation code tries to allocate a
> smaller ring on allocation failure regardless of what the
> caller specified. This may cause trouble for e.g. virtio-pci
> in legacy mode, which does not support ring resizing. (The
> packed ring code does not resize in any case.)
>
> Let's fix this by bailing out immediately in the split ring code
> if the requested size cannot be allocated and may_reduce_num has
> not been specified.
>
> While at it, fix a typo in the usage instructions.
>
> Fixes: 2a2d1382fe9d ("virtio: Add improved queue allocation API")
> Cc: stable@vger.kernel.org # v4.6+
> Signed-off-by: Cornelia Huck <cohuck@redhat.com>
> ---
>   drivers/virtio/virtio_ring.c | 2 ++
>   include/linux/virtio_ring.h  | 2 +-
>   2 files changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 18846afb39da..5df92c308286 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -882,6 +882,8 @@ static struct virtqueue *vring_create_virtqueue_split(
>   					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
>   		if (queue)
>   			break;
> +		if (!may_reduce_num)
> +			return NULL;
>   	}
>   
>   	if (!num)
> diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> index fab02133a919..3dc70adfe5f5 100644
> --- a/include/linux/virtio_ring.h
> +++ b/include/linux/virtio_ring.h
> @@ -63,7 +63,7 @@ struct virtqueue;
>   /*
>    * Creates a virtqueue and allocates the descriptor ring.  If
>    * may_reduce_num is set, then this may allocate a smaller ring than
> - * expected.  The caller should query virtqueue_get_ring_size to learn
> + * expected.  The caller should query virtqueue_get_vring_size to learn
>    * the actual size of the ring.
>    */
>   struct virtqueue *vring_create_virtqueue(unsigned int index,


Acked-by: Jason Wang <jasowang@redhat.com>


_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC 0/4] vsock/virtio: optimizations to increase the throughput
From: Stefano Garzarella @ 2019-04-09  9:13 UTC (permalink / raw)
  To: Jason Wang
  Cc: kvm, Michael S. Tsirkin, netdev, linux-kernel, virtualization,
	Stefan Hajnoczi, David S. Miller
In-Reply-To: <a98d194d-e257-41ff-82e7-de2d16d2f4d4@redhat.com>

On Mon, Apr 08, 2019 at 02:43:28PM +0800, Jason Wang wrote:
> 
> On 2019/4/4 下午6:58, Stefano Garzarella wrote:
> > This series tries to increase the throughput of virtio-vsock with slight
> > changes:
> >   - patch 1/4: reduces the number of credit update messages sent to the
> >                transmitter
> >   - patch 2/4: allows the host to split packets on multiple buffers,
> >                in this way, we can remove the packet size limit to
> >                VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE
> >   - patch 3/4: uses VIRTIO_VSOCK_MAX_PKT_BUF_SIZE as the max packet size
> >                allowed
> >   - patch 4/4: increases RX buffer size to 64 KiB (affects only host->guest)
> > 
> > RFC:
> >   - maybe patch 4 can be replaced with multiple queues with different
> >     buffer sizes or using EWMA to adapt the buffer size to the traffic
> 
> 
> Or EWMA + mergeable rx buffer, but if we decide to unify the datapath with
> virtio-net, we can reuse their codes.
> 
> 
> > 
> >   - as Jason suggested in a previous thread [1] I'll evaluate to use
> >     virtio-net as transport, but I need to understand better how to
> >     interface with it, maybe introducing sk_buff in virtio-vsock.
> > 
> > Any suggestions?
> 
> 
> My understanding is this is not a must, but if it makes things easier, we
> can do this.

Hopefully it should simplify the maintainability and avoid duplicated code.

> 
> Another thing that may help is to implement sendpage(), which will greatly
> improve the performance.

Thanks for your suggestions!
I'll try to implement sendpage() in VSOCK to measure the improvement.

Cheers,
Stefano
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC PATCH 02/12] virtio/s390: DMA support for virtio-ccw
From: Cornelia Huck @ 2019-04-09  9:57 UTC (permalink / raw)
  To: Halil Pasic
  Cc: Vasily Gorbik, linux-s390, Eric Farman, Claudio Imbrenda, kvm,
	Sebastian Ott, Farhan Ali, virtualization, Martin Schwidefsky,
	Viktor Mihajlovski, Janosch Frank
In-Reply-To: <20190404231622.52531-3-pasic@linux.ibm.com>

On Fri,  5 Apr 2019 01:16:12 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> Currently we have a problem if a virtio-ccw device has
> VIRTIO_F_IOMMU_PLATFORM. 

Can you please describe what the actual problem is?

> In future we do want to support DMA API with
> virtio-ccw.
> 
> Let us do the plumbing, so the feature VIRTIO_F_IOMMU_PLATFORM works
> with virtio-ccw.
> 
> Let us also switch from legacy avail/used accessors to the DMA aware
> ones (even if it isn't strictly necessary).

I think with this change we can remove the legacy accessors, if I
didn't mis-grep.

> 
> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> ---
>  drivers/s390/virtio/virtio_ccw.c | 23 +++++++++++++++++------
>  1 file changed, 17 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
> index edf4afe2d688..5956c9e820bb 100644
> --- a/drivers/s390/virtio/virtio_ccw.c
> +++ b/drivers/s390/virtio/virtio_ccw.c
> @@ -66,6 +66,7 @@ struct virtio_ccw_device {
>  	bool device_lost;
>  	unsigned int config_ready;
>  	void *airq_info;
> +	__u64 dma_mask;

u64?

>  };
>  
>  struct vq_info_block_legacy {
> @@ -536,8 +537,8 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
>  		info->info_block->s.desc = queue;
>  		info->info_block->s.index = i;
>  		info->info_block->s.num = info->num;
> -		info->info_block->s.avail = (__u64)virtqueue_get_avail(vq);
> -		info->info_block->s.used = (__u64)virtqueue_get_used(vq);
> +		info->info_block->s.avail = (__u64)virtqueue_get_avail_addr(vq);
> +		info->info_block->s.used = (__u64)virtqueue_get_used_addr(vq);
>  		ccw->count = sizeof(info->info_block->s);
>  	}
>  	ccw->cmd_code = CCW_CMD_SET_VQ;
> @@ -769,10 +770,8 @@ static u64 virtio_ccw_get_features(struct virtio_device *vdev)
>  static void ccw_transport_features(struct virtio_device *vdev)
>  {
>  	/*
> -	 * Packed ring isn't enabled on virtio_ccw for now,
> -	 * because virtio_ccw uses some legacy accessors,
> -	 * e.g. virtqueue_get_avail() and virtqueue_get_used()
> -	 * which aren't available in packed ring currently.
> +	 * There shouldn't be anything that precludes supporting paced.

s/paced/packed/

> +	 * TODO: Remove the limitation after having another look into this.
>  	 */
>  	__virtio_clear_bit(vdev, VIRTIO_F_RING_PACKED);
>  }
> @@ -1255,6 +1254,18 @@ static int virtio_ccw_online(struct ccw_device *cdev)
>  		ret = -ENOMEM;
>  		goto out_free;
>  	}
> +	vcdev->vdev.dev.parent = &cdev->dev;

That one makes sense, pci and mmio are doing that as well.

> +	cdev->dev.dma_mask = &vcdev->dma_mask;

That one feels a bit weird. Will this change in one of the follow-on
patches? (Have not yet looked at the whole series.)

> +
> +	ret = dma_set_mask_and_coherent(&cdev->dev, DMA_BIT_MASK(64));
> +	if (ret)
> +		ret = dma_set_mask_and_coherent(&cdev->dev,
> +						DMA_BIT_MASK(32));
> +	if (ret) {
> +		dev_warn(&cdev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");

This does not look like you'd try to continue?

> +		goto out_free;
> +	}
> +
>  	vcdev->config_block = kzalloc(sizeof(*vcdev->config_block),
>  				   GFP_DMA | GFP_KERNEL);
>  	if (!vcdev->config_block) {

^ permalink raw reply

* Re: [RFC PATCH 03/12] s390/mm: force swiotlb for protected virtualization
From: Cornelia Huck @ 2019-04-09 10:16 UTC (permalink / raw)
  To: Halil Pasic
  Cc: Vasily Gorbik, linux-s390, Eric Farman, Claudio Imbrenda, kvm,
	Sebastian Ott, Farhan Ali, virtualization, Martin Schwidefsky,
	Viktor Mihajlovski, Janosch Frank
In-Reply-To: <20190404231622.52531-4-pasic@linux.ibm.com>

On Fri,  5 Apr 2019 01:16:13 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On s390 protected virtualization guests also have to use bounce I/O
> buffers.  That requires some plumbing.
> 
> Let us make sure any device using DMA API accordingly is spared from the
> problems that hypervisor attempting I/O to a non-shared secure page would
> bring.

I have problems parsing this sentence :(

Do you mean that we want to exclude pages for I/O from encryption?

> 
> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> ---
>  arch/s390/Kconfig                   |  4 ++++
>  arch/s390/include/asm/Kbuild        |  1 -
>  arch/s390/include/asm/dma-mapping.h | 13 +++++++++++
>  arch/s390/include/asm/mem_encrypt.h | 18 +++++++++++++++
>  arch/s390/mm/init.c                 | 44 +++++++++++++++++++++++++++++++++++++
>  5 files changed, 79 insertions(+), 1 deletion(-)
>  create mode 100644 arch/s390/include/asm/dma-mapping.h
>  create mode 100644 arch/s390/include/asm/mem_encrypt.h

(...)

> @@ -126,6 +129,45 @@ void mark_rodata_ro(void)
>  	pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
>  }
>  
> +int set_memory_encrypted(unsigned long addr, int numpages)
> +{
> +	/* also called for the swiotlb bounce buffers, make all pages shared */
> +	/* TODO: do ultravisor calls */
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(set_memory_encrypted);
> +
> +int set_memory_decrypted(unsigned long addr, int numpages)
> +{
> +	/* also called for the swiotlb bounce buffers, make all pages shared */
> +	/* TODO: do ultravisor calls */
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(set_memory_decrypted);
> +
> +/* are we a protected virtualization guest? */
> +bool sev_active(void)
> +{
> +	/*
> +	 * TODO: Do proper detection using ultravisor, for now let us fake we
> +	 *  have it so the code gets exercised.

That's the swiotlb stuff, right?

(The patches will obviously need some reordering before it is actually
getting merged.)

> +	 */
> +	return true;
> +}
> +EXPORT_SYMBOL_GPL(sev_active);
> +
> +/* protected virtualization */
> +static void pv_init(void)
> +{
> +	if (!sev_active())
> +		return;
> +
> +	/* make sure bounce buffers are shared */
> +	swiotlb_init(1);
> +	swiotlb_update_mem_attributes();
> +	swiotlb_force = SWIOTLB_FORCE;
> +}
> +
>  void __init mem_init(void)
>  {
>  	cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
> @@ -134,6 +176,8 @@ void __init mem_init(void)
>  	set_max_mapnr(max_low_pfn);
>          high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
>  
> +	pv_init();
> +
>  	/* Setup guest page hinting */
>  	cmma_init();
>  

^ permalink raw reply

* Re: [RFC PATCH 04/12] s390/cio: introduce cio DMA pool
From: Cornelia Huck @ 2019-04-09 10:44 UTC (permalink / raw)
  To: Halil Pasic
  Cc: Vasily Gorbik, linux-s390, Eric Farman, Claudio Imbrenda, kvm,
	Sebastian Ott, Farhan Ali, virtualization, Martin Schwidefsky,
	Viktor Mihajlovski, Janosch Frank
In-Reply-To: <20190404231622.52531-5-pasic@linux.ibm.com>

On Fri,  5 Apr 2019 01:16:14 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> To support protected virtualization cio will need to make sure the
> memory used for communication with the hypervisor is DMA memory.
> 
> Let us introduce a DMA pool to cio that will help us in allocating

missing 'and'

> deallocating those chunks of memory.
> 
> We use a gen_pool backed with DMA pages to avoid each allocation
> effectively wasting a page, as we typically allocate much less
> than PAGE_SIZE.
> 
> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> ---
>  arch/s390/Kconfig           |  1 +
>  arch/s390/include/asm/cio.h |  3 +++
>  drivers/s390/cio/css.c      | 57 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 61 insertions(+)
> 
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index 46c69283a67b..e8099ab47368 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -194,6 +194,7 @@ config S390
>  	select VIRT_TO_BUS
>  	select HAVE_NMI
>  	select SWIOTLB
> +	select GENERIC_ALLOCATOR
>  
>  
>  config SCHED_OMIT_FRAME_POINTER
> diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
> index 1727180e8ca1..4510e418614a 100644
> --- a/arch/s390/include/asm/cio.h
> +++ b/arch/s390/include/asm/cio.h
> @@ -328,6 +328,9 @@ static inline u8 pathmask_to_pos(u8 mask)
>  void channel_subsystem_reinit(void);
>  extern void css_schedule_reprobe(void);
>  
> +extern void *cio_dma_zalloc(size_t size);
> +extern void cio_dma_free(void *cpu_addr, size_t size);
> +
>  /* Function from drivers/s390/cio/chsc.c */
>  int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta);
>  int chsc_sstpi(void *page, void *result, size_t size);
> diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
> index aea502922646..72629d99d8e4 100644
> --- a/drivers/s390/cio/css.c
> +++ b/drivers/s390/cio/css.c
> @@ -20,6 +20,8 @@
>  #include <linux/reboot.h>
>  #include <linux/suspend.h>
>  #include <linux/proc_fs.h>
> +#include <linux/genalloc.h>
> +#include <linux/dma-mapping.h>
>  #include <asm/isc.h>
>  #include <asm/crw.h>
>  
> @@ -886,6 +888,8 @@ static const struct attribute_group *cssdev_attr_groups[] = {
>  	NULL,
>  };
>  
> +static u64 css_dev_dma_mask = DMA_BIT_MASK(31);
> +
>  static int __init setup_css(int nr)
>  {
>  	struct channel_subsystem *css;
> @@ -899,6 +903,9 @@ static int __init setup_css(int nr)
>  	dev_set_name(&css->device, "css%x", nr);
>  	css->device.groups = cssdev_attr_groups;
>  	css->device.release = channel_subsystem_release;
> +	/* some cio DMA memory needs to be 31 bit addressable */
> +	css->device.coherent_dma_mask = DMA_BIT_MASK(31),
> +	css->device.dma_mask = &css_dev_dma_mask;

Question: Does this percolate down to the child devices eventually?
E.g., you have a ccw_device getting the mask from its parent
subchannel, which gets it from its parent css? If so, does that clash
with the the mask you used for the virtio_ccw_device in a previous
patch? Or are they two really different things?

>  
>  	mutex_init(&css->mutex);
>  	css->cssid = chsc_get_cssid(nr);
> @@ -1018,6 +1025,55 @@ static struct notifier_block css_power_notifier = {
>  	.notifier_call = css_power_event,
>  };
>  
> +#define POOL_INIT_PAGES 1
> +static struct gen_pool *cio_dma_pool;
> +/* Currently cio supports only a single css */
> +static struct device *cio_dma_css;

That global variable feels wrong, especially if you plan to support
MCSS-E in the future. (Do you? :) If yes, should the dma pool be global
or per-css? As css0 currently is the root device for the channel
subsystem stuff, you'd either need a new parent to hang this off from
or size this with the number of css images.)

For now, just grab channel_subsystems[0]->device directly?

> +static gfp_t  cio_dma_flags;
> +
> +static void __init cio_dma_pool_init(void)
> +{
> +	void *cpu_addr;
> +	dma_addr_t dma_addr;
> +	int i;
> +
> +	cio_dma_css = &channel_subsystems[0]->device;
> +	cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO;
> +	cio_dma_pool = gen_pool_create(3, -1);
> +	/* No need to free up the resources: compiled in */
> +	for (i = 0; i < POOL_INIT_PAGES; ++i) {
> +		cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr,
> +					      cio_dma_flags);
> +		if (!cpu_addr)
> +			return;
> +		gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr,
> +				  dma_addr, PAGE_SIZE, -1);
> +	}
> +
> +}
> +
> +void *cio_dma_zalloc(size_t size)
> +{
> +	dma_addr_t dma_addr;
> +	unsigned long addr = gen_pool_alloc(cio_dma_pool, size);
> +
> +	if (!addr) {
> +		addr = (unsigned long) dma_alloc_coherent(cio_dma_css,
> +					PAGE_SIZE, &dma_addr, cio_dma_flags);
> +		if (!addr)
> +			return NULL;
> +		gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1);
> +		addr = gen_pool_alloc(cio_dma_pool, size);
> +	}
> +	return (void *) addr;

At this point, you're always going via the css0 device. I'm wondering
whether you should pass in the cssid here and use css_by_id(cssid) to
make this future proof. But then, I'm not quite clear from which
context this will be called.

> +}
> +
> +void cio_dma_free(void *cpu_addr, size_t size)
> +{
> +	memset(cpu_addr, 0, size);
> +	gen_pool_free(cio_dma_pool, (unsigned long) cpu_addr, size);
> +}
> +
>  /*
>   * Now that the driver core is running, we can setup our channel subsystem.
>   * The struct subchannel's are created during probing.
> @@ -1063,6 +1119,7 @@ static int __init css_bus_init(void)
>  		unregister_reboot_notifier(&css_reboot_notifier);
>  		goto out_unregister;
>  	}
> +	cio_dma_pool_init();
>  	css_init_done = 1;
>  
>  	/* Enable default isc for I/O subchannels. */

^ permalink raw reply

* [PATCH 4/4] drm: add convert_lines_toio() variant, fix cirrus builds on powerpc.
From: Gerd Hoffmann @ 2019-04-09 11:59 UTC (permalink / raw)
  To: dri-devel
  Cc: Stephen Rothwell, David Airlie, Sean Paul, Maarten Lankhorst,
	open list, open list:DRM DRIVER FOR QEMU'S CIRRUS DEVICE,
	Maxime Ripard, noralf, Daniel Vetter, Dave Airlie, sam
In-Reply-To: <20190409115913.3468-1-kraxel@redhat.com>

The __io_virt() macro is not available on all architectures, so cirrus
can't simply pass a pointer to io memory down to the format conversion
helpers.  The format conversion helpers must use memcpy_toio() instead.

Add a convert_lines_toio() variant which does just that.  Switch the
drm_fb_*_dstclip() functions used by cirrus to accept a __iomem pointer
as destination and pass that down to convert_lines_toio().  Fix the
calls in the cirrus driver to not use __io_virt() any more.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 include/drm/drm_format_helper.h     |  9 ++--
 drivers/gpu/drm/cirrus/cirrus.c     |  6 +--
 drivers/gpu/drm/drm_format_helper.c | 74 ++++++++++++++++++++++-------
 3 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/include/drm/drm_format_helper.h b/include/drm/drm_format_helper.h
index 6f84380757ee..3532b76c2340 100644
--- a/include/drm/drm_format_helper.h
+++ b/include/drm/drm_format_helper.h
@@ -15,17 +15,20 @@ struct drm_rect;
 
 void drm_fb_memcpy(void *dst, void *vaddr, struct drm_framebuffer *fb,
 		   struct drm_rect *clip);
-void drm_fb_memcpy_dstclip(void *dst, void *vaddr, struct drm_framebuffer *fb,
+void drm_fb_memcpy_dstclip(void __iomem *dst, void *vaddr,
+			   struct drm_framebuffer *fb,
 			   struct drm_rect *clip);
 void drm_fb_swab16(u16 *dst, void *vaddr, struct drm_framebuffer *fb,
 		   struct drm_rect *clip);
 void drm_fb_xrgb8888_to_rgb565(void *dst, void *vaddr,
 			       struct drm_framebuffer *fb,
 			       struct drm_rect *clip, bool swap);
-void drm_fb_xrgb8888_to_rgb565_dstclip(void *dst, unsigned int dst_pitch,
+void drm_fb_xrgb8888_to_rgb565_dstclip(void __iomem *dst,
+				       unsigned int dst_pitch,
 				       void *vaddr, struct drm_framebuffer *fb,
 				       struct drm_rect *clip, bool swap);
-void drm_fb_xrgb8888_to_rgb888_dstclip(void *dst, unsigned int dst_pitch,
+void drm_fb_xrgb8888_to_rgb888_dstclip(void __iomem *dst,
+				       unsigned int dst_pitch,
 				       void *vaddr, struct drm_framebuffer *fb,
 				       struct drm_rect *clip);
 void drm_fb_xrgb8888_to_gray8(u8 *dst, void *vaddr, struct drm_framebuffer *fb,
diff --git a/drivers/gpu/drm/cirrus/cirrus.c b/drivers/gpu/drm/cirrus/cirrus.c
index 5095b8ce52c2..be4ea370ba31 100644
--- a/drivers/gpu/drm/cirrus/cirrus.c
+++ b/drivers/gpu/drm/cirrus/cirrus.c
@@ -307,16 +307,16 @@ static int cirrus_fb_blit_rect(struct drm_framebuffer *fb,
 		return -ENOMEM;
 
 	if (cirrus->cpp == fb->format->cpp[0])
-		drm_fb_memcpy_dstclip(__io_virt(cirrus->vram),
+		drm_fb_memcpy_dstclip(cirrus->vram,
 				      vmap, fb, rect);
 
 	else if (fb->format->cpp[0] == 4 && cirrus->cpp == 2)
-		drm_fb_xrgb8888_to_rgb565_dstclip(__io_virt(cirrus->vram),
+		drm_fb_xrgb8888_to_rgb565_dstclip(cirrus->vram,
 						  cirrus->pitch,
 						  vmap, fb, rect, false);
 
 	else if (fb->format->cpp[0] == 4 && cirrus->cpp == 3)
-		drm_fb_xrgb8888_to_rgb888_dstclip(__io_virt(cirrus->vram),
+		drm_fb_xrgb8888_to_rgb888_dstclip(cirrus->vram,
 						  cirrus->pitch,
 						  vmap, fb, rect);
 
diff --git a/drivers/gpu/drm/drm_format_helper.c b/drivers/gpu/drm/drm_format_helper.c
index 01d9ea134618..6a759cbaa263 100644
--- a/drivers/gpu/drm/drm_format_helper.c
+++ b/drivers/gpu/drm/drm_format_helper.c
@@ -11,6 +11,8 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 
+#include <asm/io.h>
+
 #include <drm/drm_format_helper.h>
 #include <drm/drm_framebuffer.h>
 #include <drm/drm_fourcc.h>
@@ -125,6 +127,30 @@ static void convert_lines(void *dst, unsigned int dst_pitch,
 	kfree(sbuf);
 }
 
+static void convert_lines_toio(void __iomem *dst, unsigned int dst_pitch,
+			       void *src, unsigned int src_pitch,
+			       unsigned int pixels,
+			       unsigned int lines,
+			       struct drm_format_convert *conv)
+{
+	u32 dst_linelength = pixels * conv->dst_cpp;
+	u32 y;
+	void *dbuf;
+
+	dbuf = kmalloc(dst_linelength, GFP_KERNEL);
+	if (!dbuf)
+		return;
+
+	for (y = 0; y < lines; y++) {
+		conv->func(dbuf, src, pixels);
+		memcpy_toio(dst, dbuf, dst_linelength);
+		src += src_pitch;
+		dst += dst_pitch;
+	}
+
+	kfree(dbuf);
+}
+
 static u32 clip_offset(struct drm_rect *clip, u32 pitch, u32 cpp)
 {
 	return (clip->y1 * pitch) + (clip->x1 * cpp);
@@ -143,6 +169,19 @@ static void drm_fb_memcpy_lines(void *dst, unsigned int dst_pitch,
 	}
 }
 
+static void drm_fb_memcpy_lines_toio(void __iomem *dst, unsigned int dst_pitch,
+				     void *src, unsigned int src_pitch,
+				     unsigned int linelength, unsigned int lines)
+{
+	int line;
+
+	for (line = 0; line < lines; line++) {
+		memcpy_toio(dst, src, linelength);
+		src += src_pitch;
+		dst += dst_pitch;
+	}
+}
+
 /**
  * drm_fb_memcpy - Copy clip buffer
  * @dst: Destination buffer
@@ -176,16 +215,17 @@ EXPORT_SYMBOL(drm_fb_memcpy);
  * This function applies clipping on dst, i.e. the destination is a
  * full framebuffer but only the clip rect content is copied over.
  */
-void drm_fb_memcpy_dstclip(void *dst, void *vaddr, struct drm_framebuffer *fb,
+void drm_fb_memcpy_dstclip(void __iomem *dst, void *vaddr,
+			   struct drm_framebuffer *fb,
 			   struct drm_rect *clip)
 {
 	unsigned int cpp = drm_format_plane_cpp(fb->format->format, 0);
 	unsigned int offset = (clip->y1 * fb->pitches[0]) + (clip->x1 * cpp);
 	size_t len = (clip->x2 - clip->x1) * cpp;
 
-	drm_fb_memcpy_lines(dst + offset, fb->pitches[0],
-			    vaddr + offset, fb->pitches[0],
-			    len, clip->y2 - clip->y1);
+	drm_fb_memcpy_lines_toio(dst + offset, fb->pitches[0],
+				 vaddr + offset, fb->pitches[0],
+				 len, clip->y2 - clip->y1);
 }
 EXPORT_SYMBOL(drm_fb_memcpy_dstclip);
 
@@ -245,7 +285,7 @@ EXPORT_SYMBOL(drm_fb_xrgb8888_to_rgb565);
 
 /**
  * drm_fb_xrgb8888_to_rgb565_dstclip - Convert XRGB8888 to RGB565 clip buffer
- * @dst: RGB565 destination buffer
+ * @dst: RGB565 destination buffer (__iomem)
  * @dst_pitch: destination buffer pitch
  * @vaddr: XRGB8888 source buffer
  * @fb: DRM framebuffer
@@ -256,9 +296,10 @@ EXPORT_SYMBOL(drm_fb_xrgb8888_to_rgb565);
  * support XRGB8888.
  *
  * This function applies clipping on dst, i.e. the destination is a
- * full framebuffer but only the clip rect content is copied over.
+ * full framebuffer, in iomem, but only the clip rect content is copied over.
  */
-void drm_fb_xrgb8888_to_rgb565_dstclip(void *dst, unsigned int dst_pitch,
+void drm_fb_xrgb8888_to_rgb565_dstclip(void __iomem *dst,
+				       unsigned int dst_pitch,
 				       void *vaddr, struct drm_framebuffer *fb,
 				       struct drm_rect *clip, bool swap)
 {
@@ -272,15 +313,15 @@ void drm_fb_xrgb8888_to_rgb565_dstclip(void *dst, unsigned int dst_pitch,
 	size_t pixels = (clip->x2 - clip->x1);
 	size_t lines = (clip->y2 - clip->y1);
 
-	convert_lines(dst + dst_offset, dst_pitch,
-		      vaddr + src_offset, fb->pitches[0],
-		      pixels, lines, conv);
+	convert_lines_toio(dst + dst_offset, dst_pitch,
+			   vaddr + src_offset, fb->pitches[0],
+			   pixels, lines, conv);
 }
 EXPORT_SYMBOL(drm_fb_xrgb8888_to_rgb565_dstclip);
 
 /**
  * drm_fb_xrgb8888_to_rgb888_dstclip - Convert XRGB8888 to RGB888 clip buffer
- * @dst: RGB565 destination buffer
+ * @dst: RGB888 destination buffer (__iomem)
  * @dst_pitch: destination buffer pitch
  * @vaddr: XRGB8888 source buffer
  * @fb: DRM framebuffer
@@ -291,9 +332,10 @@ EXPORT_SYMBOL(drm_fb_xrgb8888_to_rgb565_dstclip);
  * support XRGB8888.
  *
  * This function applies clipping on dst, i.e. the destination is a
- * full framebuffer but only the clip rect content is copied over.
+ * full framebuffer, in iomem, but only the clip rect content is copied over.
  */
-void drm_fb_xrgb8888_to_rgb888_dstclip(void *dst, unsigned int dst_pitch,
+void drm_fb_xrgb8888_to_rgb888_dstclip(void __iomem *dst,
+				       unsigned int dst_pitch,
 				       void *vaddr, struct drm_framebuffer *fb,
 				       struct drm_rect *clip)
 {
@@ -305,9 +347,9 @@ void drm_fb_xrgb8888_to_rgb888_dstclip(void *dst, unsigned int dst_pitch,
 	size_t pixels = (clip->x2 - clip->x1);
 	size_t lines = (clip->y2 - clip->y1);
 
-	convert_lines(dst + dst_offset, dst_pitch,
-		      vaddr + src_offset, fb->pitches[0],
-		      pixels, lines, conv);
+	convert_lines_toio(dst + dst_offset, dst_pitch,
+			   vaddr + src_offset, fb->pitches[0],
+			   pixels, lines, conv);
 }
 EXPORT_SYMBOL(drm_fb_xrgb8888_to_rgb888_dstclip);
 
-- 
2.18.1

^ permalink raw reply related

* Re: [RFC PATCH 03/12] s390/mm: force swiotlb for protected virtualization
From: Christoph Hellwig @ 2019-04-09 12:22 UTC (permalink / raw)
  To: Halil Pasic
  Cc: Vasily Gorbik, linux-s390, Eric Farman, Claudio Imbrenda, kvm,
	Sebastian Ott, Cornelia Huck, virtualization, Martin Schwidefsky,
	Farhan Ali, Viktor Mihajlovski, Janosch Frank
In-Reply-To: <20190404231622.52531-4-pasic@linux.ibm.com>

> +++ b/arch/s390/include/asm/dma-mapping.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_S390_DMA_MAPPING_H
> +#define _ASM_S390_DMA_MAPPING_H
> +
> +#include <linux/dma-contiguous.h>
> +
> +static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
> +{
> +	return NULL;
> +}
> +
> +#endif /* _ASM_S390_DMA_MAPPING_H */
> +

Congratulations!  You ust create an entirely pointless duplicate of
include/asm-generic/dma-mapping.h.

^ permalink raw reply

* Re: [PATCH net] vhost: reject zero size iova range
From: Michael S. Tsirkin @ 2019-04-09 12:50 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20190409041025.20922-1-jasowang@redhat.com>

On Tue, Apr 09, 2019 at 12:10:25PM +0800, Jason Wang wrote:
> We used to accept zero size iova range which will lead a infinite loop
> in translate_desc(). Fixing this by failing the request in this case.
> 
> Reported-by: syzbot+d21e6e297322a900c128@syzkaller.appspotmail.com
> Fixes: 6b1e6cc7 ("vhost: new device IOTLB API")
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>

Seems appropriate for stable.

> ---
>  drivers/vhost/vhost.c | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 5ace833de746..351af88231ad 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -911,8 +911,12 @@ static int vhost_new_umem_range(struct vhost_umem *umem,
>  				u64 start, u64 size, u64 end,
>  				u64 userspace_addr, int perm)
>  {
> -	struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC);
> +	struct vhost_umem_node *tmp, *node;
>  
> +	if (!size)
> +		return -EFAULT;
> +
> +	node = kmalloc(sizeof(*node), GFP_ATOMIC);
>  	if (!node)
>  		return -ENOMEM;
>  
> -- 
> 2.19.1

^ permalink raw reply

* Re: [RFC PATCH 02/12] virtio/s390: DMA support for virtio-ccw
From: Cornelia Huck @ 2019-04-09 13:01 UTC (permalink / raw)
  To: Halil Pasic
  Cc: Vasily Gorbik, linux-s390, Eric Farman, Claudio Imbrenda, kvm,
	Sebastian Ott, Farhan Ali, virtualization, Martin Schwidefsky,
	Viktor Mihajlovski, Janosch Frank
In-Reply-To: <20190409132927.5df3bc50@oc2783563651>

On Tue, 9 Apr 2019 13:29:27 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 9 Apr 2019 11:57:43 +0200
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Fri,  5 Apr 2019 01:16:12 +0200
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> > > Currently we have a problem if a virtio-ccw device has
> > > VIRTIO_F_IOMMU_PLATFORM.   
> > 
> > Can you please describe what the actual problem is?
> >   
> 
> Without this patch:
> 
> WARNING: CPU: 2 PID: 26
> at [..]/kernel/dma/mapping.c:251
> dma_alloc_attrs+0x8e/0xd0 Modules linked in: CPU: 2 PID: 26 Comm:
> kworker/u6:1 Not tainted 5.1.0-rc3-00023-g1ec89ec #596 Hardware name:
> IBM 2964 NC9 712 (KVM/Linux) Workqueue: events_unbound async_run_entry_fn
> Krnl PSW : 0704c00180000000 000000000021b18e (dma_alloc_attrs+0x8e/0xd0)
>            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> Krnl GPRS: 0000000000000000 0000000000000000 0000000000000000
> 0000000000001406 000003e00040f838 0000000000002dc0 0000000000000100
> 0000000000000001 0000000000001000 000000000236f028 000003e00040f838
> 0000000000001406 000000004b289828 0000000000000080 000003e00040f6f8
> 000003e00040f6a0 Krnl Code: 000000000021b17e: f0e00004ebaf
> srp	4(15,%r0),2991(%r14),0 000000000021b184: f0c0000407f4
> srp	4(13,%r0),2036,0 #000000000021b18a: a7f40001
> brc	15,21b18c >000000000021b18e: ec5520bc0055	risbg
> %r5,%r5,32,188,0 000000000021b194: b9020011		ltgr
> %r1,%r1 000000000021b198: a784ffd9		brc	8,21b14a
>            000000000021b19c: e31010000002	ltg	%r1,0(%r1)
>            000000000021b1a2: a7840012		brc	8,21b1c6
> Call Trace:
> ([<0000000000000004>] 0x4)
>  [<00000000007a7d54>] vring_alloc_queue+0x74/0x90 
>  [<00000000007a8390>] vring_create_virtqueue+0xf8/0x288 
>  [<0000000000919ec0>] virtio_ccw_find_vqs+0xf8/0x950 
>  [<000000000080772e>] init_vq+0x16e/0x318 
>  [<00000000008087c4>] virtblk_probe+0xf4/0xb58 
>  [<00000000007a62a6>] virtio_dev_probe+0x1a6/0x250 
>  [<00000000007ea498>] really_probe+0x1c8/0x290 
>  [<00000000007ea746>] driver_probe_device+0x86/0x160 
>  [<00000000007e7cba>] bus_for_each_drv+0x7a/0xc0 
>  [<00000000007ea23c>] __device_attach+0xfc/0x180 
>  [<00000000007e9116>] bus_probe_device+0xae/0xc8 
>  [<00000000007e5066>] device_add+0x3fe/0x698 
>  [<00000000007a5d92>] register_virtio_device+0xca/0x120 
>  [<00000000009195a2>] virtio_ccw_online+0x1b2/0x220 
>  [<000000000089853e>] ccw_device_set_online+0x1d6/0x4d8 
>  [<0000000000918cf6>] virtio_ccw_auto_online+0x26/0x58 
>  [<00000000001a61b6>] async_run_entry_fn+0x5e/0x158 
>  [<0000000000199322>] process_one_work+0x25a/0x668 
>  [<000000000019977a>] worker_thread+0x4a/0x428 
>  [<00000000001a1ae8>] kthread+0x150/0x170 
>  [<0000000000aeab3a>] kernel_thread_starter+0x6/0xc 
>  [<0000000000aeab34>] kernel_thread_starter+0x0/0xc 
> 
> [..]
> 
> virtio_ccw 0.0.0301: no vq
> ---[ end trace d35815958c12cad3 ]---
> virtio_ccw 0.0.0300: no vq
> virtio_blk: probe of virtio1 failed with error -12
> virtio_blk: probe of virtio3 failed with error -12
> 
> Means virtio devices broken.
> 
> Should I
> s/we have a problem if a virtio-ccw device/virtio-ccw devices do not work if the device/
> ?

Much better :)

(That is, this happens if we switch on the feature bit in the
hypervisor, right?)

> 
> > > In future we do want to support DMA API with
> > > virtio-ccw.
> > > 
> > > Let us do the plumbing, so the feature VIRTIO_F_IOMMU_PLATFORM works
> > > with virtio-ccw.
> > > 
> > > Let us also switch from legacy avail/used accessors to the DMA aware
> > > ones (even if it isn't strictly necessary).  
> > 
> > I think with this change we can remove the legacy accessors, if I
> > didn't mis-grep.
> >   
> 
> That is possible, I can do that in v1.

Sounds good.

> 
> > > 
> > > Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> > > ---
> > >  drivers/s390/virtio/virtio_ccw.c | 23 +++++++++++++++++------
> > >  1 file changed, 17 insertions(+), 6 deletions(-)

(...)

> > > @@ -1255,6 +1254,18 @@ static int virtio_ccw_online(struct ccw_device *cdev)
> > >  		ret = -ENOMEM;
> > >  		goto out_free;
> > >  	}
> > > +	vcdev->vdev.dev.parent = &cdev->dev;  
> > 
> > That one makes sense, pci and mmio are doing that as well.
> >   
> > > +	cdev->dev.dma_mask = &vcdev->dma_mask;  
> > 
> > That one feels a bit weird. Will this change in one of the follow-on
> > patches? (Have not yet looked at the whole series.)  
> 
> I don't thinks so. Do you mean this should happen within the cio code?
> I think I started out with the idea to keep the scope as narrow as
> possible. Do you have any suggestions?

From what I see, you set the mask from the virtio-ccw side, then
propagate it up to the general ccw_device, and then the generic virtio
code will fetch it from the ccw_device. Don't you potentially need
something for other ccw_devices in that protected hipervisor case as
well (e.g for 3270)?

> 
> >   
> > > +
> > > +	ret = dma_set_mask_and_coherent(&cdev->dev, DMA_BIT_MASK(64));
> > > +	if (ret)
> > > +		ret = dma_set_mask_and_coherent(&cdev->dev,
> > > +						DMA_BIT_MASK(32));
> > > +	if (ret) {
> > > +		dev_warn(&cdev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");  
> > 
> > This does not look like you'd try to continue?
> >   
> 
> I remember now. First I did continue, then I changed this to fail hard
> so I can not ignore any such problems while smoke testing ('I don't always
> check the kernel messages'), but kept the old message. This basically
> should not fail anyway, otherwise we have a problem AFAIU.
> 
> By the way virtio-pci tries to continue indeed, and this is also where
> the wording comes from ;).
> 
> What would you prefer? Try to continue or fail right away?

If it does not have a chance of working properly in the general case,
I'd fail.

> 
> Regards,
> Halil
> 
> > > +		goto out_free;
> > > +	}
> > > +
> > >  	vcdev->config_block = kzalloc(sizeof(*vcdev->config_block),
> > >  				   GFP_DMA | GFP_KERNEL);
> > >  	if (!vcdev->config_block) {  
> >   
> 

^ permalink raw reply

* Re: [PATCH net] vhost: flush dcache page when logging dirty pages
From: Michael S. Tsirkin @ 2019-04-09 13:14 UTC (permalink / raw)
  To: Jason Wang
  Cc: Andrea Arcangeli, James Bottomley, kvm, netdev, linux-kernel,
	virtualization, Christoph Hellwig
In-Reply-To: <20190409041647.21269-1-jasowang@redhat.com>

On Tue, Apr 09, 2019 at 12:16:47PM +0800, Jason Wang wrote:
> We set dirty bit through setting up kmaps and access them through
> kernel virtual address, this may result alias in virtually tagged
> caches that require a dcache flush afterwards.
> 
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")

This is like saying "everyone with vhost needs this".
In practice only might affect some architectures.
Which ones? You want to Cc the relevant maintainers
who understand this...

> Signed-off-by: Jason Wang <jasowang@redhat.com>

I am not sure this is a good idea.
The region in question is supposed to be accessed
by userspace at the same time, through atomic operations.

How do we know userspace didn't access it just before?

Is that an issue at all given we use
atomics for access? Documentation/core-api/cachetlb.rst does
not mention atomics.
Which architectures are affected?
Assuming atomics actually do need a flush, then don't we need
a flush in the other direction too? How are atomics
supposed to work at all?


I really think we need new APIs along the lines of
set_bit_to_user.

> ---
>  drivers/vhost/vhost.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 351af88231ad..34a1cedbc5ba 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -1711,6 +1711,7 @@ static int set_bit_to_user(int nr, void __user *addr)
>  	base = kmap_atomic(page);
>  	set_bit(bit, base);
>  	kunmap_atomic(base);
> +	flush_dcache_page(page);
>  	set_page_dirty_lock(page);
>  	put_page(page);
>  	return 0;

Ignoring the question of whether this actually helps, I doubt
flush_dcache_page is appropriate here.  Pls take a look at
Documentation/core-api/cachetlb.rst as well as the actual
implementation.

I think you meant flush_kernel_dcache_page, and IIUC it must happen
before kunmap, not after (which you still have the va locked).

> -- 
> 2.19.1

^ permalink raw reply

* Re: [PATCH 00/15] Share TTM code among framebuffer drivers
From: Koenig, Christian @ 2019-04-09 13:39 UTC (permalink / raw)
  To: Thomas Zimmermann, daniel@ffwll.ch, airlied@linux.ie,
	kraxel@redhat.com, Huang, Ray, Zhang, Jerry, hdegoede@redhat.com,
	z.liuxinliang@hisilicon.com, zourongrong@gmail.com,
	kong.kongxinwei@hisilicon.com, puck.chen@hisilicon.com
  Cc: dri-devel@lists.freedesktop.org,
	virtualization@lists.linux-foundation.org
In-Reply-To: <096a70a7-ed24-2161-29e9-1907221b8a64@suse.de>

Am 08.04.19 um 13:59 schrieb Thomas Zimmermann:
[SNIP]
> If not for TTM, what would be the alternative? One VMA manager per
> memory region per device?

Since everybody vital seems to be on this mail thread anyway, let's use 
it a bit for brain storming what a possible replacement for TTM should 
look like.

Well for simple drivers like qemu/bochs and cirrus the answer is to not 
use it at all. E.g. VRAM is only used for scanout and unprivileged 
userspace should not mess with it at all. In this case we don't need 
dynamic eviction and so also don't need TTM.

That leaves us with the more complex drivers, like radeon, amdgpu, 
nouveu and maybe some of the ARM based stuff, with vmwgfx being a bit 
special here.

Now I can summarize the requirements for at least the amdgpu in the 
following way:
1. We need to be able to allocate memory objects in different locations.
2. We need to be able to move around memory objects between different 
locations.
3. We need some LRU component which tells us what to evict when memory 
in a location becomes to tight.

Now for lessons learned we should at least avoid the following design 
pitfalls:
A) DON'T make it a layered design! Layers are for making cake, not software.

B) DON'T make it a "Eierlegende Wollmilchsau" (German saying). E.g. 
don't try to solve every single corner cases in one piece of software.
     Let's make components which solve one specific problem.

C) Pipeline everything! E.g. the hardware we deal with is asynchronous 
by design. Blocking for the hardware to finish in the common components 
itself is an absolutely no-go.
     If a driver wants to do something synchronous it should wait itself.

Those comments where not really intended for you Thomas, but I had to 
write them down somewhere :)

Regards,
Christian.

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC PATCH 02/12] virtio/s390: DMA support for virtio-ccw
From: Cornelia Huck @ 2019-04-09 15:47 UTC (permalink / raw)
  To: Halil Pasic
  Cc: Vasily Gorbik, linux-s390, Eric Farman, Claudio Imbrenda, kvm,
	Sebastian Ott, Farhan Ali, virtualization, Martin Schwidefsky,
	Viktor Mihajlovski, Janosch Frank
In-Reply-To: <20190409152313.0296e8f1@oc2783563651>

On Tue, 9 Apr 2019 15:23:13 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 9 Apr 2019 15:01:20 +0200
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Tue, 9 Apr 2019 13:29:27 +0200
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> > > On Tue, 9 Apr 2019 11:57:43 +0200
> > > Cornelia Huck <cohuck@redhat.com> wrote:
> > >   
> > > > On Fri,  5 Apr 2019 01:16:12 +0200
> > > > Halil Pasic <pasic@linux.ibm.com> wrote:

> > > > > @@ -1255,6 +1254,18 @@ static int virtio_ccw_online(struct ccw_device *cdev)
> > > > >  		ret = -ENOMEM;
> > > > >  		goto out_free;
> > > > >  	}
> > > > > +	vcdev->vdev.dev.parent = &cdev->dev;    
> > > > 
> > > > That one makes sense, pci and mmio are doing that as well.
> > > >     
> > > > > +	cdev->dev.dma_mask = &vcdev->dma_mask;    
> > > > 
> > > > That one feels a bit weird. Will this change in one of the follow-on
> > > > patches? (Have not yet looked at the whole series.)    
> > > 
> > > I don't thinks so. Do you mean this should happen within the cio code?
> > > I think I started out with the idea to keep the scope as narrow as
> > > possible. Do you have any suggestions?  
> > 
> > From what I see, you set the mask from the virtio-ccw side, then
> > propagate it up to the general ccw_device, and then the generic virtio
> > code will fetch it from the ccw_device.  
> 
> Right! For some reason dma_mask is a pointer. And I need virtio core to
> use a sane value for virtio_ccw devices.
> 
> > Don't you potentially need
> > something for other ccw_devices in that protected hipervisor case as
> > well (e.g for 3270)?  
> 
> 
> Maybe, maybe not. The first stage is likely to be virito only. I would
> prefer sorting out stuff like 3270 as the need arises. Also see my
> response to patch 4 (Message-Id: <20190409141114.7dcce94a@oc2783563651>).

As long as the infrastructure is flexible enough to be extended later,
ok. I still need to read that mail.

> 
> >   
> > >   
> > > >     
> > > > > +
> > > > > +	ret = dma_set_mask_and_coherent(&cdev->dev,
> > > > > DMA_BIT_MASK(64));
> > > > > +	if (ret)
> > > > > +		ret = dma_set_mask_and_coherent(&cdev->dev,
> > > > > +
> > > > > DMA_BIT_MASK(32));
> > > > > +	if (ret) {
> > > > > +		dev_warn(&cdev->dev, "Failed to enable 64-bit
> > > > > or 32-bit DMA.  Trying to continue, but this might not
> > > > > work.\n");    
> > > > 
> > > > This does not look like you'd try to continue?
> > > >     
> > > 
> > > I remember now. First I did continue, then I changed this to fail
> > > hard so I can not ignore any such problems while smoke testing ('I
> > > don't always check the kernel messages'), but kept the old message.
> > > This basically should not fail anyway, otherwise we have a problem
> > > AFAIU.
> > > 
> > > By the way virtio-pci tries to continue indeed, and this is also
> > > where the wording comes from ;).
> > > 
> > > What would you prefer? Try to continue or fail right away?  
> > 
> > If it does not have a chance of working properly in the general case,
> > I'd fail.
> >   
> 
> Agreed! I will make it so. Would dropping '  Trying to continue, but
> this might not work.' from the warning message work for you?

Sounds fine.

> 
> I could also drop the attempt to set a 32 bit mask if you agree. Do you?

Only if you also drop it from the message as well ;)

Not sure in what cases you'll fail to set a 64 bit mask, but succeed
with a 32 bit mask. If there's no sensible situation where that might
happen, I'd just go ahead and drop it.

^ permalink raw reply

* Re: [PATCH net v8] failover: allow name change on IFF_UP slave interfaces
From: Samudrala, Sridhar @ 2019-04-09 16:02 UTC (permalink / raw)
  To: Si-Wei Liu, mst, stephen, davem, kubakici, alexander.duyck, jiri,
	netdev, virtualization
  Cc: boris.ostrovsky, liran.alon
In-Reply-To: <1554767127-32576-1-git-send-email-si-wei.liu@oracle.com>

On 4/8/2019 4:45 PM, Si-Wei Liu wrote:
> When a netdev appears through hot plug then gets enslaved by a failover
> master that is already up and running, the slave will be opened
> right away after getting enslaved. Today there's a race that userspace
> (udev) may fail to rename the slave if the kernel (net_failover)
> opens the slave earlier than when the userspace rename happens.
> Unlike bond or team, the primary slave of failover can't be renamed by
> userspace ahead of time, since the kernel initiated auto-enslavement is
> unable to, or rather, is never meant to be synchronized with the rename
> request from userspace.
> 
> As the failover slave interfaces are not designed to be operated
> directly by userspace apps: IP configuration, filter rules with
> regard to network traffic passing and etc., should all be done on master
> interface. In general, userspace apps only care about the
> name of master interface, while slave names are less important as long
> as admin users can see reliable names that may carry
> other information describing the netdev. For e.g., they can infer that
> "ens3nsby" is a standby slave of "ens3", while for a
> name like "eth0" they can't tell which master it belongs to.
> 
> Historically the name of IFF_UP interface can't be changed because
> there might be admin script or management software that is already
> relying on such behavior and assumes that the slave name can't be
> changed once UP. But failover is special: with the in-kernel
> auto-enslavement mechanism, the userspace expectation for device
> enumeration and bring-up order is already broken. Previously initramfs
> and various userspace config tools were modified to bypass failover
> slaves because of auto-enslavement and duplicate MAC address. Similarly,
> in case that users care about seeing reliable slave name, the new type
> of failover slaves needs to be taken care of specifically in userspace
> anyway.
> 
> It's less risky to lift up the rename restriction on failover slave
> which is already UP. Although it's possible this change may potentially
> break userspace component (most likely configuration scripts or
> management software) that assumes slave name can't be changed while
> UP, it's relatively a limited and controllable set among all userspace
> components, which can be fixed specifically to listen for the rename
> events on failover slaves. Userspace component interacting with slaves
> is expected to be changed to operate on failover master interface
> instead, as the failover slave is dynamic in nature which may come and
> go at any point.  The goal is to make the role of failover slaves less
> relevant, and userspace components should only deal with failover master
> in the long run.
> 
> Fixes: 30c8bd5aa8b2 ("net: Introduce generic failover module")
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> Reviewed-by: Liran Alon <liran.alon@oracle.com>

Acked-by: Sridhar Samudrala <sridhar.samudrala@intel.com>

^ permalink raw reply

* Re: [PATCH net v8] failover: allow name change on IFF_UP slave interfaces
From: Michael S. Tsirkin @ 2019-04-09 16:13 UTC (permalink / raw)
  To: Si-Wei Liu
  Cc: jiri, kubakici, sridhar.samudrala, alexander.duyck,
	virtualization, liran.alon, netdev, boris.ostrovsky, davem
In-Reply-To: <1554767127-32576-1-git-send-email-si-wei.liu@oracle.com>

On Mon, Apr 08, 2019 at 07:45:27PM -0400, Si-Wei Liu wrote:
> When a netdev appears through hot plug then gets enslaved by a failover
> master that is already up and running, the slave will be opened
> right away after getting enslaved. Today there's a race that userspace
> (udev) may fail to rename the slave if the kernel (net_failover)
> opens the slave earlier than when the userspace rename happens.
> Unlike bond or team, the primary slave of failover can't be renamed by
> userspace ahead of time, since the kernel initiated auto-enslavement is
> unable to, or rather, is never meant to be synchronized with the rename
> request from userspace.
> 
> As the failover slave interfaces are not designed to be operated
> directly by userspace apps: IP configuration, filter rules with
> regard to network traffic passing and etc., should all be done on master
> interface. In general, userspace apps only care about the
> name of master interface, while slave names are less important as long
> as admin users can see reliable names that may carry
> other information describing the netdev. For e.g., they can infer that
> "ens3nsby" is a standby slave of "ens3", while for a
> name like "eth0" they can't tell which master it belongs to.
> 
> Historically the name of IFF_UP interface can't be changed because
> there might be admin script or management software that is already
> relying on such behavior and assumes that the slave name can't be
> changed once UP. But failover is special: with the in-kernel
> auto-enslavement mechanism, the userspace expectation for device
> enumeration and bring-up order is already broken. Previously initramfs
> and various userspace config tools were modified to bypass failover
> slaves because of auto-enslavement and duplicate MAC address. Similarly,
> in case that users care about seeing reliable slave name, the new type
> of failover slaves needs to be taken care of specifically in userspace
> anyway.
> 
> It's less risky to lift up the rename restriction on failover slave
> which is already UP. Although it's possible this change may potentially
> break userspace component (most likely configuration scripts or
> management software) that assumes slave name can't be changed while
> UP, it's relatively a limited and controllable set among all userspace
> components, which can be fixed specifically to listen for the rename
> events on failover slaves. Userspace component interacting with slaves
> is expected to be changed to operate on failover master interface
> instead, as the failover slave is dynamic in nature which may come and
> go at any point.  The goal is to make the role of failover slaves less
> relevant, and userspace components should only deal with failover master
> in the long run.
> 
> Fixes: 30c8bd5aa8b2 ("net: Introduce generic failover module")
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> Reviewed-by: Liran Alon <liran.alon@oracle.com>


OK let's start with that. We can always add more events.

> --
> v1 -> v2:
> - Drop configurable module parameter (Sridhar)
> 
> v2 -> v3:
> - Drop additional IFF_SLAVE_RENAME_OK flag (Sridhar)
> - Send down and up events around rename (Michael S. Tsirkin)
> 
> v3 -> v4:
> - Simplify notification to be sent (Stephen Hemminger)
> 
> v4 -> v5:
> - Sync up code with latest net-next (Sridhar)
> - Use proper structure initialization (Stephen, Jiri)
> 
> v5 -> v6:
> - Make the property of live name change a generic flag (Stephen)
> 
> v6 -> v7:
> - Remove NETDEV_CHANGE notification that is deemed unnecessary
>   (Stephen)
> 
> v7 -> v8:
> - Fix commit message that references link up/down events still
>   (Michael)
> ---
>  include/linux/netdevice.h |  3 +++
>  net/core/dev.c            | 16 +++++++++++++++-
>  net/core/failover.c       |  6 +++---
>  3 files changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 78f5ec4e..ea9a63f 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1498,6 +1498,7 @@ struct net_device_ops {
>   * @IFF_FAILOVER: device is a failover master device
>   * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
>   * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
> + * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
>   */
>  enum netdev_priv_flags {
>  	IFF_802_1Q_VLAN			= 1<<0,
> @@ -1530,6 +1531,7 @@ enum netdev_priv_flags {
>  	IFF_FAILOVER			= 1<<27,
>  	IFF_FAILOVER_SLAVE		= 1<<28,
>  	IFF_L3MDEV_RX_HANDLER		= 1<<29,
> +	IFF_LIVE_RENAME_OK		= 1<<30,
>  };
>  
>  #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
> @@ -1561,6 +1563,7 @@ enum netdev_priv_flags {
>  #define IFF_FAILOVER			IFF_FAILOVER
>  #define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
>  #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
> +#define IFF_LIVE_RENAME_OK		IFF_LIVE_RENAME_OK
>  
>  /**
>   *	struct net_device - The DEVICE structure.
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 9823b77..1622d88 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1185,7 +1185,21 @@ int dev_change_name(struct net_device *dev, const char *newname)
>  	BUG_ON(!dev_net(dev));
>  
>  	net = dev_net(dev);
> -	if (dev->flags & IFF_UP)
> +
> +	/* Some auto-enslaved devices e.g. failover slaves are
> +	 * special, as userspace might rename the device after
> +	 * the interface had been brought up and running since
> +	 * the point kernel initiated auto-enslavement. Allow
> +	 * live name change even when these slave devices are
> +	 * up and running.
> +	 *
> +	 * Typically, users of these auto-enslaving devices
> +	 * don't actually care about slave name change, as
> +	 * they are supposed to operate on master interface
> +	 * directly.
> +	 */
> +	if (dev->flags & IFF_UP &&
> +	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
>  		return -EBUSY;
>  
>  	write_seqcount_begin(&devnet_rename_seq);
> diff --git a/net/core/failover.c b/net/core/failover.c
> index 4a92a98..b5cd3c7 100644
> --- a/net/core/failover.c
> +++ b/net/core/failover.c
> @@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev)
>  		goto err_upper_link;
>  	}
>  
> -	slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
> +	slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
>  
>  	if (fops && fops->slave_register &&
>  	    !fops->slave_register(slave_dev, failover_dev))
>  		return NOTIFY_OK;
>  
>  	netdev_upper_dev_unlink(slave_dev, failover_dev);
> -	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
> +	slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
>  err_upper_link:
>  	netdev_rx_handler_unregister(slave_dev);
>  done:
> @@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev)
>  
>  	netdev_rx_handler_unregister(slave_dev);
>  	netdev_upper_dev_unlink(slave_dev, failover_dev);
> -	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
> +	slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
>  
>  	if (fops && fops->slave_unregister &&
>  	    !fops->slave_unregister(slave_dev, failover_dev))
> -- 
> 1.8.3.1

^ permalink raw reply

* Re: [RFC PATCH 04/12] s390/cio: introduce cio DMA pool
From: Cornelia Huck @ 2019-04-09 17:14 UTC (permalink / raw)
  To: Halil Pasic
  Cc: Vasily Gorbik, linux-s390, Eric Farman, Claudio Imbrenda, kvm,
	Sebastian Ott, Farhan Ali, virtualization, Martin Schwidefsky,
	Viktor Mihajlovski, Janosch Frank
In-Reply-To: <20190409141114.7dcce94a@oc2783563651>

On Tue, 9 Apr 2019 14:11:14 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 9 Apr 2019 12:44:58 +0200
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Fri,  5 Apr 2019 01:16:14 +0200
> > Halil Pasic <pasic@linux.ibm.com> wrote:

> > > @@ -886,6 +888,8 @@ static const struct attribute_group *cssdev_attr_groups[] = {
> > >  	NULL,
> > >  };
> > >  
> > > +static u64 css_dev_dma_mask = DMA_BIT_MASK(31);
> > > +
> > >  static int __init setup_css(int nr)
> > >  {
> > >  	struct channel_subsystem *css;
> > > @@ -899,6 +903,9 @@ static int __init setup_css(int nr)
> > >  	dev_set_name(&css->device, "css%x", nr);
> > >  	css->device.groups = cssdev_attr_groups;
> > >  	css->device.release = channel_subsystem_release;
> > > +	/* some cio DMA memory needs to be 31 bit addressable */
> > > +	css->device.coherent_dma_mask = DMA_BIT_MASK(31),
> > > +	css->device.dma_mask = &css_dev_dma_mask;  
> > 
> > Question: Does this percolate down to the child devices eventually?
> > E.g., you have a ccw_device getting the mask from its parent
> > subchannel, which gets it from its parent css? If so, does that clash
> > with the the mask you used for the virtio_ccw_device in a previous
> > patch? Or are they two really different things?
> >   
> 
> AFAIK id does not percolate. I could not find anything showing in this
> direction in drivers core at least. AFAIU dma_mask is also about the
> addressing limitations of the device itself (in the good old pci world,
> not really applicable for ccw devices).
> 
> Regarding virtio_ccw, I need to set the DMA stuff on the parent device
> because that is how the stuff in virtio_ring.c works. There I we can
> get away with DMA_BIT_MASK(64) as that stuff is not used in places where
> 31 bit addresses are required. For the actual ccw stuff I used the
> cio DMA pool introduced here.

Ok, so those are two different users then.

> 
> FWIW the allocation mechanisms provided by  DMA API (as documented) is
> not very well suited with what we need for ccw. This is why I choose to
> do this gen_pool thing. The gist of it is as-is at the moment we get
> page granularity allocations from DMA API. Of course we could use
> dma_pool which is kind of perfect for uniformly sized objects. As you
> will see in the rest of the series, we have a comparatively small number
> of smallish objects of varying sizes. And some of these are short lived.
> 
> So the child devices like subchannels and ccw devices do not use DMA API
> directly, except for virtio_ccw.
> 
> Does all that make any sense to you?

I think I need to read more of the series, but that should be enough
info to get me started :)

> 
> 
> > >  
> > >  	mutex_init(&css->mutex);
> > >  	css->cssid = chsc_get_cssid(nr);
> > > @@ -1018,6 +1025,55 @@ static struct notifier_block css_power_notifier = {
> > >  	.notifier_call = css_power_event,
> > >  };
> > >  
> > > +#define POOL_INIT_PAGES 1
> > > +static struct gen_pool *cio_dma_pool;
> > > +/* Currently cio supports only a single css */
> > > +static struct device *cio_dma_css;  
> > 
> > That global variable feels wrong, especially if you plan to support
> > MCSS-E in the future. (Do you? :)   
> 
> Not that I'm aware of any plans to add support MCSS-E.
> 
> > If yes, should the dma pool be global
> > or per-css? As css0 currently is the root device for the channel
> > subsystem stuff, you'd either need a new parent to hang this off from
> > or size this with the number of css images.)
> > 
> > For now, just grab channel_subsystems[0]->device directly?
> >   
> 
> Patch 6 gets rid of this variable and adds an accessor instead:
> 
> +struct device *cio_get_dma_css_dev(void)
> +{
> +	return &channel_subsystems[0]->device;
> +}
> +
> 
> I can move that here if you like (for v1).

An accessor looks more sane than a global variable, yes.

> 
> Yes it is kind of unfortunate that some pieces of this code look
> like there could be more than one css, but actually MCSS-E is not
> supported. I would prefer sorting these problems out when they arise, if
> they ever arise.

As long as it's not too unreasonable, I think we should keep the
infrastructure for multiple css instances in place.

> 
> > > +static gfp_t  cio_dma_flags;
> > > +
> > > +static void __init cio_dma_pool_init(void)
> > > +{
> > > +	void *cpu_addr;
> > > +	dma_addr_t dma_addr;
> > > +	int i;
> > > +
> > > +	cio_dma_css = &channel_subsystems[0]->device;
> > > +	cio_dma_flags = GFP_DMA | GFP_KERNEL | __GFP_ZERO;
> > > +	cio_dma_pool = gen_pool_create(3, -1);
> > > +	/* No need to free up the resources: compiled in */
> > > +	for (i = 0; i < POOL_INIT_PAGES; ++i) {
> > > +		cpu_addr = dma_alloc_coherent(cio_dma_css, PAGE_SIZE, &dma_addr,
> > > +					      cio_dma_flags);
> > > +		if (!cpu_addr)
> > > +			return;
> > > +		gen_pool_add_virt(cio_dma_pool, (unsigned long) cpu_addr,
> > > +				  dma_addr, PAGE_SIZE, -1);
> > > +	}
> > > +
> > > +}
> > > +
> > > +void *cio_dma_zalloc(size_t size)
> > > +{
> > > +	dma_addr_t dma_addr;
> > > +	unsigned long addr = gen_pool_alloc(cio_dma_pool, size);
> > > +
> > > +	if (!addr) {
> > > +		addr = (unsigned long) dma_alloc_coherent(cio_dma_css,
> > > +					PAGE_SIZE, &dma_addr, cio_dma_flags);
> > > +		if (!addr)
> > > +			return NULL;
> > > +		gen_pool_add_virt(cio_dma_pool, addr, dma_addr, PAGE_SIZE, -1);
> > > +		addr = gen_pool_alloc(cio_dma_pool, size);
> > > +	}
> > > +	return (void *) addr;  
> > 
> > At this point, you're always going via the css0 device. I'm wondering
> > whether you should pass in the cssid here and use css_by_id(cssid) to
> > make this future proof. But then, I'm not quite clear from which
> > context this will be called.  
> 
> As said before I don't see MCSS-E coming. Regarding the client code,
> please check out the rest of the series. Especially patch 6. 
> 
> From my perspective it would be at this stage saner to make it more
> obvious that only one css is supported (at the moment), than to implement
> MCSS-E, or to make this (kind of) MCSS-E ready.

I disagree. I think there's value in keeping the interfaces clean
(within reasonable bounds, of course.) Even if there is no
implementation of MCSS-E other than QEMU... it is probably a good idea
to spend some brain cycles to make this conceptually clean.

^ permalink raw reply

* Re: [RFC PATCH 03/12] s390/mm: force swiotlb for protected virtualization
From: Cornelia Huck @ 2019-04-09 17:18 UTC (permalink / raw)
  To: Halil Pasic
  Cc: Vasily Gorbik, linux-s390, Eric Farman, Claudio Imbrenda, kvm,
	Sebastian Ott, Farhan Ali, virtualization, Martin Schwidefsky,
	Viktor Mihajlovski, Janosch Frank
In-Reply-To: <20190409125416.73713f23@oc2783563651>

On Tue, 9 Apr 2019 12:54:16 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> On Tue, 9 Apr 2019 12:16:47 +0200
> Cornelia Huck <cohuck@redhat.com> wrote:
> 
> > On Fri,  5 Apr 2019 01:16:13 +0200
> > Halil Pasic <pasic@linux.ibm.com> wrote:
> >   
> > > On s390 protected virtualization guests also have to use bounce I/O
> > > buffers.  That requires some plumbing.
> > > 
> > > Let us make sure any device using DMA API accordingly is spared from the  
>                     ^,                                   ^,
> Maybe this helps...
> 
> > > problems that hypervisor attempting I/O to a non-shared secure page would
> > > bring.  
> > 
> > I have problems parsing this sentence :(
> > 
> > Do you mean that we want to exclude pages for I/O from encryption?  
> 
> The intended meaning is:
> * Devices that do use DMA API (properly) to get get/map the memory
>   that is used to talk to hypervisor should be OK with PV (protected
>   virtualizaton). I.e. for such devices PV or not PV is basically
>   transparent.
> * But if a device does not use DMA API for the memory that is used to
>   talk to the hypervisor this patch won't help.
> 
> And yes the gist of it is: memory accessed by the hypervisor needs to
> be on pages excluded from protection (which in case of PV is technically
> not encryption).
> 
> Does that help?

Hm, let me sleep on this. The original sentence was a bit too
convoluted for me...

> 
> >   
> > > 
> > > Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> > > ---
> > >  arch/s390/Kconfig                   |  4 ++++
> > >  arch/s390/include/asm/Kbuild        |  1 -
> > >  arch/s390/include/asm/dma-mapping.h | 13 +++++++++++
> > >  arch/s390/include/asm/mem_encrypt.h | 18 +++++++++++++++
> > >  arch/s390/mm/init.c                 | 44 +++++++++++++++++++++++++++++++++++++
> > >  5 files changed, 79 insertions(+), 1 deletion(-)
> > >  create mode 100644 arch/s390/include/asm/dma-mapping.h
> > >  create mode 100644 arch/s390/include/asm/mem_encrypt.h  
> > 
> > (...)
> >   
> > > @@ -126,6 +129,45 @@ void mark_rodata_ro(void)
> > >  	pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
> > >  }
> > >  
> > > +int set_memory_encrypted(unsigned long addr, int numpages)
> > > +{
> > > +	/* also called for the swiotlb bounce buffers, make all pages shared */
> > > +	/* TODO: do ultravisor calls */
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL_GPL(set_memory_encrypted);
> > > +
> > > +int set_memory_decrypted(unsigned long addr, int numpages)
> > > +{
> > > +	/* also called for the swiotlb bounce buffers, make all pages shared */
> > > +	/* TODO: do ultravisor calls */
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL_GPL(set_memory_decrypted);
> > > +
> > > +/* are we a protected virtualization guest? */
> > > +bool sev_active(void)
> > > +{
> > > +	/*
> > > +	 * TODO: Do proper detection using ultravisor, for now let us fake we
> > > +	 *  have it so the code gets exercised.  
> > 
> > That's the swiotlb stuff, right?
> >   
> 
> You mean 'That' == code to get exercised == 'swiotlb stuff'?
> 
> If yes then the answer is kind of. The swiotlb (i.e. bounce buffers) is
> when we map (like we map the buffers pointed to by the descriptors in
> case of the virtio ring). The other part of it is the memory allocated
> as DMA coherent (i.e. the virtio ring (desc, avail used) itself).

Ok.

> 
> > (The patches will obviously need some reordering before it is actually
> > getting merged.)
> >   
> 
> What do you mean by reordering?
> 
> One reason why this is an early RFC is the missing dependency (i.e. the
> stuff described by most of the TODO comments). As pointed out in the
> cover letter. Another reason is that I wanted to avoid putting a lots of
> effort into fine-polishing before clarifying the getting some feedback
> on the basics from the community. ;)

Sure. I'm just reading top-down and unconditionally enabling this is
something that obviously needs to be changed in later iterations ;)

> 
> 
> > > +	 */
> > > +	return true;
> > > +}
> > > +EXPORT_SYMBOL_GPL(sev_active);
> > > +
> > > +/* protected virtualization */
> > > +static void pv_init(void)
> > > +{
> > > +	if (!sev_active())
> > > +		return;
> > > +
> > > +	/* make sure bounce buffers are shared */
> > > +	swiotlb_init(1);
> > > +	swiotlb_update_mem_attributes();
> > > +	swiotlb_force = SWIOTLB_FORCE;
> > > +}
> > > +
> > >  void __init mem_init(void)
> > >  {
> > >  	cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
> > > @@ -134,6 +176,8 @@ void __init mem_init(void)
> > >  	set_max_mapnr(max_low_pfn);
> > >          high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
> > >  
> > > +	pv_init();
> > > +
> > >  	/* Setup guest page hinting */
> > >  	cmma_init();
> > >    
> >   
> 

^ permalink raw reply

* Re: [RFC PATCH 05/12] s390/cio: add protected virtualization support to cio
From: Cornelia Huck @ 2019-04-09 17:55 UTC (permalink / raw)
  To: Halil Pasic
  Cc: Vasily Gorbik, linux-s390, Eric Farman, Claudio Imbrenda, kvm,
	Sebastian Ott, Farhan Ali, virtualization, Martin Schwidefsky,
	Viktor Mihajlovski, Janosch Frank
In-Reply-To: <20190404231622.52531-6-pasic@linux.ibm.com>

On Fri,  5 Apr 2019 01:16:15 +0200
Halil Pasic <pasic@linux.ibm.com> wrote:

> Virtio-ccw relies on cio mechanisms for bootstrapping the ccw device.

Well, a ccw device is, by definition, using cio mechanisms ;)

Better say: "As virtio-ccw devices are channel devices, we need to use
the dma area for any communication with the hypervisor."
Or something like that.

> Thus we need to make sure any memory that is used for communication with
> the hypervisor is shared.

In this context, does 'hypervisor' always mean 'QEMU/KVM'? If Other
Hypervisors implement protected virtualization, we probably need to
make sure that all common I/O layer control blocks are in the dma area
(including e.g. QDIO), not just what virtio-ccw devices use.

> 
> Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
> ---
>  drivers/s390/cio/ccwreq.c        |  8 +++----
>  drivers/s390/cio/device.c        | 46 +++++++++++++++++++++++++++++++---------
>  drivers/s390/cio/device_fsm.c    | 40 +++++++++++++++++-----------------
>  drivers/s390/cio/device_id.c     | 18 ++++++++--------
>  drivers/s390/cio/device_ops.c    |  4 ++--
>  drivers/s390/cio/device_pgid.c   | 20 ++++++++---------
>  drivers/s390/cio/device_status.c | 24 ++++++++++-----------
>  drivers/s390/cio/io_sch.h        | 19 ++++++++++++-----
>  8 files changed, 107 insertions(+), 72 deletions(-)
> 

(...)

(just looking at which fields are moved for now)

> diff --git a/drivers/s390/cio/io_sch.h b/drivers/s390/cio/io_sch.h
> index 90e4e3a7841b..fc3220fede0f 100644
> --- a/drivers/s390/cio/io_sch.h
> +++ b/drivers/s390/cio/io_sch.h
> @@ -9,15 +9,20 @@
>  #include "css.h"
>  #include "orb.h"
>  
> +
> +struct io_subchannel_dma_area {
> +	struct ccw1 sense_ccw;	/* static ccw for sense command */

The ccw makes sense.

> +};
> +
>  struct io_subchannel_private {
>  	union orb orb;		/* operation request block */
> -	struct ccw1 sense_ccw;	/* static ccw for sense command */
>  	struct ccw_device *cdev;/* pointer to the child ccw device */
>  	struct {
>  		unsigned int suspend:1;	/* allow suspend */
>  		unsigned int prefetch:1;/* deny prefetch */
>  		unsigned int inter:1;	/* suppress intermediate interrupts */
>  	} __packed options;
> +	struct io_subchannel_dma_area *dma_area;
>  } __aligned(8);
>  
>  #define to_io_private(n) ((struct io_subchannel_private *) \
> @@ -115,6 +120,13 @@ enum cdev_todo {
>  #define FAKE_CMD_IRB	1
>  #define FAKE_TM_IRB	2
>  
> +struct ccw_device_dma_area {
> +	struct senseid senseid;	/* SenseID info */
> +	struct ccw1 iccws[2];	/* ccws for SNID/SID/SPGID commands */
> +	struct irb irb;		/* device status */
> +	struct pgid pgid[8];	/* path group IDs per chpid*/

Again, ccws, and blocks that will be written by the hypervisor, which
makes sense as well.

> +};
> +
>  struct ccw_device_private {
>  	struct ccw_device *cdev;
>  	struct subchannel *sch;
> @@ -156,11 +168,7 @@ struct ccw_device_private {
>  	} __attribute__((packed)) flags;
>  	unsigned long intparm;	/* user interruption parameter */
>  	struct qdio_irq *qdio_data;
> -	struct irb irb;		/* device status */
>  	int async_kill_io_rc;
> -	struct senseid senseid;	/* SenseID info */
> -	struct pgid pgid[8];	/* path group IDs per chpid*/
> -	struct ccw1 iccws[2];	/* ccws for SNID/SID/SPGID commands */
>  	struct work_struct todo_work;
>  	enum cdev_todo todo;
>  	wait_queue_head_t wait_q;
> @@ -169,6 +177,7 @@ struct ccw_device_private {
>  	struct list_head cmb_list;	/* list of measured devices */
>  	u64 cmb_start_time;		/* clock value of cmb reset */
>  	void *cmb_wait;			/* deferred cmb enable/disable */
> +	struct ccw_device_dma_area *dma_area;
>  	enum interruption_class int_class;
>  };
>  

So, this leaves some things I'm not sure about, especially as I do not
know the architecture of this new feature.

- This applies only to asynchronously handled things, it seems? So
  things like control blocks modified by stsch/msch/etc does not need
  special treatment?
- What about channel measurements? Or are they not supported?
- What about CHSCs? Or would only asynchronous commands (which we
  currently don't implement in QEMU) need special treatment?

^ permalink raw reply

* [PATCH v5 0/6] virtio pmem driver
From: Pankaj Gupta @ 2019-04-10  4:08 UTC (permalink / raw)
  To: linux-nvdimm, linux-kernel, virtualization, kvm, linux-fsdevel,
	linux-acpi, qemu-devel, linux-ext4, linux-xfs
  Cc: pagupta, jack, mst, david, lcapitulino, adilger.kernel, zwisler,
	aarcange, dave.jiang, darrick.wong, vishal.l.verma, willy, hch,
	jmoyer, nilal, lenb, kilobyte, riel, yuval.shaia, stefanha,
	pbonzini, dan.j.williams, tytso, xiaoguangrong.eric, cohuck, rjw,
	imammedo

 This patch series has implementation for "virtio pmem". 
 "virtio pmem" is fake persistent memory(nvdimm) in guest 
 which allows to bypass the guest page cache. This also
 implements a VIRTIO based asynchronous flush mechanism.  
 
 Sharing guest kernel driver in this patchset with the 
 changes suggested in v4. Tested with Qemu side device 
 emulation [6] for virtio-pmem. 

 We have incorporated all the suggestions in V4. Documented 
 the impact of possible page cache side channel attacks with 
 suggested countermeasures.
 
 Details of project idea for 'virtio pmem' flushing interface 
 is shared [3] & [4].

 Implementation is divided into two parts:
 New virtio pmem guest driver and qemu code changes for new 
 virtio pmem paravirtualized device.

1. Guest virtio-pmem kernel driver
---------------------------------
   - Reads persistent memory range from paravirt device and 
     registers with 'nvdimm_bus'.  
   - 'nvdimm/pmem' driver uses this information to allocate 
     persistent memory region and setup filesystem operations 
     to the allocated memory. 
   - virtio pmem driver implements asynchronous flushing 
     interface to flush from guest to host.

2. Qemu virtio-pmem device
---------------------------------
   - Creates virtio pmem device and exposes a memory range to 
     KVM guest. 
   - At host side this is file backed memory which acts as 
     persistent memory. 
   - Qemu side flush uses aio thread pool API's and virtio 
     for asynchronous guest multi request handling. 

   David Hildenbrand CCed also posted a modified version[7] of 
   qemu virtio-pmem code based on updated Qemu memory device API. 

 Virtio-pmem security implications and countermeasures:
 -----------------------------------------------------

 In previous posting of kernel driver, there was discussion [9]
 on possible implications of page cache side channel attacks with 
 virtio pmem. After thorough analysis of details of known side 
 channel attacks, below are the suggestions:

 - Depends entirely on how host backing image file is mapped 
   into guest address space. 

 - virtio-pmem device emulation, by default shared mapping is used
   to map host backing file. It is recommended to use separate
   backing file at host side for every guest. This will prevent
   any possibility of executing common code from multiple guests
   and any chance of inferring guest local data based based on 
   execution time.

 - If backing file is required to be shared among multiple guests 
   it is recommended to don't support host page cache eviction 
   commands from the guest driver. This will avoid any possibility
   of inferring guest local data or host data from another guest. 

 - Proposed device specification [8] for virtio-pmem device with 
   details of possible security implications and suggested 
   countermeasures for device emulation.

 Virtio-pmem errors handling:
 ----------------------------------------
  Checked behaviour of virtio-pmem for below types of errors
  Need suggestions on expected behaviour for handling these errors?

  - Hardware Errors: Uncorrectable recoverable Errors: 
  a] virtio-pmem: 
    - As per current logic if error page belongs to Qemu process, 
      host MCE handler isolates(hwpoison) that page and send SIGBUS. 
      Qemu SIGBUS handler injects exception to KVM guest. 
    - KVM guest then isolates the page and send SIGBUS to guest 
      userspace process which has mapped the page. 
  
  b] Existing implementation for ACPI pmem driver: 
    - Handles such errors with MCE notifier and creates a list 
      of bad blocks. Read/direct access DAX operation return EIO 
      if accessed memory page fall in bad block list.
    - It also starts backgound scrubbing.  
    - Similar functionality can be reused in virtio-pmem with MCE 
      notifier but without scrubbing(no ACPI/ARS)? Need inputs to 
      confirm if this behaviour is ok or needs any change?

Changes from PATCH v4: [1]
- Factor out MAP_SYNC supported functionality to a common helper
				[Dave, Darrick, Jan]
- Comment, indentation and virtqueue_kick failure handle - Yuval Shaia

Changes from PATCH v3: [2]
- Use generic dax_synchronous() helper to check for DAXDEV_SYNC 
  flag - [Dan, Darrick, Jan]
- Add 'is_nvdimm_async' function
- Document page cache side channel attacks implications & 
  countermeasures - [Dave Chinner, Michael]

Changes from PATCH v2: 
- Disable MAP_SYNC for ext4 & XFS filesystems - [Dan] 
- Use name 'virtio pmem' in place of 'fake dax' 

Changes from PATCH v1: 
- 0-day build test for build dependency on libnvdimm 

 Changes suggested by - [Dan Williams]
- Split the driver into two parts virtio & pmem  
- Move queuing of async block request to block layer
- Add "sync" parameter in nvdimm_flush function
- Use indirect call for nvdimm_flush
- Don’t move declarations to common global header e.g nd.h
- nvdimm_flush() return 0 or -EIO if it fails
- Teach nsio_rw_bytes() that the flush can fail
- Rename nvdimm_flush() to generic_nvdimm_flush()
- Use 'nd_region->provider_data' for long dereferencing
- Remove virtio_pmem_freeze/restore functions
- Remove BSD license text with SPDX license text

- Add might_sleep() in virtio_pmem_flush - [Luiz]
- Make spin_lock_irqsave() narrow

Changes from RFC v3
- Rebase to latest upstream - Luiz
- Call ndregion->flush in place of nvdimm_flush- Luiz
- kmalloc return check - Luiz
- virtqueue full handling - Stefan
- Don't map entire virtio_pmem_req to device - Stefan
- request leak, correct sizeof req- Stefan
- Move declaration to virtio_pmem.c

Changes from RFC v2:
- Add flush function in the nd_region in place of switching
  on a flag - Dan & Stefan
- Add flush completion function with proper locking and wait
  for host side flush completion - Stefan & Dan
- Keep userspace API in uapi header file - Stefan, MST
- Use LE fields & New device id - MST
- Indentation & spacing suggestions - MST & Eric
- Remove extra header files & add licensing - Stefan

Changes from RFC v1:
- Reuse existing 'pmem' code for registering persistent 
  memory and other operations instead of creating an entirely 
  new block driver.
- Use VIRTIO driver to register memory information with 
  nvdimm_bus and create region_type accordingly. 
- Call VIRTIO flush from existing pmem driver.

Pankaj Gupta (6):
   libnvdimm: nd_region flush callback support
   virtio-pmem: Add virtio-pmem guest driver
   libnvdimm: add nd_region buffered dax_dev flag
   dax: check synchronous mapping is supported
   ext4: disable map_sync for virtio pmem
   xfs: disable map_sync for virtio pmem

[1] https://lkml.org/lkml/2019/4/3/394
[2] https://lkml.org/lkml/2019/1/9/471
[3] https://www.spinics.net/lists/kvm/msg149761.html
[4] https://www.spinics.net/lists/kvm/msg153095.html  
[5] https://lkml.org/lkml/2018/8/31/413
[6] https://marc.info/?l=linux-kernel&m=153572228719237&w=2 
[7] https://marc.info/?l=qemu-devel&m=153555721901824&w=2
[8] https://lists.oasis-open.org/archives/virtio-dev/201903/msg00083.html
[9] https://lkml.org/lkml/2019/1/9/1191

 drivers/acpi/nfit/core.c         |    4 -
 drivers/dax/bus.c                |    2 
 drivers/dax/super.c              |   13 +++-
 drivers/md/dm.c                  |    2 
 drivers/nvdimm/claim.c           |    6 +
 drivers/nvdimm/nd.h              |    1 
 drivers/nvdimm/pmem.c            |   17 +++--
 drivers/nvdimm/region_devs.c     |   45 +++++++++++++-
 drivers/nvdimm/virtio_pmem.c     |   88 +++++++++++++++++++++++++++
 drivers/virtio/Kconfig           |   10 +++
 drivers/virtio/Makefile          |    1 
 drivers/virtio/pmem.c            |  124 +++++++++++++++++++++++++++++++++++++++
 fs/ext4/file.c                   |   11 +--
 fs/xfs/xfs_file.c                |   10 +--
 include/linux/dax.h              |   32 +++++++++-
 include/linux/libnvdimm.h        |    9 ++
 include/linux/virtio_pmem.h      |   60 ++++++++++++++++++
 include/uapi/linux/virtio_ids.h  |    1 
 include/uapi/linux/virtio_pmem.h |   10 +++
 19 files changed, 419 insertions(+), 27 deletions(-)

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [PATCH v5 1/6] libnvdimm: nd_region flush callback support
From: Pankaj Gupta @ 2019-04-10  4:08 UTC (permalink / raw)
  To: linux-nvdimm, linux-kernel, virtualization, kvm, linux-fsdevel,
	linux-acpi, qemu-devel, linux-ext4, linux-xfs
  Cc: pagupta, jack, mst, david, lcapitulino, adilger.kernel, zwisler,
	aarcange, dave.jiang, darrick.wong, vishal.l.verma, willy, hch,
	jmoyer, nilal, lenb, kilobyte, riel, yuval.shaia, stefanha,
	pbonzini, dan.j.williams, tytso, xiaoguangrong.eric, cohuck, rjw,
	imammedo
In-Reply-To: <20190410040826.24371-1-pagupta@redhat.com>

This patch adds functionality to perform flush from guest
to host over VIRTIO. We are registering a callback based
on 'nd_region' type. virtio_pmem driver requires this special
flush function. For rest of the region types we are registering
existing flush function. Report error returned by host fsync
failure to userspace.

This also handles asynchronous flush requests from the block layer
by creating a child bio and chaining it with parent bio.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
---
 drivers/acpi/nfit/core.c     |  4 ++--
 drivers/nvdimm/claim.c       |  6 ++++--
 drivers/nvdimm/nd.h          |  1 +
 drivers/nvdimm/pmem.c        | 14 ++++++++-----
 drivers/nvdimm/region_devs.c | 38 ++++++++++++++++++++++++++++++++++--
 include/linux/libnvdimm.h    |  8 +++++++-
 6 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 5a389a4f4f65..567017a2190e 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2434,7 +2434,7 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
 		offset = to_interleave_offset(offset, mmio);
 
 	writeq(cmd, mmio->addr.base + offset);
-	nvdimm_flush(nfit_blk->nd_region);
+	nvdimm_flush(nfit_blk->nd_region, NULL, false);
 
 	if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
 		readq(mmio->addr.base + offset);
@@ -2483,7 +2483,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
 	}
 
 	if (rw)
-		nvdimm_flush(nfit_blk->nd_region);
+		nvdimm_flush(nfit_blk->nd_region, NULL, false);
 
 	rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
 	return rc;
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index fb667bf469c7..a1dfa066786b 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -263,7 +263,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
 	sector_t sector = offset >> 9;
-	int rc = 0;
+	int rc = 0, ret = 0;
 
 	if (unlikely(!size))
 		return 0;
@@ -301,7 +301,9 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	}
 
 	memcpy_flushcache(nsio->addr + offset, buf, size);
-	nvdimm_flush(to_nd_region(ndns->dev.parent));
+	ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL, false);
+	if (ret)
+		rc = ret;
 
 	return rc;
 }
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index a5ac3b240293..916cd6c5451a 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -159,6 +159,7 @@ struct nd_region {
 	struct badblocks bb;
 	struct nd_interleave_set *nd_set;
 	struct nd_percpu_lane __percpu *lane;
+	int (*flush)(struct nd_region *nd_region);
 	struct nd_mapping mapping[0];
 };
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index bc2f700feef8..5a5b3ea4d073 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -192,6 +192,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 {
+	int ret = 0;
 	blk_status_t rc = 0;
 	bool do_acct;
 	unsigned long start;
@@ -201,7 +202,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	struct nd_region *nd_region = to_region(pmem);
 
 	if (bio->bi_opf & REQ_PREFLUSH)
-		nvdimm_flush(nd_region);
+		ret = nvdimm_flush(nd_region, bio, true);
 
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
@@ -216,7 +217,10 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 		nd_iostat_end(bio, start);
 
 	if (bio->bi_opf & REQ_FUA)
-		nvdimm_flush(nd_region);
+		ret = nvdimm_flush(nd_region, bio, true);
+
+	if (ret)
+		bio->bi_status = errno_to_blk_status(ret);
 
 	bio_endio(bio);
 	return BLK_QC_T_NONE;
@@ -463,13 +467,13 @@ static int pmem_attach_disk(struct device *dev,
 	disk->bb = &pmem->bb;
 
 	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
+
 	if (!dax_dev) {
 		put_disk(disk);
 		return -ENOMEM;
 	}
 	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
 	pmem->dax_dev = dax_dev;
-
 	gendev = disk_to_dev(disk);
 	gendev->groups = pmem_attribute_groups;
 
@@ -527,14 +531,14 @@ static int nd_pmem_remove(struct device *dev)
 		sysfs_put(pmem->bb_state);
 		pmem->bb_state = NULL;
 	}
-	nvdimm_flush(to_nd_region(dev->parent));
+	nvdimm_flush(to_nd_region(dev->parent), NULL, false);
 
 	return 0;
 }
 
 static void nd_pmem_shutdown(struct device *dev)
 {
-	nvdimm_flush(to_nd_region(dev->parent));
+	nvdimm_flush(to_nd_region(dev->parent), NULL, false);
 }
 
 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index b4ef7d9ff22e..fb1041ab32a6 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -295,7 +295,9 @@ static ssize_t deep_flush_store(struct device *dev, struct device_attribute *att
 		return rc;
 	if (!flush)
 		return -EINVAL;
-	nvdimm_flush(nd_region);
+	rc = nvdimm_flush(nd_region, NULL, false);
+	if (rc)
+		return rc;
 
 	return len;
 }
@@ -1085,6 +1087,11 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	dev->of_node = ndr_desc->of_node;
 	nd_region->ndr_size = resource_size(ndr_desc->res);
 	nd_region->ndr_start = ndr_desc->res->start;
+	if (ndr_desc->flush)
+		nd_region->flush = ndr_desc->flush;
+	else
+		nd_region->flush = generic_nvdimm_flush;
+
 	nd_device_register(dev);
 
 	return nd_region;
@@ -1125,11 +1132,36 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 }
 EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
 
+int nvdimm_flush(struct nd_region *nd_region, struct bio *bio, bool async)
+{
+	int rc = 0;
+
+	/* Create child bio for asynchronous flush and chain with
+	 * parent bio. Otherwise directly call nd_region flush.
+	 */
+	if (async && bio->bi_iter.bi_sector != -1) {
+
+		struct bio *child = bio_alloc(GFP_ATOMIC, 0);
+
+		if (!child)
+			return -ENOMEM;
+		bio_copy_dev(child, bio);
+		child->bi_opf = REQ_PREFLUSH;
+		child->bi_iter.bi_sector = -1;
+		bio_chain(child, bio);
+		submit_bio(child);
+	} else {
+		if (nd_region->flush(nd_region))
+			rc = -EIO;
+	}
+
+	return rc;
+}
 /**
  * nvdimm_flush - flush any posted write queues between the cpu and pmem media
  * @nd_region: blk or interleaved pmem region
  */
-void nvdimm_flush(struct nd_region *nd_region)
+int generic_nvdimm_flush(struct nd_region *nd_region)
 {
 	struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
 	int i, idx;
@@ -1153,6 +1185,8 @@ void nvdimm_flush(struct nd_region *nd_region)
 		if (ndrd_get_flush_wpq(ndrd, i, 0))
 			writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
 	wmb();
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nvdimm_flush);
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index feb342d026f2..d9d2ab8a6e64 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -65,6 +65,9 @@ enum {
 	 */
 	ND_REGION_PERSIST_MEMCTRL = 2,
 
+	/* Platform provides asynchronous flush mechanism */
+	ND_REGION_ASYNC = 3,
+
 	/* mark newly adjusted resources as requiring a label update */
 	DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
@@ -121,6 +124,7 @@ struct nd_mapping_desc {
 	int position;
 };
 
+struct nd_region;
 struct nd_region_desc {
 	struct resource *res;
 	struct nd_mapping_desc *mapping;
@@ -133,6 +137,7 @@ struct nd_region_desc {
 	int target_node;
 	unsigned long flags;
 	struct device_node *of_node;
+	int (*flush)(struct nd_region *nd_region);
 };
 
 struct device;
@@ -260,7 +265,8 @@ unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr);
 unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
 void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
-void nvdimm_flush(struct nd_region *nd_region);
+int nvdimm_flush(struct nd_region *nd_region, struct bio *bio, bool async);
+int generic_nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
 int nvdimm_has_cache(struct nd_region *nd_region);
 int nvdimm_in_overwrite(struct nvdimm *nvdimm);
-- 
2.20.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox