All of lore.kernel.org
 help / color / mirror / Atom feed
From: Marek Szyprowski <m.szyprowski@samsung.com>
To: Ricardo Ribalda <ribalda@chromium.org>,
	Robin Murphy <robin.murphy@arm.com>,
	Christoph Hellwig <hch@lst.de>,
	auro Carvalho Chehab <mchehab@kernel.org>,
	IOMMU DRIVERS <iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,
	Linux Doc Mailing List <linux-doc@vger.kernel.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Linux Media Mailing List <linux-media@vger.kernel.org>,
	Tomasz Figa <tfiga@chromium.org>,
	Sergey Senozhatsky <senozhatsky@google.com>
Subject: Re: [PATCH v2 5/6] media: uvcvideo: Use dma_alloc_noncontiguos API
Date: Wed, 25 Nov 2020 22:30:28 +0100	[thread overview]
Message-ID: <527e4b4e-e631-adfa-e4fa-c777af82ae17@samsung.com> (raw)
In-Reply-To: <20201125203114.130967-1-ribalda@chromium.org>

Hi

On 25.11.2020 21:31, Ricardo Ribalda wrote:
> On architectures where the is no coherent caching such as ARM use the
> dma_alloc_noncontiguos API and handle manually the cache flushing using
> dma_sync_sg().
>
> With this patch on the affected architectures we can measure up to 20x
> performance improvement in uvc_video_copy_data_work().
>
> Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
> ---
>
> v2: Thanks to Robin!
>
> Use dma_sync_sg instead of dma_sync_single
>
>
>   drivers/media/usb/uvc/uvc_video.c | 83 ++++++++++++++++++++++++++-----
>   drivers/media/usb/uvc/uvcvideo.h  |  2 +
>   2 files changed, 73 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
> index a6a441d92b94..b2e6a9522999 100644
> --- a/drivers/media/usb/uvc/uvc_video.c
> +++ b/drivers/media/usb/uvc/uvc_video.c
> @@ -1490,6 +1490,11 @@ static void uvc_video_encode_bulk(struct uvc_urb *uvc_urb,
>   	urb->transfer_buffer_length = stream->urb_size - len;
>   }
>   
> +static inline struct device *stream_to_dmadev(struct uvc_streaming *stream)
> +{
> +	return stream->dev->udev->bus->controller->parent;
> +}
> +
>   static void uvc_video_complete(struct urb *urb)
>   {
>   	struct uvc_urb *uvc_urb = urb->context;
> @@ -1539,6 +1544,10 @@ static void uvc_video_complete(struct urb *urb)
>   	 * Process the URB headers, and optionally queue expensive memcpy tasks
>   	 * to be deferred to a work queue.
>   	 */
> +	if (uvc_urb->pages) {
> +		dma_sync_sg_for_cpu(stream_to_dmadev(stream), uvc_urb->sgt.sgl,
> +				    uvc_urb->sgt.nents,	DMA_FROM_DEVICE);

Please use dma_sync_sgtable_for_cpu(stream_to_dmadev(stream), 
&uvc_urb->sgt, DMA_FROM_DEVICE);

I also think that there should be a call to 
dma_sync_sgtable_for_device() before starting the potential DMA 
transfer. Just to make sure that CPU wont trash the newly captured data 
and for the API completeness.

> +	}
>   	stream->decode(uvc_urb, buf, buf_meta);
>   
>   	/* If no async work is needed, resubmit the URB immediately. */
> @@ -1566,8 +1575,16 @@ static void uvc_free_urb_buffers(struct uvc_streaming *stream)
>   			continue;
>   
>   #ifndef CONFIG_DMA_NONCOHERENT
> -		usb_free_coherent(stream->dev->udev, stream->urb_size,
> -				  uvc_urb->buffer, uvc_urb->dma);
> +		if (uvc_urb->pages) {
> +			sg_free_table(&uvc_urb->sgt);
> +			vunmap(uvc_urb->buffer);
> +			dma_free_noncontiguous(stream_to_dmadev(stream),
> +					       stream->urb_size,
> +					       uvc_urb->pages, uvc_urb->dma);
> +		} else {
> +			usb_free_coherent(stream->dev->udev, stream->urb_size,
> +					  uvc_urb->buffer, uvc_urb->dma);
> +		}
>   #else
>   		kfree(uvc_urb->buffer);
>   #endif
> @@ -1577,6 +1594,56 @@ static void uvc_free_urb_buffers(struct uvc_streaming *stream)
>   	stream->urb_size = 0;
>   }
>   
> +#ifndef CONFIG_DMA_NONCOHERENT
> +static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream,
> +				 struct uvc_urb *uvc_urb, gfp_t gfp_flags)
> +{
> +	struct device *dma_dev = dma_dev = stream_to_dmadev(stream);
> +
> +	if (!dma_can_alloc_noncontiguous(dma_dev)) {
> +		uvc_urb->buffer = usb_alloc_coherent(stream->dev->udev,
> +						     stream->urb_size,
> +						     gfp_flags | __GFP_NOWARN,
> +						     &uvc_urb->dma);
> +		return uvc_urb->buffer != NULL;
> +	}
> +
> +	uvc_urb->pages = dma_alloc_noncontiguous(dma_dev, stream->urb_size,
> +						 &uvc_urb->dma,
> +						 gfp_flags | __GFP_NOWARN, 0);
> +	if (!uvc_urb->pages)
> +		return false;
> +
> +	uvc_urb->buffer = vmap(uvc_urb->pages,
> +			       PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,
> +			       VM_DMA_COHERENT, PAGE_KERNEL);
> +	if (!uvc_urb->buffer) {
> +		dma_free_noncontiguous(dma_dev, stream->urb_size,
> +				       uvc_urb->pages, uvc_urb->dma);
> +		return false;
> +	}
> +
> +	if (sg_alloc_table_from_pages(&uvc_urb->sgt, uvc_urb->pages,
> +				PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT, 0,
> +				stream->urb_size, GFP_KERNEL)) {
> +		vunmap(uvc_urb->buffer);
> +		dma_free_noncontiguous(dma_dev, stream->urb_size,
> +				       uvc_urb->pages, uvc_urb->dma);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +#else
> +static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream,
> +				 struct uvc_urb *uvc_urb, gfp_t gfp_flags)
> +{
> +	uvc_urb->buffer = kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
> +
> +	return uvc_urb->buffer != NULL;
> +}
> +#endif
> +
>   /*
>    * Allocate transfer buffers. This function can be called with buffers
>    * already allocated when resuming from suspend, in which case it will
> @@ -1607,19 +1674,11 @@ static int uvc_alloc_urb_buffers(struct uvc_streaming *stream,
>   
>   	/* Retry allocations until one succeed. */
>   	for (; npackets > 1; npackets /= 2) {
> +		stream->urb_size = psize * npackets;
>   		for (i = 0; i < UVC_URBS; ++i) {
>   			struct uvc_urb *uvc_urb = &stream->uvc_urb[i];
>   
> -			stream->urb_size = psize * npackets;
> -#ifndef CONFIG_DMA_NONCOHERENT
> -			uvc_urb->buffer = usb_alloc_coherent(
> -				stream->dev->udev, stream->urb_size,
> -				gfp_flags | __GFP_NOWARN, &uvc_urb->dma);
> -#else
> -			uvc_urb->buffer =
> -			    kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
> -#endif
> -			if (!uvc_urb->buffer) {
> +			if (!uvc_alloc_urb_buffer(stream, uvc_urb, gfp_flags)) {
>   				uvc_free_urb_buffers(stream);
>   				break;
>   			}
> diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
> index a3dfacf069c4..3e6618a2ac82 100644
> --- a/drivers/media/usb/uvc/uvcvideo.h
> +++ b/drivers/media/usb/uvc/uvcvideo.h
> @@ -532,6 +532,8 @@ struct uvc_urb {
>   
>   	char *buffer;
>   	dma_addr_t dma;
> +	struct page **pages;
> +	struct sg_table sgt;
>   
>   	unsigned int async_operations;
>   	struct uvc_copy_op copy_operations[UVC_MAX_PACKETS];

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland


WARNING: multiple messages have this Message-ID (diff)
From: Marek Szyprowski <m.szyprowski@samsung.com>
To: Ricardo Ribalda <ribalda@chromium.org>,
	Robin Murphy <robin.murphy@arm.com>,
	Christoph Hellwig <hch@lst.de>,
	auro Carvalho Chehab <mchehab@kernel.org>,
	IOMMU DRIVERS <iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,
	Linux Doc Mailing List <linux-doc@vger.kernel.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Linux Media Mailing List <linux-media@vger.kernel.org>,
	Tomasz Figa <tfiga@chromium.org>,
	Sergey Senozhatsky <senozhatsky@google.com>
Subject: Re: [PATCH v2 5/6] media: uvcvideo: Use dma_alloc_noncontiguos API
Date: Wed, 25 Nov 2020 22:30:28 +0100	[thread overview]
Message-ID: <527e4b4e-e631-adfa-e4fa-c777af82ae17@samsung.com> (raw)
In-Reply-To: <20201125203114.130967-1-ribalda@chromium.org>

Hi

On 25.11.2020 21:31, Ricardo Ribalda wrote:
> On architectures where the is no coherent caching such as ARM use the
> dma_alloc_noncontiguos API and handle manually the cache flushing using
> dma_sync_sg().
>
> With this patch on the affected architectures we can measure up to 20x
> performance improvement in uvc_video_copy_data_work().
>
> Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
> ---
>
> v2: Thanks to Robin!
>
> Use dma_sync_sg instead of dma_sync_single
>
>
>   drivers/media/usb/uvc/uvc_video.c | 83 ++++++++++++++++++++++++++-----
>   drivers/media/usb/uvc/uvcvideo.h  |  2 +
>   2 files changed, 73 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
> index a6a441d92b94..b2e6a9522999 100644
> --- a/drivers/media/usb/uvc/uvc_video.c
> +++ b/drivers/media/usb/uvc/uvc_video.c
> @@ -1490,6 +1490,11 @@ static void uvc_video_encode_bulk(struct uvc_urb *uvc_urb,
>   	urb->transfer_buffer_length = stream->urb_size - len;
>   }
>   
> +static inline struct device *stream_to_dmadev(struct uvc_streaming *stream)
> +{
> +	return stream->dev->udev->bus->controller->parent;
> +}
> +
>   static void uvc_video_complete(struct urb *urb)
>   {
>   	struct uvc_urb *uvc_urb = urb->context;
> @@ -1539,6 +1544,10 @@ static void uvc_video_complete(struct urb *urb)
>   	 * Process the URB headers, and optionally queue expensive memcpy tasks
>   	 * to be deferred to a work queue.
>   	 */
> +	if (uvc_urb->pages) {
> +		dma_sync_sg_for_cpu(stream_to_dmadev(stream), uvc_urb->sgt.sgl,
> +				    uvc_urb->sgt.nents,	DMA_FROM_DEVICE);

Please use dma_sync_sgtable_for_cpu(stream_to_dmadev(stream), 
&uvc_urb->sgt, DMA_FROM_DEVICE);

I also think that there should be a call to 
dma_sync_sgtable_for_device() before starting the potential DMA 
transfer. Just to make sure that CPU wont trash the newly captured data 
and for the API completeness.

> +	}
>   	stream->decode(uvc_urb, buf, buf_meta);
>   
>   	/* If no async work is needed, resubmit the URB immediately. */
> @@ -1566,8 +1575,16 @@ static void uvc_free_urb_buffers(struct uvc_streaming *stream)
>   			continue;
>   
>   #ifndef CONFIG_DMA_NONCOHERENT
> -		usb_free_coherent(stream->dev->udev, stream->urb_size,
> -				  uvc_urb->buffer, uvc_urb->dma);
> +		if (uvc_urb->pages) {
> +			sg_free_table(&uvc_urb->sgt);
> +			vunmap(uvc_urb->buffer);
> +			dma_free_noncontiguous(stream_to_dmadev(stream),
> +					       stream->urb_size,
> +					       uvc_urb->pages, uvc_urb->dma);
> +		} else {
> +			usb_free_coherent(stream->dev->udev, stream->urb_size,
> +					  uvc_urb->buffer, uvc_urb->dma);
> +		}
>   #else
>   		kfree(uvc_urb->buffer);
>   #endif
> @@ -1577,6 +1594,56 @@ static void uvc_free_urb_buffers(struct uvc_streaming *stream)
>   	stream->urb_size = 0;
>   }
>   
> +#ifndef CONFIG_DMA_NONCOHERENT
> +static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream,
> +				 struct uvc_urb *uvc_urb, gfp_t gfp_flags)
> +{
> +	struct device *dma_dev = dma_dev = stream_to_dmadev(stream);
> +
> +	if (!dma_can_alloc_noncontiguous(dma_dev)) {
> +		uvc_urb->buffer = usb_alloc_coherent(stream->dev->udev,
> +						     stream->urb_size,
> +						     gfp_flags | __GFP_NOWARN,
> +						     &uvc_urb->dma);
> +		return uvc_urb->buffer != NULL;
> +	}
> +
> +	uvc_urb->pages = dma_alloc_noncontiguous(dma_dev, stream->urb_size,
> +						 &uvc_urb->dma,
> +						 gfp_flags | __GFP_NOWARN, 0);
> +	if (!uvc_urb->pages)
> +		return false;
> +
> +	uvc_urb->buffer = vmap(uvc_urb->pages,
> +			       PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,
> +			       VM_DMA_COHERENT, PAGE_KERNEL);
> +	if (!uvc_urb->buffer) {
> +		dma_free_noncontiguous(dma_dev, stream->urb_size,
> +				       uvc_urb->pages, uvc_urb->dma);
> +		return false;
> +	}
> +
> +	if (sg_alloc_table_from_pages(&uvc_urb->sgt, uvc_urb->pages,
> +				PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT, 0,
> +				stream->urb_size, GFP_KERNEL)) {
> +		vunmap(uvc_urb->buffer);
> +		dma_free_noncontiguous(dma_dev, stream->urb_size,
> +				       uvc_urb->pages, uvc_urb->dma);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +#else
> +static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream,
> +				 struct uvc_urb *uvc_urb, gfp_t gfp_flags)
> +{
> +	uvc_urb->buffer = kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
> +
> +	return uvc_urb->buffer != NULL;
> +}
> +#endif
> +
>   /*
>    * Allocate transfer buffers. This function can be called with buffers
>    * already allocated when resuming from suspend, in which case it will
> @@ -1607,19 +1674,11 @@ static int uvc_alloc_urb_buffers(struct uvc_streaming *stream,
>   
>   	/* Retry allocations until one succeed. */
>   	for (; npackets > 1; npackets /= 2) {
> +		stream->urb_size = psize * npackets;
>   		for (i = 0; i < UVC_URBS; ++i) {
>   			struct uvc_urb *uvc_urb = &stream->uvc_urb[i];
>   
> -			stream->urb_size = psize * npackets;
> -#ifndef CONFIG_DMA_NONCOHERENT
> -			uvc_urb->buffer = usb_alloc_coherent(
> -				stream->dev->udev, stream->urb_size,
> -				gfp_flags | __GFP_NOWARN, &uvc_urb->dma);
> -#else
> -			uvc_urb->buffer =
> -			    kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
> -#endif
> -			if (!uvc_urb->buffer) {
> +			if (!uvc_alloc_urb_buffer(stream, uvc_urb, gfp_flags)) {
>   				uvc_free_urb_buffers(stream);
>   				break;
>   			}
> diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
> index a3dfacf069c4..3e6618a2ac82 100644
> --- a/drivers/media/usb/uvc/uvcvideo.h
> +++ b/drivers/media/usb/uvc/uvcvideo.h
> @@ -532,6 +532,8 @@ struct uvc_urb {
>   
>   	char *buffer;
>   	dma_addr_t dma;
> +	struct page **pages;
> +	struct sg_table sgt;
>   
>   	unsigned int async_operations;
>   	struct uvc_copy_op copy_operations[UVC_MAX_PACKETS];

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland

_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

  reply	other threads:[~2020-11-25 21:31 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20201125203125eucas1p1456f09a6d130e2f015707b4e5f9dbfc1@eucas1p1.samsung.com>
2020-11-25 20:31 ` [PATCH v2 5/6] media: uvcvideo: Use dma_alloc_noncontiguos API Ricardo Ribalda
2020-11-25 20:31   ` Ricardo Ribalda
2020-11-25 21:30   ` Marek Szyprowski [this message]
2020-11-25 21:30     ` Marek Szyprowski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=527e4b4e-e631-adfa-e4fa-c777af82ae17@samsung.com \
    --to=m.szyprowski@samsung.com \
    --cc=hch@lst.de \
    --cc=iommu@lists.linux-foundation.org \
    --cc=joro@8bytes.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-media@vger.kernel.org \
    --cc=mchehab@kernel.org \
    --cc=ribalda@chromium.org \
    --cc=robin.murphy@arm.com \
    --cc=senozhatsky@google.com \
    --cc=tfiga@chromium.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.