All of lore.kernel.org
 help / color / mirror / Atom feed
From: zhoucm1 <david1.zhou-5C7GfCeVMHo@public.gmane.org>
To: "Marek Olšák" <maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,
	mesa-dev-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Subject: Re: [Mesa-dev] [RFC PATCH] radeonsi: set a per-buffer flag that disables inter-process sharing (v2)
Date: Wed, 19 Jul 2017 15:35:51 +0800	[thread overview]
Message-ID: <596F0BD7.3070805@amd.com> (raw)
In-Reply-To: <1500408524-925-1-git-send-email-maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>



On 2017年07月19日 04:08, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak@amd.com>
>
> For lower overhead in the CS ioctl.
> Winsys allocators are not used with interprocess-sharable resources.
Hi Marek,

Could I know from how your this way reduces overhead in CS ioctl? 
reusing BO to short bo list?

Thanks,
David Zhou
>
> v2: It shouldn't crash anymore, but the kernel will reject the new flag.
> ---
>   src/gallium/drivers/radeon/r600_buffer_common.c |  7 +++++
>   src/gallium/drivers/radeon/radeon_winsys.h      | 20 +++++++++++---
>   src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 36 ++++++++++++++++---------
>   src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 27 +++++++++++--------
>   4 files changed, 62 insertions(+), 28 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
> index dd1c209..2747ac4 100644
> --- a/src/gallium/drivers/radeon/r600_buffer_common.c
> +++ b/src/gallium/drivers/radeon/r600_buffer_common.c
> @@ -160,20 +160,27 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
>   	}
>   
>   	/* Tiled textures are unmappable. Always put them in VRAM. */
>   	if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
>   	    res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) {
>   		res->domains = RADEON_DOMAIN_VRAM;
>   		res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
>   			 RADEON_FLAG_GTT_WC;
>   	}
>   
> +	/* Only displayable single-sample textures can be shared between
> +	 * processes. */
> +	if (res->b.b.target == PIPE_BUFFER ||
> +	    res->b.b.nr_samples >= 2 ||
> +	    rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY)
> +		res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
> +
>   	/* If VRAM is just stolen system memory, allow both VRAM and
>   	 * GTT, whichever has free space. If a buffer is evicted from
>   	 * VRAM to GTT, it will stay there.
>   	 *
>   	 * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only
>   	 * placements even with a low amount of stolen VRAM.
>   	 */
>   	if (!rscreen->info.has_dedicated_vram &&
>   	    (rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) &&
>   	    res->domains == RADEON_DOMAIN_VRAM) {
> diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
> index 351edcd..0abcb56 100644
> --- a/src/gallium/drivers/radeon/radeon_winsys.h
> +++ b/src/gallium/drivers/radeon/radeon_winsys.h
> @@ -47,20 +47,21 @@ enum radeon_bo_domain { /* bitfield */
>       RADEON_DOMAIN_GTT  = 2,
>       RADEON_DOMAIN_VRAM = 4,
>       RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT
>   };
>   
>   enum radeon_bo_flag { /* bitfield */
>       RADEON_FLAG_GTT_WC =        (1 << 0),
>       RADEON_FLAG_NO_CPU_ACCESS = (1 << 1),
>       RADEON_FLAG_NO_SUBALLOC =   (1 << 2),
>       RADEON_FLAG_SPARSE =        (1 << 3),
> +    RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4),
>   };
>   
>   enum radeon_bo_usage { /* bitfield */
>       RADEON_USAGE_READ = 2,
>       RADEON_USAGE_WRITE = 4,
>       RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,
>   
>       /* The winsys ensures that the CS submission will be scheduled after
>        * previously flushed CSs referencing this BO in a conflicting way.
>        */
> @@ -685,28 +686,33 @@ static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap hea
>       default:
>           assert(0);
>           return (enum radeon_bo_domain)0;
>       }
>   }
>   
>   static inline unsigned radeon_flags_from_heap(enum radeon_heap heap)
>   {
>       switch (heap) {
>       case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
> -        return RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS;
> +        return RADEON_FLAG_GTT_WC |
> +               RADEON_FLAG_NO_CPU_ACCESS |
> +               RADEON_FLAG_NO_INTERPROCESS_SHARING;
> +
>       case RADEON_HEAP_VRAM:
>       case RADEON_HEAP_VRAM_GTT:
>       case RADEON_HEAP_GTT_WC:
> -        return RADEON_FLAG_GTT_WC;
> +        return RADEON_FLAG_GTT_WC |
> +               RADEON_FLAG_NO_INTERPROCESS_SHARING;
> +
>       case RADEON_HEAP_GTT:
>       default:
> -        return 0;
> +        return RADEON_FLAG_NO_INTERPROCESS_SHARING;
>       }
>   }
>   
>   /* The pb cache bucket is chosen to minimize pb_cache misses.
>    * It must be between 0 and 3 inclusive.
>    */
>   static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
>   {
>       switch (heap) {
>       case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
> @@ -724,22 +730,28 @@ static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
>   
>   /* Return the heap index for winsys allocators, or -1 on failure. */
>   static inline int radeon_get_heap_index(enum radeon_bo_domain domain,
>                                           enum radeon_bo_flag flags)
>   {
>       /* VRAM implies WC (write combining) */
>       assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
>       /* NO_CPU_ACCESS implies VRAM only. */
>       assert(!(flags & RADEON_FLAG_NO_CPU_ACCESS) || domain == RADEON_DOMAIN_VRAM);
>   
> +    /* Resources with interprocess sharing don't use any winsys allocators. */
> +    if (!(flags & RADEON_FLAG_NO_INTERPROCESS_SHARING))
> +        return -1;
> +
>       /* Unsupported flags: NO_SUBALLOC, SPARSE. */
> -    if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS))
> +    if (flags & ~(RADEON_FLAG_GTT_WC |
> +                  RADEON_FLAG_NO_CPU_ACCESS |
> +                  RADEON_FLAG_NO_INTERPROCESS_SHARING))
>           return -1;
>   
>       switch (domain) {
>       case RADEON_DOMAIN_VRAM:
>           if (flags & RADEON_FLAG_NO_CPU_ACCESS)
>               return RADEON_HEAP_VRAM_NO_CPU_ACCESS;
>           else
>               return RADEON_HEAP_VRAM;
>       case RADEON_DOMAIN_VRAM_GTT:
>           return RADEON_HEAP_VRAM_GTT;
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> index 97bbe23..06b8198 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> @@ -31,20 +31,24 @@
>   
>   #include "amdgpu_cs.h"
>   
>   #include "os/os_time.h"
>   #include "state_tracker/drm_driver.h"
>   #include <amdgpu_drm.h>
>   #include <xf86drm.h>
>   #include <stdio.h>
>   #include <inttypes.h>
>   
> +#ifndef AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING
> +#define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)
> +#endif
> +
>   /* Set to 1 for verbose output showing committed sparse buffer ranges. */
>   #define DEBUG_SPARSE_COMMITS 0
>   
>   struct amdgpu_sparse_backing_chunk {
>      uint32_t begin, end;
>   };
>   
>   static struct pb_buffer *
>   amdgpu_bo_create(struct radeon_winsys *rws,
>                    uint64_t size,
> @@ -395,20 +399,22 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
>   
>      if (initial_domain & RADEON_DOMAIN_VRAM)
>         request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
>      if (initial_domain & RADEON_DOMAIN_GTT)
>         request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
>   
>      if (flags & RADEON_FLAG_NO_CPU_ACCESS)
>         request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
>      if (flags & RADEON_FLAG_GTT_WC)
>         request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
> +   if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
> +      request.flags |= AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING;
>   
>      r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
>      if (r) {
>         fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
>         fprintf(stderr, "amdgpu:    size      : %"PRIu64" bytes\n", size);
>         fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
>         fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
>         goto error_bo_alloc;
>      }
>   
> @@ -1127,21 +1133,21 @@ static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
>   
>   static struct pb_buffer *
>   amdgpu_bo_create(struct radeon_winsys *rws,
>                    uint64_t size,
>                    unsigned alignment,
>                    enum radeon_bo_domain domain,
>                    enum radeon_bo_flag flags)
>   {
>      struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>      struct amdgpu_winsys_bo *bo;
> -   unsigned usage = 0, pb_cache_bucket;
> +   unsigned usage = 0, pb_cache_bucket = 0;
>   
>      /* VRAM implies WC. This is not optional. */
>      assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
>   
>      /* NO_CPU_ACCESS is valid with VRAM only. */
>      assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
>   
>      /* Sub-allocate small buffers from slabs. */
>      if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
>          size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
> @@ -1182,48 +1188,52 @@ no_slab:
>      /* This flag is irrelevant for the cache. */
>      flags &= ~RADEON_FLAG_NO_SUBALLOC;
>   
>      /* Align size to page size. This is the minimum alignment for normal
>       * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>       * like constant/uniform buffers, can benefit from better and more reuse.
>       */
>      size = align64(size, ws->info.gart_page_size);
>      alignment = align(alignment, ws->info.gart_page_size);
>   
> -   int heap = radeon_get_heap_index(domain, flags);
> -   assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
> -   usage = 1 << heap; /* Only set one usage bit for each heap. */
> +   bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
>   
> -   pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
> -   assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
> +   if (use_reusable_pool) {
> +       int heap = radeon_get_heap_index(domain, flags);
> +       assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
> +       usage = 1 << heap; /* Only set one usage bit for each heap. */
>   
> -   /* Get a buffer from the cache. */
> -   bo = (struct amdgpu_winsys_bo*)
> -        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
> -                                pb_cache_bucket);
> -   if (bo)
> -      return &bo->base;
> +       pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
> +       assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
> +
> +       /* Get a buffer from the cache. */
> +       bo = (struct amdgpu_winsys_bo*)
> +            pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
> +                                    pb_cache_bucket);
> +       if (bo)
> +          return &bo->base;
> +   }
>   
>      /* Create a new one. */
>      bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>                            pb_cache_bucket);
>      if (!bo) {
>         /* Clear the cache and try again. */
>         pb_slabs_reclaim(&ws->bo_slabs);
>         pb_cache_release_all_buffers(&ws->bo_cache);
>         bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>                               pb_cache_bucket);
>         if (!bo)
>            return NULL;
>      }
>   
> -   bo->u.real.use_reusable_pool = true;
> +   bo->u.real.use_reusable_pool = use_reusable_pool;
>      return &bo->base;
>   }
>   
>   static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
>                                                  struct winsys_handle *whandle,
>                                                  unsigned *stride,
>                                                  unsigned *offset)
>   {
>      struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>      struct amdgpu_winsys_bo *bo;
> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
> index 8027a5f..15e9d38 100644
> --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
> @@ -907,21 +907,21 @@ static void radeon_bo_set_metadata(struct pb_buffer *_buf,
>   
>   static struct pb_buffer *
>   radeon_winsys_bo_create(struct radeon_winsys *rws,
>                           uint64_t size,
>                           unsigned alignment,
>                           enum radeon_bo_domain domain,
>                           enum radeon_bo_flag flags)
>   {
>       struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
>       struct radeon_bo *bo;
> -    unsigned usage = 0, pb_cache_bucket;
> +    unsigned usage = 0, pb_cache_bucket = 0;
>   
>       assert(!(flags & RADEON_FLAG_SPARSE)); /* not supported */
>   
>       /* Only 32-bit sizes are supported. */
>       if (size > UINT_MAX)
>           return NULL;
>   
>       /* VRAM implies WC. This is not optional. */
>       if (domain & RADEON_DOMAIN_VRAM)
>           flags |= RADEON_FLAG_GTT_WC;
> @@ -962,46 +962,51 @@ no_slab:
>       /* This flag is irrelevant for the cache. */
>       flags &= ~RADEON_FLAG_NO_SUBALLOC;
>   
>       /* Align size to page size. This is the minimum alignment for normal
>        * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>        * like constant/uniform buffers, can benefit from better and more reuse.
>        */
>       size = align(size, ws->info.gart_page_size);
>       alignment = align(alignment, ws->info.gart_page_size);
>   
> -    int heap = radeon_get_heap_index(domain, flags);
> -    assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
> -    usage = 1 << heap; /* Only set one usage bit for each heap. */
> +    bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
>   
> -    pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
> -    assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
> +    /* Shared resources don't use cached heaps. */
> +    if (use_reusable_pool) {
> +        int heap = radeon_get_heap_index(domain, flags);
> +        assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
> +        usage = 1 << heap; /* Only set one usage bit for each heap. */
>   
> -    bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
> -                                           usage, pb_cache_bucket));
> -    if (bo)
> -        return &bo->base;
> +        pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
> +        assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
> +
> +        bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
> +                                               usage, pb_cache_bucket));
> +        if (bo)
> +            return &bo->base;
> +    }
>   
>       bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
>                             pb_cache_bucket);
>       if (!bo) {
>           /* Clear the cache and try again. */
>           if (ws->info.has_virtual_memory)
>               pb_slabs_reclaim(&ws->bo_slabs);
>           pb_cache_release_all_buffers(&ws->bo_cache);
>           bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
>                                 pb_cache_bucket);
>           if (!bo)
>               return NULL;
>       }
>   
> -    bo->u.real.use_reusable_pool = true;
> +    bo->u.real.use_reusable_pool = use_reusable_pool;
>   
>       mtx_lock(&ws->bo_handles_mutex);
>       util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
>       mtx_unlock(&ws->bo_handles_mutex);
>   
>       return &bo->base;
>   }
>   
>   static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
>                                                      void *pointer, uint64_t size)

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  parent reply	other threads:[~2017-07-19  7:35 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-07-18 20:08 [RFC PATCH] radeonsi: set a per-buffer flag that disables inter-process sharing (v2) Marek Olšák
     [not found] ` <1500408524-925-1-git-send-email-maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-07-18 21:11   ` Michel Dänzer
2017-07-18 21:32     ` Marek Olšák
2017-07-19  7:35   ` zhoucm1 [this message]
     [not found]     ` <CAAxE2A5ygzBpB-56BowZ+iwf-wgistwY4Xo-puKM=cQ2bbQyEw@mail.gmail.com>
     [not found]       ` <CAAxE2A5WYEdp7WSLB=UE82qER9aS05zfAQP+MTM6bjw+cOPJeQ@mail.gmail.com>
     [not found]         ` <CAAxE2A5WYEdp7WSLB=UE82qER9aS05zfAQP+MTM6bjw+cOPJeQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-07-19 15:34           ` [Mesa-dev] " Marek Olšák
     [not found]             ` <CAAxE2A60fT46jpntj=rCXTHrs0QXdNSFEnYdHVRuccd446+_pA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-07-20  2:20               ` zhoucm1
     [not found]                 ` <59701388.5080909-5C7GfCeVMHo@public.gmane.org>
2017-07-20 11:06                   ` Samuel Pitoiset
2017-07-20 14:59                   ` Marek Olšák
     [not found]                     ` <CAAxE2A6uFyqhWfJBNzojyMqFmk-AGyQmScwqScMjLutBQX5usA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-07-20 17:46                       ` Christian König
2017-07-21  3:14                       ` zhoucm1

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=596F0BD7.3070805@amd.com \
    --to=david1.zhou-5c7gfcevmho@public.gmane.org \
    --cc=amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org \
    --cc=maraeo-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org \
    --cc=mesa-dev-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.