[PATCH] drm/amdgpu: Modify the contiguous flags behaviour

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
@ 2024-04-14 14:57 Arunpravin Paneer Selvam
  2024-04-15 14:03 ` Christian König
  2024-04-15 22:02 ` Philip Yang
  0 siblings, 2 replies; 8+ messages in thread
From: Arunpravin Paneer Selvam @ 2024-04-14 14:57 UTC (permalink / raw)
  To: amd-gfx
  Cc: christian.koenig, alexander.deucher, felix.kuehling, philip.yang,
	Arunpravin Paneer Selvam

Now we have two flags for contiguous VRAM buffer allocation.
If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
buffer's placement function.

This patch will change the default behaviour of the two flags.

When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
- This means contiguous is not mandatory.
- we will try to allocate the contiguous buffer. Say if the
  allocation fails, we fallback to allocate the individual pages.

When we setTTM_PL_FLAG_CONTIGUOUS
- This means contiguous allocation is mandatory.
- we are setting this in amdgpu_bo_pin_restricted() before bo validation
  and check this flag in the vram manager file.
- if this is set, we should allocate the buffer pages contiguously.
  the allocation fails, we return -ENOSPC.

Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 8bc79924d171..41926d631563 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
 		else
 			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
 
-		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
-			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
 		c++;
 	}
 
@@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 {
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 	struct ttm_operation_ctx ctx = { false, false };
+	struct ttm_place *places = bo->placements;
+	u32 c = 0;
 	int r, i;
 
 	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
@@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 
 	if (bo->tbo.pin_count) {
 		uint32_t mem_type = bo->tbo.resource->mem_type;
-		uint32_t mem_flags = bo->tbo.resource->placement;
 
 		if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
 			return -EINVAL;
 
-		if ((mem_type == TTM_PL_VRAM) &&
-		    (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
-		    !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
-			return -EINVAL;
-
 		ttm_bo_pin(&bo->tbo);
 
 		if (max_offset != 0) {
@@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 			bo->placements[i].lpfn = lpfn;
 	}
 
+	if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
+	    !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
+		places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
+
 	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
 	if (unlikely(r)) {
 		dev_err(adev->dev, "%p pin failed\n", bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..ddbf302878f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
 	return size;
 }
 
+static inline unsigned long
+amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
+				 const struct ttm_place *place,
+				 unsigned long bo_flags)
+{
+	unsigned long pages_per_block;
+
+	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+	    place->flags & TTM_PL_FLAG_CONTIGUOUS) {
+		pages_per_block = ~0ul;
+	} else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		pages_per_block = HPAGE_PMD_NR;
+#else
+		/* default to 2MB */
+		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
+#endif
+		pages_per_block = max_t(uint32_t, pages_per_block,
+					tbo->page_alignment);
+	}
+
+	return pages_per_block;
+}
+
 /**
  * DOC: mem_info_vram_total
  *
@@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
 	struct amdgpu_device *adev = to_amdgpu_device(mgr);
 	u64 vis_usage = 0, max_bytes, min_block_size;
+	struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
 	struct amdgpu_vram_mgr_resource *vres;
 	u64 size, remaining_size, lpfn, fpfn;
+	unsigned long bo_flags = bo->flags;
 	struct drm_buddy *mm = &mgr->mm;
 	struct drm_buddy_block *block;
 	unsigned long pages_per_block;
@@ -468,18 +494,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	if (tbo->type != ttm_bo_type_kernel)
 		max_bytes -= AMDGPU_VM_RESERVED_VRAM;
 
-	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
-		pages_per_block = ~0ul;
-	} else {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		pages_per_block = HPAGE_PMD_NR;
-#else
-		/* default to 2MB */
-		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
-#endif
-		pages_per_block = max_t(uint32_t, pages_per_block,
-					tbo->page_alignment);
-	}
+	pages_per_block =
+		amdgpu_vram_find_pages_per_block(tbo, place, bo_flags);
 
 	vres = kzalloc(sizeof(*vres), GFP_KERNEL);
 	if (!vres)
@@ -498,7 +514,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	if (place->flags & TTM_PL_FLAG_TOPDOWN)
 		vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
 
-	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
+	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
+	    place->flags & TTM_PL_FLAG_CONTIGUOUS)
 		vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
 
 	if (fpfn || lpfn != mgr->mm.size)
@@ -529,8 +546,20 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 					   min_block_size,
 					   &vres->blocks,
 					   vres->flags);
-		if (unlikely(r))
+		if (unlikely(r)) {
+			if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+				/* Fallback to non contiguous allocation */
+				vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
+				bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+
+				pages_per_block =
+					amdgpu_vram_find_pages_per_block(tbo,
+									 place,
+									 bo_flags);
+				continue;
+			}
 			goto error_free_blocks;
+		}
 
 		if (size > remaining_size)
 			remaining_size = 0;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-14 14:57 [PATCH] drm/amdgpu: Modify the contiguous flags behaviour Arunpravin Paneer Selvam
@ 2024-04-15 14:03 ` Christian König
  2024-04-15 17:16   ` Paneer Selvam, Arunpravin
  2024-04-15 22:02 ` Philip Yang
  1 sibling, 1 reply; 8+ messages in thread
From: Christian König @ 2024-04-15 14:03 UTC (permalink / raw)
  To: Arunpravin Paneer Selvam, amd-gfx
  Cc: christian.koenig, alexander.deucher, felix.kuehling, philip.yang

Am 14.04.24 um 16:57 schrieb Arunpravin Paneer Selvam:
> Now we have two flags for contiguous VRAM buffer allocation.
> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
> buffer's placement function.
>
> This patch will change the default behaviour of the two flags.
>
> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
> - This means contiguous is not mandatory.
> - we will try to allocate the contiguous buffer. Say if the
>    allocation fails, we fallback to allocate the individual pages.
>
> When we setTTM_PL_FLAG_CONTIGUOUS
> - This means contiguous allocation is mandatory.
> - we are setting this in amdgpu_bo_pin_restricted() before bo validation
>    and check this flag in the vram manager file.
> - if this is set, we should allocate the buffer pages contiguously.
>    the allocation fails, we return -ENOSPC.
>
> Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com>
> Suggested-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
>   2 files changed, 49 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 8bc79924d171..41926d631563 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
>   		else
>   			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>   
> -		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
> -			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>   		c++;
>   	}
>   
> @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>   {
>   	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>   	struct ttm_operation_ctx ctx = { false, false };
> +	struct ttm_place *places = bo->placements;
> +	u32 c = 0;
>   	int r, i;
>   
>   	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
> @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>   
>   	if (bo->tbo.pin_count) {
>   		uint32_t mem_type = bo->tbo.resource->mem_type;
> -		uint32_t mem_flags = bo->tbo.resource->placement;
>   
>   		if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
>   			return -EINVAL;
>   
> -		if ((mem_type == TTM_PL_VRAM) &&
> -		    (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
> -		    !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
> -			return -EINVAL;
> -

I think that check here needs to stay.

>   		ttm_bo_pin(&bo->tbo);
>   
>   		if (max_offset != 0) {
> @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>   			bo->placements[i].lpfn = lpfn;
>   	}
>   
> +	if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
> +	    !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
> +		places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
> +

This needs to go into the loop directly above as something like this here:

If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
     bo->placements[i].mem_type == TTM_PL_VRAM)
         o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;

This essentially replaces the removed code in 
amdgpu_bo_placement_from_domain() and only applies it during pinning.

>   	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>   	if (unlikely(r)) {
>   		dev_err(adev->dev, "%p pin failed\n", bo);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 8db880244324..ddbf302878f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
>   	return size;
>   }
>   
> +static inline unsigned long
> +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
> +				 const struct ttm_place *place,
> +				 unsigned long bo_flags)

Well I think we need a better name here. "find" usually means we look up 
something in a data structure. Maybe 
amdgpu_vram_mgr_calculate_pages_per_block.

> +{
> +	unsigned long pages_per_block;
> +
> +	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
> +	    place->flags & TTM_PL_FLAG_CONTIGUOUS) {
> +		pages_per_block = ~0ul;
> +	} else {
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +		pages_per_block = HPAGE_PMD_NR;
> +#else
> +		/* default to 2MB */
> +		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
> +#endif
> +		pages_per_block = max_t(uint32_t, pages_per_block,
> +					tbo->page_alignment);
> +	}
> +
> +	return pages_per_block;
> +}
> +
>   /**
>    * DOC: mem_info_vram_total
>    *
> @@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>   	struct amdgpu_device *adev = to_amdgpu_device(mgr);
>   	u64 vis_usage = 0, max_bytes, min_block_size;
> +	struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>   	struct amdgpu_vram_mgr_resource *vres;
>   	u64 size, remaining_size, lpfn, fpfn;
> +	unsigned long bo_flags = bo->flags;
>   	struct drm_buddy *mm = &mgr->mm;
>   	struct drm_buddy_block *block;
>   	unsigned long pages_per_block;
> @@ -468,18 +494,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   	if (tbo->type != ttm_bo_type_kernel)
>   		max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>   
> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
> -		pages_per_block = ~0ul;
> -	} else {
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -		pages_per_block = HPAGE_PMD_NR;
> -#else
> -		/* default to 2MB */
> -		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
> -#endif
> -		pages_per_block = max_t(uint32_t, pages_per_block,
> -					tbo->page_alignment);
> -	}
> +	pages_per_block =
> +		amdgpu_vram_find_pages_per_block(tbo, place, bo_flags);
>   
>   	vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>   	if (!vres)
> @@ -498,7 +514,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   	if (place->flags & TTM_PL_FLAG_TOPDOWN)
>   		vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>   
> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
> +	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
> +	    place->flags & TTM_PL_FLAG_CONTIGUOUS)
>   		vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>   
>   	if (fpfn || lpfn != mgr->mm.size)
> @@ -529,8 +546,20 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>   					   min_block_size,
>   					   &vres->blocks,
>   					   vres->flags);
> -		if (unlikely(r))
> +		if (unlikely(r)) {
> +			if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
> +				/* Fallback to non contiguous allocation */
> +				vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
> +				bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
> +
> +				pages_per_block =
> +					amdgpu_vram_find_pages_per_block(tbo,
> +									 place,
> +									 bo_flags);
> +				continue;
> +			}

Let's talk about that on our weekly call. Might be a bit overkill, but 
I'm not so deep inside the code any more.

But looks quite good actually.

Regards,
Christian.

>   			goto error_free_blocks;
> +		}
>   
>   		if (size > remaining_size)
>   			remaining_size = 0;


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-15 14:03 ` Christian König
@ 2024-04-15 17:16   ` Paneer Selvam, Arunpravin
  0 siblings, 0 replies; 8+ messages in thread
From: Paneer Selvam, Arunpravin @ 2024-04-15 17:16 UTC (permalink / raw)
  To: Christian König, amd-gfx
  Cc: christian.koenig, alexander.deucher, felix.kuehling, philip.yang

Hi Christian,

On 4/15/2024 7:33 PM, Christian König wrote:
> Am 14.04.24 um 16:57 schrieb Arunpravin Paneer Selvam:
>> Now we have two flags for contiguous VRAM buffer allocation.
>> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
>> buffer's placement function.
>>
>> This patch will change the default behaviour of the two flags.
>>
>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>> - This means contiguous is not mandatory.
>> - we will try to allocate the contiguous buffer. Say if the
>>    allocation fails, we fallback to allocate the individual pages.
>>
>> When we setTTM_PL_FLAG_CONTIGUOUS
>> - This means contiguous allocation is mandatory.
>> - we are setting this in amdgpu_bo_pin_restricted() before bo validation
>>    and check this flag in the vram manager file.
>> - if this is set, we should allocate the buffer pages contiguously.
>>    the allocation fails, we return -ENOSPC.
>>
>> Signed-off-by: Arunpravin Paneer Selvam 
>> <Arunpravin.PaneerSelvam@amd.com>
>> Suggested-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
>>   2 files changed, 49 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 8bc79924d171..41926d631563 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct 
>> amdgpu_bo *abo, u32 domain)
>>           else
>>               places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>   -        if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>> -            places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>           c++;
>>       }
>>   @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
>> *bo, u32 domain,
>>   {
>>       struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>       struct ttm_operation_ctx ctx = { false, false };
>> +    struct ttm_place *places = bo->placements;
>> +    u32 c = 0;
>>       int r, i;
>>         if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
>> @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
>> *bo, u32 domain,
>>         if (bo->tbo.pin_count) {
>>           uint32_t mem_type = bo->tbo.resource->mem_type;
>> -        uint32_t mem_flags = bo->tbo.resource->placement;
>>             if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
>>               return -EINVAL;
>>   -        if ((mem_type == TTM_PL_VRAM) &&
>> -            (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
>> -            !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
>> -            return -EINVAL;
>> -
>
> I think that check here needs to stay.
>
>>           ttm_bo_pin(&bo->tbo);
>>             if (max_offset != 0) {
>> @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
>> *bo, u32 domain,
>>               bo->placements[i].lpfn = lpfn;
>>       }
>>   +    if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
>> +        !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
>> +        places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>> +
>
> This needs to go into the loop directly above as something like this 
> here:
>
> If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>     bo->placements[i].mem_type == TTM_PL_VRAM)
>         o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
>
> This essentially replaces the removed code in 
> amdgpu_bo_placement_from_domain() and only applies it during pinning.
>
>>       r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>       if (unlikely(r)) {
>>           dev_err(adev->dev, "%p pin failed\n", bo);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 8db880244324..ddbf302878f6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -88,6 +88,30 @@ static inline u64 
>> amdgpu_vram_mgr_blocks_size(struct list_head *head)
>>       return size;
>>   }
>>   +static inline unsigned long
>> +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
>> +                 const struct ttm_place *place,
>> +                 unsigned long bo_flags)
>
> Well I think we need a better name here. "find" usually means we look 
> up something in a data structure. Maybe 
> amdgpu_vram_mgr_calculate_pages_per_block.
>
>> +{
>> +    unsigned long pages_per_block;
>> +
>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>> +        place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> +        pages_per_block = ~0ul;
>> +    } else {
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +        pages_per_block = HPAGE_PMD_NR;
>> +#else
>> +        /* default to 2MB */
>> +        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>> +#endif
>> +        pages_per_block = max_t(uint32_t, pages_per_block,
>> +                    tbo->page_alignment);
>> +    }
>> +
>> +    return pages_per_block;
>> +}
>> +
>>   /**
>>    * DOC: mem_info_vram_total
>>    *
>> @@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>       u64 vis_usage = 0, max_bytes, min_block_size;
>> +    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>>       struct amdgpu_vram_mgr_resource *vres;
>>       u64 size, remaining_size, lpfn, fpfn;
>> +    unsigned long bo_flags = bo->flags;
>>       struct drm_buddy *mm = &mgr->mm;
>>       struct drm_buddy_block *block;
>>       unsigned long pages_per_block;
>> @@ -468,18 +494,8 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>       if (tbo->type != ttm_bo_type_kernel)
>>           max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> -        pages_per_block = ~0ul;
>> -    } else {
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -        pages_per_block = HPAGE_PMD_NR;
>> -#else
>> -        /* default to 2MB */
>> -        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>> -#endif
>> -        pages_per_block = max_t(uint32_t, pages_per_block,
>> -                    tbo->page_alignment);
>> -    }
>> +    pages_per_block =
>> +        amdgpu_vram_find_pages_per_block(tbo, place, bo_flags);
>>         vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>>       if (!vres)
>> @@ -498,7 +514,8 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>       if (place->flags & TTM_PL_FLAG_TOPDOWN)
>>           vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>> +        place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>           vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>         if (fpfn || lpfn != mgr->mm.size)
>> @@ -529,8 +546,20 @@ static int amdgpu_vram_mgr_new(struct 
>> ttm_resource_manager *man,
>>                          min_block_size,
>>                          &vres->blocks,
>>                          vres->flags);
>> -        if (unlikely(r))
>> +        if (unlikely(r)) {
>> +            if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
>> +                /* Fallback to non contiguous allocation */
>> +                vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>> +                bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>> +
>> +                pages_per_block =
>> +                    amdgpu_vram_find_pages_per_block(tbo,
>> +                                     place,
>> +                                     bo_flags);
>> +                continue;
>> +            }
>
> Let's talk about that on our weekly call. Might be a bit overkill, but 
> I'm not so deep inside the code any more.
Sure, Thanks.

Regards,
Arun.
>
> But looks quite good actually.
>
> Regards,
> Christian.
>
>>               goto error_free_blocks;
>> +        }
>>             if (size > remaining_size)
>>               remaining_size = 0;
>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-14 14:57 [PATCH] drm/amdgpu: Modify the contiguous flags behaviour Arunpravin Paneer Selvam
  2024-04-15 14:03 ` Christian König
@ 2024-04-15 22:02 ` Philip Yang
  2024-04-16  6:50   ` Paneer Selvam, Arunpravin
  2024-04-16  7:46   ` Christian König
  1 sibling, 2 replies; 8+ messages in thread
From: Philip Yang @ 2024-04-15 22:02 UTC (permalink / raw)
  To: Arunpravin Paneer Selvam, amd-gfx
  Cc: christian.koenig, alexander.deucher, felix.kuehling, philip.yang

[-- Attachment #1: Type: text/html, Size: 8845 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-15 22:02 ` Philip Yang
@ 2024-04-16  6:50   ` Paneer Selvam, Arunpravin
  2024-04-16 14:30     ` Philip Yang
  2024-04-16  7:46   ` Christian König
  1 sibling, 1 reply; 8+ messages in thread
From: Paneer Selvam, Arunpravin @ 2024-04-16  6:50 UTC (permalink / raw)
  To: Philip Yang, amd-gfx
  Cc: christian.koenig, alexander.deucher, felix.kuehling, philip.yang



On 4/16/2024 3:32 AM, Philip Yang wrote:
>
>
> On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:
>> Now we have two flags for contiguous VRAM buffer allocation.
>> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
>> buffer's placement function.
>>
>> This patch will change the default behaviour of the two flags.
> This change will simplify the KFD best effort contiguous VRAM 
> allocation, because KFD doesn't need set new GEM_ flag.
>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>> - This means contiguous is not mandatory.
>
> AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS used in couple of places. For page 
> table BO, it is fine as BO size is page size 4K. For 64KB reserved BOs 
> and F/W size related BOs, do all allocation happen at driver 
> initialization before the VRAM is fragmented?
>
>> - we will try to allocate the contiguous buffer. Say if the
>>    allocation fails, we fallback to allocate the individual pages.
>>
>> When we setTTM_PL_FLAG_CONTIGUOUS
>> - This means contiguous allocation is mandatory.
>> - we are setting this in amdgpu_bo_pin_restricted() before bo validation
>>    and check this flag in the vram manager file.
>> - if this is set, we should allocate the buffer pages contiguously.
>>    the allocation fails, we return -ENOSPC.
>>
>> Signed-off-by: Arunpravin Paneer Selvam<Arunpravin.PaneerSelvam@amd.com>
>> Suggested-by: Christian König<christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
>>   2 files changed, 49 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 8bc79924d171..41926d631563 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
>>   		else
>>   			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>   
>> -		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>> -			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>   		c++;
>>   	}
>>   
>> @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>   {
>>   	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>   	struct ttm_operation_ctx ctx = { false, false };
>> +	struct ttm_place *places = bo->placements;
>> +	u32 c = 0;
>>   	int r, i;
>>   
>>   	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
>> @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>   
>>   	if (bo->tbo.pin_count) {
>>   		uint32_t mem_type = bo->tbo.resource->mem_type;
>> -		uint32_t mem_flags = bo->tbo.resource->placement;
>>   
>>   		if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
>>   			return -EINVAL;
>>   
>> -		if ((mem_type == TTM_PL_VRAM) &&
>> -		    (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
>> -		    !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
>> -			return -EINVAL;
>> -
> This looks like a bug before, but with this patch, the check makes 
> sense and is needed.
>>   		ttm_bo_pin(&bo->tbo);
>>   
>>   		if (max_offset != 0) {
>> @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>   			bo->placements[i].lpfn = lpfn;
>>   	}
>>   
>> +	if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
>> +	    !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
>> +		places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>> +
>
> If BO pinned is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, 
> should pin and return scattered pages because the RDMA support 
> scattered dmabuf. Christian also pointed this out.
>
> If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>
>     bo->placements[i].mem_type == TTM_PL_VRAM)
>         o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>   	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>   	if (unlikely(r)) {
>>   		dev_err(adev->dev, "%p pin failed\n", bo);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 8db880244324..ddbf302878f6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
>>   	return size;
>>   }
>>   
>> +static inline unsigned long
>> +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
>> +				 const struct ttm_place *place,
>> +				 unsigned long bo_flags)
>> +{
>> +	unsigned long pages_per_block;
>> +
>> +	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>> +	    place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> +		pages_per_block = ~0ul;
>> +	} else {
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +		pages_per_block = HPAGE_PMD_NR;
>> +#else
>> +		/* default to 2MB */
>> +		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>> +#endif
>> +		pages_per_block = max_t(uint32_t, pages_per_block,
>> +					tbo->page_alignment);
>> +	}
>> +
>> +	return pages_per_block;
>> +}
>> +
>>   /**
>>    * DOC: mem_info_vram_total
>>    *
>> @@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>   	struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>   	u64 vis_usage = 0, max_bytes, min_block_size;
>> +	struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>>   	struct amdgpu_vram_mgr_resource *vres;
>>   	u64 size, remaining_size, lpfn, fpfn;
>> +	unsigned long bo_flags = bo->flags;
>>   	struct drm_buddy *mm = &mgr->mm;
>>   	struct drm_buddy_block *block;
>>   	unsigned long pages_per_block;
>> @@ -468,18 +494,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	if (tbo->type != ttm_bo_type_kernel)
>>   		max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>>   
>> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> -		pages_per_block = ~0ul;
>> -	} else {
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -		pages_per_block = HPAGE_PMD_NR;
>> -#else
>> -		/* default to 2MB */
>> -		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>> -#endif
>> -		pages_per_block = max_t(uint32_t, pages_per_block,
>> -					tbo->page_alignment);
>> -	}
>> +	pages_per_block =
>> +		amdgpu_vram_find_pages_per_block(tbo, place, bo_flags);
>>   
>>   	vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>>   	if (!vres)
>> @@ -498,7 +514,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	if (place->flags & TTM_PL_FLAG_TOPDOWN)
>>   		vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>>   
>> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>> +	    place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>   		vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>   
>>   	if (fpfn || lpfn != mgr->mm.size)
>> @@ -529,8 +546,20 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   					   min_block_size,
>>   					   &vres->blocks,
>>   					   vres->flags);
>> -		if (unlikely(r))
>> +		if (unlikely(r)) {
>
> If pin BO failed to allocate contiguous VRAM, this should return 
> failure to application, as RDMA cannot work with scattered VRAM pages.
>
here we can return -ENOSPC when the BO is pinned (i.e 
TTM_PL_FLAG_CONTIGUOUS is set). Please find the below modified condition,
if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
     !(place->flags & TTM_PL_FLAG_CONTIGUOUS)
Hence if the TTM_PL_FLAG_CONTIGUOUS flag is set, we don't proceed for 
allocating individual pages.
>
> Regards,
>
> Philip
>
>> +			if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
>> +				/* Fallback to non contiguous allocation */
>> +				vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>> +				bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>> +
>> +				pages_per_block =
>> +					amdgpu_vram_find_pages_per_block(tbo,
>> +									 place,
>> +									 bo_flags);
>> +				continue;
>> +			}
>>   			goto error_free_blocks;
>> +		}
>>   
>>   		if (size > remaining_size)
>>   			remaining_size = 0;


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-15 22:02 ` Philip Yang
  2024-04-16  6:50   ` Paneer Selvam, Arunpravin
@ 2024-04-16  7:46   ` Christian König
  2024-04-16  7:56     ` Paneer Selvam, Arunpravin
  1 sibling, 1 reply; 8+ messages in thread
From: Christian König @ 2024-04-16  7:46 UTC (permalink / raw)
  To: Philip Yang, Arunpravin Paneer Selvam, amd-gfx
  Cc: alexander.deucher, felix.kuehling, philip.yang

Am 16.04.24 um 00:02 schrieb Philip Yang:
> On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:
>> Now we have two flags for contiguous VRAM buffer allocation.
>> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
>> buffer's placement function.
>>
>> This patch will change the default behaviour of the two flags.
> This change will simplify the KFD best effort contiguous VRAM 
> allocation, because KFD doesn't need set new GEM_ flag.
>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>> - This means contiguous is not mandatory.
>
> AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS used in couple of places. For page 
> table BO, it is fine as BO size is page size 4K. For 64KB reserved BOs 
> and F/W size related BOs, do all allocation happen at driver 
> initialization before the VRAM is fragmented?
>

Oh, that's a really good point, we need to keep the behavior as is for 
kernel allocations. Arun can you take care of that?

Thanks,
Christian.

>> - we will try to allocate the contiguous buffer. Say if the
>>    allocation fails, we fallback to allocate the individual pages.
>>
>> When we setTTM_PL_FLAG_CONTIGUOUS
>> - This means contiguous allocation is mandatory.
>> - we are setting this in amdgpu_bo_pin_restricted() before bo validation
>>    and check this flag in the vram manager file.
>> - if this is set, we should allocate the buffer pages contiguously.
>>    the allocation fails, we return -ENOSPC.
>>
>> Signed-off-by: Arunpravin Paneer Selvam<Arunpravin.PaneerSelvam@amd.com>
>> Suggested-by: Christian König<christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++++++++++++++-----
>>   2 files changed, 49 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 8bc79924d171..41926d631563 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
>>   		else
>>   			places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>   
>> -		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>> -			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>   		c++;
>>   	}
>>   
>> @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>   {
>>   	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>   	struct ttm_operation_ctx ctx = { false, false };
>> +	struct ttm_place *places = bo->placements;
>> +	u32 c = 0;
>>   	int r, i;
>>   
>>   	if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
>> @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>   
>>   	if (bo->tbo.pin_count) {
>>   		uint32_t mem_type = bo->tbo.resource->mem_type;
>> -		uint32_t mem_flags = bo->tbo.resource->placement;
>>   
>>   		if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
>>   			return -EINVAL;
>>   
>> -		if ((mem_type == TTM_PL_VRAM) &&
>> -		    (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
>> -		    !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
>> -			return -EINVAL;
>> -
> This looks like a bug before, but with this patch, the check makes 
> sense and is needed.
>>   		ttm_bo_pin(&bo->tbo);
>>   
>>   		if (max_offset != 0) {
>> @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
>>   			bo->placements[i].lpfn = lpfn;
>>   	}
>>   
>> +	if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
>> +	    !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
>> +		places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>> +
>
> If BO pinned is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, 
> should pin and return scattered pages because the RDMA support 
> scattered dmabuf. Christian also pointed this out.
>
> If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>
>     bo->placements[i].mem_type == TTM_PL_VRAM)
>         o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>   	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>   	if (unlikely(r)) {
>>   		dev_err(adev->dev, "%p pin failed\n", bo);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 8db880244324..ddbf302878f6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head)
>>   	return size;
>>   }
>>   
>> +static inline unsigned long
>> +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
>> +				 const struct ttm_place *place,
>> +				 unsigned long bo_flags)
>> +{
>> +	unsigned long pages_per_block;
>> +
>> +	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>> +	    place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> +		pages_per_block = ~0ul;
>> +	} else {
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +		pages_per_block = HPAGE_PMD_NR;
>> +#else
>> +		/* default to 2MB */
>> +		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>> +#endif
>> +		pages_per_block = max_t(uint32_t, pages_per_block,
>> +					tbo->page_alignment);
>> +	}
>> +
>> +	return pages_per_block;
>> +}
>> +
>>   /**
>>    * DOC: mem_info_vram_total
>>    *
>> @@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>   	struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>   	u64 vis_usage = 0, max_bytes, min_block_size;
>> +	struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>>   	struct amdgpu_vram_mgr_resource *vres;
>>   	u64 size, remaining_size, lpfn, fpfn;
>> +	unsigned long bo_flags = bo->flags;
>>   	struct drm_buddy *mm = &mgr->mm;
>>   	struct drm_buddy_block *block;
>>   	unsigned long pages_per_block;
>> @@ -468,18 +494,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	if (tbo->type != ttm_bo_type_kernel)
>>   		max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>>   
>> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>> -		pages_per_block = ~0ul;
>> -	} else {
>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -		pages_per_block = HPAGE_PMD_NR;
>> -#else
>> -		/* default to 2MB */
>> -		pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>> -#endif
>> -		pages_per_block = max_t(uint32_t, pages_per_block,
>> -					tbo->page_alignment);
>> -	}
>> +	pages_per_block =
>> +		amdgpu_vram_find_pages_per_block(tbo, place, bo_flags);
>>   
>>   	vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>>   	if (!vres)
>> @@ -498,7 +514,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   	if (place->flags & TTM_PL_FLAG_TOPDOWN)
>>   		vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>>   
>> -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>> +	if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>> +	    place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>   		vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>   
>>   	if (fpfn || lpfn != mgr->mm.size)
>> @@ -529,8 +546,20 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
>>   					   min_block_size,
>>   					   &vres->blocks,
>>   					   vres->flags);
>> -		if (unlikely(r))
>> +		if (unlikely(r)) {
>
> If pin BO failed to allocate contiguous VRAM, this should return 
> failure to application, as RDMA cannot work with scattered VRAM pages.
>
> Regards,
>
> Philip
>
>> +			if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
>> +				/* Fallback to non contiguous allocation */
>> +				vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>> +				bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>> +
>> +				pages_per_block =
>> +					amdgpu_vram_find_pages_per_block(tbo,
>> +									 place,
>> +									 bo_flags);
>> +				continue;
>> +			}
>>   			goto error_free_blocks;
>> +		}
>>   
>>   		if (size > remaining_size)
>>   			remaining_size = 0;


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-16  7:46   ` Christian König
@ 2024-04-16  7:56     ` Paneer Selvam, Arunpravin
  0 siblings, 0 replies; 8+ messages in thread
From: Paneer Selvam, Arunpravin @ 2024-04-16  7:56 UTC (permalink / raw)
  To: Christian König, Philip Yang, amd-gfx
  Cc: alexander.deucher, felix.kuehling, philip.yang

Hi Christian,

On 4/16/2024 1:16 PM, Christian König wrote:
> Am 16.04.24 um 00:02 schrieb Philip Yang:
>> On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote:
>>> Now we have two flags for contiguous VRAM buffer allocation.
>>> If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
>>> it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the
>>> buffer's placement function.
>>>
>>> This patch will change the default behaviour of the two flags.
>> This change will simplify the KFD best effort contiguous VRAM 
>> allocation, because KFD doesn't need set new GEM_ flag.
>>> When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS
>>> - This means contiguous is not mandatory.
>>
>> AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS used in couple of places. For page 
>> table BO, it is fine as BO size is page size 4K. For 64KB reserved 
>> BOs and F/W size related BOs, do all allocation happen at driver 
>> initialization before the VRAM is fragmented?
>>
>
> Oh, that's a really good point, we need to keep the behavior as is for 
> kernel allocations. Arun can you take care of that?
Sure, I will take care kernel allocations.
Thanks,
Arun.
>
> Thanks,
> Christian.
>
>>> - we will try to allocate the contiguous buffer. Say if the
>>>    allocation fails, we fallback to allocate the individual pages.
>>>
>>> When we setTTM_PL_FLAG_CONTIGUOUS
>>> - This means contiguous allocation is mandatory.
>>> - we are setting this in amdgpu_bo_pin_restricted() before bo 
>>> validation
>>>    and check this flag in the vram manager file.
>>> - if this is set, we should allocate the buffer pages contiguously.
>>>    the allocation fails, we return -ENOSPC.
>>>
>>> Signed-off-by: Arunpravin Paneer 
>>> Selvam<Arunpravin.PaneerSelvam@amd.com>
>>> Suggested-by: Christian König<christian.koenig@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c   | 14 +++--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 
>>> +++++++++++++++-----
>>>   2 files changed, 49 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 8bc79924d171..41926d631563 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct 
>>> amdgpu_bo *abo, u32 domain)
>>>           else
>>>               places[c].flags |= TTM_PL_FLAG_TOPDOWN;
>>>   -        if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
>>> -            places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>>           c++;
>>>       }
>>>   @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
>>> *bo, u32 domain,
>>>   {
>>>       struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>>       struct ttm_operation_ctx ctx = { false, false };
>>> +    struct ttm_place *places = bo->placements;
>>> +    u32 c = 0;
>>>       int r, i;
>>>         if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
>>> @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
>>> *bo, u32 domain,
>>>         if (bo->tbo.pin_count) {
>>>           uint32_t mem_type = bo->tbo.resource->mem_type;
>>> -        uint32_t mem_flags = bo->tbo.resource->placement;
>>>             if (!(domain & amdgpu_mem_type_to_domain(mem_type)))
>>>               return -EINVAL;
>>>   -        if ((mem_type == TTM_PL_VRAM) &&
>>> -            (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) &&
>>> -            !(mem_flags & TTM_PL_FLAG_CONTIGUOUS))
>>> -            return -EINVAL;
>>> -
>> This looks like a bug before, but with this patch, the check makes 
>> sense and is needed.
>>>           ttm_bo_pin(&bo->tbo);
>>>             if (max_offset != 0) {
>>> @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
>>> *bo, u32 domain,
>>>               bo->placements[i].lpfn = lpfn;
>>>       }
>>>   +    if (domain & AMDGPU_GEM_DOMAIN_VRAM &&
>>> +        !WARN_ON(places[c].mem_type != TTM_PL_VRAM))
>>> +        places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>> +
>>
>> If BO pinned is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, 
>> should pin and return scattered pages because the RDMA support 
>> scattered dmabuf. Christian also pointed this out.
>>
>> If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
>>
>>     bo->placements[i].mem_type == TTM_PL_VRAM)
>>         o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
>>>       r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>       if (unlikely(r)) {
>>>           dev_err(adev->dev, "%p pin failed\n", bo);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> index 8db880244324..ddbf302878f6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>>> @@ -88,6 +88,30 @@ static inline u64 
>>> amdgpu_vram_mgr_blocks_size(struct list_head *head)
>>>       return size;
>>>   }
>>>   +static inline unsigned long
>>> +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo,
>>> +                 const struct ttm_place *place,
>>> +                 unsigned long bo_flags)
>>> +{
>>> +    unsigned long pages_per_block;
>>> +
>>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>>> +        place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>> +        pages_per_block = ~0ul;
>>> +    } else {
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> +        pages_per_block = HPAGE_PMD_NR;
>>> +#else
>>> +        /* default to 2MB */
>>> +        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>>> +#endif
>>> +        pages_per_block = max_t(uint32_t, pages_per_block,
>>> +                    tbo->page_alignment);
>>> +    }
>>> +
>>> +    return pages_per_block;
>>> +}
>>> +
>>>   /**
>>>    * DOC: mem_info_vram_total
>>>    *
>>> @@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>       struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
>>>       struct amdgpu_device *adev = to_amdgpu_device(mgr);
>>>       u64 vis_usage = 0, max_bytes, min_block_size;
>>> +    struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
>>>       struct amdgpu_vram_mgr_resource *vres;
>>>       u64 size, remaining_size, lpfn, fpfn;
>>> +    unsigned long bo_flags = bo->flags;
>>>       struct drm_buddy *mm = &mgr->mm;
>>>       struct drm_buddy_block *block;
>>>       unsigned long pages_per_block;
>>> @@ -468,18 +494,8 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>       if (tbo->type != ttm_bo_type_kernel)
>>>           max_bytes -= AMDGPU_VM_RESERVED_VRAM;
>>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
>>> -        pages_per_block = ~0ul;
>>> -    } else {
>>> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -        pages_per_block = HPAGE_PMD_NR;
>>> -#else
>>> -        /* default to 2MB */
>>> -        pages_per_block = 2UL << (20UL - PAGE_SHIFT);
>>> -#endif
>>> -        pages_per_block = max_t(uint32_t, pages_per_block,
>>> -                    tbo->page_alignment);
>>> -    }
>>> +    pages_per_block =
>>> +        amdgpu_vram_find_pages_per_block(tbo, place, bo_flags);
>>>         vres = kzalloc(sizeof(*vres), GFP_KERNEL);
>>>       if (!vres)
>>> @@ -498,7 +514,8 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>       if (place->flags & TTM_PL_FLAG_TOPDOWN)
>>>           vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
>>>   -    if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>> +    if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS ||
>>> +        place->flags & TTM_PL_FLAG_CONTIGUOUS)
>>>           vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>>         if (fpfn || lpfn != mgr->mm.size)
>>> @@ -529,8 +546,20 @@ static int amdgpu_vram_mgr_new(struct 
>>> ttm_resource_manager *man,
>>>                          min_block_size,
>>>                          &vres->blocks,
>>>                          vres->flags);
>>> -        if (unlikely(r))
>>> +        if (unlikely(r)) {
>>
>> If pin BO failed to allocate contiguous VRAM, this should return 
>> failure to application, as RDMA cannot work with scattered VRAM pages.
>>
>> Regards,
>>
>> Philip
>>
>>> +            if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
>>> +                /* Fallback to non contiguous allocation */
>>> +                vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
>>> +                bo_flags &= ~AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>> +
>>> +                pages_per_block =
>>> +                    amdgpu_vram_find_pages_per_block(tbo,
>>> +                                     place,
>>> +                                     bo_flags);
>>> +                continue;
>>> +            }
>>>               goto error_free_blocks;
>>> +        }
>>>             if (size > remaining_size)
>>>               remaining_size = 0;
>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
  2024-04-16  6:50   ` Paneer Selvam, Arunpravin
@ 2024-04-16 14:30     ` Philip Yang
  0 siblings, 0 replies; 8+ messages in thread
From: Philip Yang @ 2024-04-16 14:30 UTC (permalink / raw)
  To: Paneer Selvam, Arunpravin, amd-gfx
  Cc: christian.koenig, alexander.deucher, felix.kuehling, philip.yang

[-- Attachment #1: Type: text/html, Size: 18775 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-04-16 14:30 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-04-14 14:57 [PATCH] drm/amdgpu: Modify the contiguous flags behaviour Arunpravin Paneer Selvam
2024-04-15 14:03 ` Christian König
2024-04-15 17:16   ` Paneer Selvam, Arunpravin
2024-04-15 22:02 ` Philip Yang
2024-04-16  6:50   ` Paneer Selvam, Arunpravin
2024-04-16 14:30     ` Philip Yang
2024-04-16  7:46   ` Christian König
2024-04-16  7:56     ` Paneer Selvam, Arunpravin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.