* [PATCH] drm/amdgpu: change the fence ring wait timeout
@ 2021-01-13 6:36 Roy Sun
2021-01-13 7:05 ` Deng, Emily
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Roy Sun @ 2021-01-13 6:36 UTC (permalink / raw)
To: amd-gfx; +Cc: Roy Sun
This fix bug where when the engine hang, the fence ring will wait without quit and cause kernel crash
Signed-off-by: Roy Sun <Roy.Sun@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48 ++++++++++++++++++++---
1 file changed, 43 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 6b0aeee61b8b..738ea65077ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -41,6 +41,8 @@
#include "amdgpu.h"
#include "amdgpu_trace.h"
+#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000)
+#define AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
/*
* Fences
* Fences mark an event in the GPUs pipeline and are used
@@ -104,6 +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring, u32 seq)
*drv->cpu_addr = cpu_to_le32(seq);
}
+/**
+ * amdgpu_fence_wait_timeout - get the fence wait timeout
+ *
+ * @ring: ring the fence is associated with
+ *
+ * Returns the value of the fence wait timeout.
+ */
+long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring)
+{
+ long tmo_gfx, tmo_mm, tmo;
+ struct amdgpu_device *adev = ring->adev;
+ tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT;
+ if (amdgpu_sriov_vf(adev)) {
+ tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT;
+ }
+ if (amdgpu_sriov_runtime(adev)) {
+ tmo_gfx = 8 * AMDGPU_FENCE_TIMEOUT;
+ } else if (adev->gmc.xgmi.hive_id) {
+ tmo_gfx = AMDGPU_FENCE_GFX_XGMI_TIMEOUT;
+ }
+ if (ring->funcs->type == AMDGPU_RING_TYPE_UVD ||
+ ring->funcs->type == AMDGPU_RING_TYPE_VCE ||
+ ring->funcs->type == AMDGPU_RING_TYPE_UVD_ENC ||
+ ring->funcs->type == AMDGPU_RING_TYPE_VCN_DEC ||
+ ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC ||
+ ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG)
+ tmo = tmo_mm;
+ else
+ tmo = tmo_gfx;
+ return tmo;
+}
+
/**
* amdgpu_fence_read - read a fence value
*
@@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
rcu_read_unlock();
if (old) {
- r = dma_fence_wait(old, false);
+ long timeout;
+ timeout = amdgpu_fence_wait_timeout(ring);
+ r = dma_fence_wait_timeout(old, false, timeout);
dma_fence_put(old);
if (r)
- return r;
+ return r < 0 ? r : 0;
}
}
@@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct amdgpu_ring *ring)
return 0;
}
rcu_read_unlock();
-
- r = dma_fence_wait(fence, false);
+
+ long timeout;
+ timeout = amdgpu_fence_wait_timeout(ring);
+ r = dma_fence_wait_timeout(fence, false, timeout);
dma_fence_put(fence);
- return r;
+ return r < 0 ? r : 0;
}
/**
--
2.28.0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 12+ messages in thread
* RE: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-13 6:36 [PATCH] drm/amdgpu: change the fence ring wait timeout Roy Sun
@ 2021-01-13 7:05 ` Deng, Emily
2021-01-13 7:57 ` Paul Menzel
2021-01-13 14:03 ` Christian König
2 siblings, 0 replies; 12+ messages in thread
From: Deng, Emily @ 2021-01-13 7:05 UTC (permalink / raw)
To: Sun, Roy, amd-gfx@lists.freedesktop.org; +Cc: Sun, Roy
[AMD Official Use Only - Internal Distribution Only]
>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Roy
>Sun
>Sent: Wednesday, January 13, 2021 2:36 PM
>To: amd-gfx@lists.freedesktop.org
>Cc: Sun, Roy <Roy.Sun@amd.com>
>Subject: [PATCH] drm/amdgpu: change the fence ring wait timeout
>
>This fix bug where when the engine hang, the fence ring will wait without quit
>and cause kernel crash
>
>Signed-off-by: Roy Sun <Roy.Sun@amd.com>
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48
>++++++++++++++++++++---
> 1 file changed, 43 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>index 6b0aeee61b8b..738ea65077ea 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>@@ -41,6 +41,8 @@
> #include "amdgpu.h"
> #include "amdgpu_trace.h"
>
>+#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000) #define
>+AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
Please check the format.
> /*
> * Fences
> * Fences mark an event in the GPUs pipeline and are used @@ -104,6
>+106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring, u32
>seq)
> *drv->cpu_addr = cpu_to_le32(seq);
> }
>
>+/**
>+ * amdgpu_fence_wait_timeout - get the fence wait timeout
>+ *
>+ * @ring: ring the fence is associated with
>+ *
>+ * Returns the value of the fence wait timeout.
>+ */
>+long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring) {
>+long tmo_gfx, tmo_mm, tmo;
>+struct amdgpu_device *adev = ring->adev;
>+tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT;
>+if (amdgpu_sriov_vf(adev)) {
>+tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT;
>+}
>+if (amdgpu_sriov_runtime(adev)) {
>+tmo_gfx = 8 * AMDGPU_FENCE_TIMEOUT;
>+} else if (adev->gmc.xgmi.hive_id) {
>+tmo_gfx = AMDGPU_FENCE_GFX_XGMI_TIMEOUT;
>+}
>+if (ring->funcs->type == AMDGPU_RING_TYPE_UVD ||
>+ring->funcs->type == AMDGPU_RING_TYPE_VCE ||
>+ring->funcs->type == AMDGPU_RING_TYPE_UVD_ENC ||
>+ring->funcs->type == AMDGPU_RING_TYPE_VCN_DEC ||
>+ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC ||
>+ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG)
>+tmo = tmo_mm;
>+else
>+tmo = tmo_gfx;
>+return tmo;
>+}
>+
> /**
> * amdgpu_fence_read - read a fence value
> *
>@@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring,
>struct dma_fence **f,
> rcu_read_unlock();
>
> if (old) {
>-r = dma_fence_wait(old, false);
>+long timeout;
>+timeout = amdgpu_fence_wait_timeout(ring);
>+r = dma_fence_wait_timeout(old, false, timeout);
> dma_fence_put(old);
> if (r)
>-return r;
>+return r < 0 ? r : 0;
> }
> }
>
>@@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct amdgpu_ring
>*ring)
> return 0;
> }
> rcu_read_unlock();
>-
>-r = dma_fence_wait(fence, false);
>+
>+long timeout;
>+timeout = amdgpu_fence_wait_timeout(ring);
>+r = dma_fence_wait_timeout(fence, false, timeout);
> dma_fence_put(fence);
>-return r;
>+return r < 0 ? r : 0;
> }
>
> /**
>--
>2.28.0
>
>_______________________________________________
>amd-gfx mailing list
>amd-gfx@lists.freedesktop.org
>https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.f
>reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C2df180bf7aaa419f
>387708d8b78d9dfd%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>C637461166168063291%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>ta=2n3ZbLF0ZT%2FakzxlcBI3c%2F9A0F8EIzvKzDj2OPA0DmE%3D&reserve
>d=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-13 6:36 [PATCH] drm/amdgpu: change the fence ring wait timeout Roy Sun
2021-01-13 7:05 ` Deng, Emily
@ 2021-01-13 7:57 ` Paul Menzel
2021-01-13 14:03 ` Christian König
2 siblings, 0 replies; 12+ messages in thread
From: Paul Menzel @ 2021-01-13 7:57 UTC (permalink / raw)
To: Roy Sun; +Cc: amd-gfx
Dear Roy,
Am 13.01.21 um 07:36 schrieb Roy Sun:
> This fix bug where when the engine hang, the fence ring will wait without quit and cause kernel crash
Please wrap commit message body after 75 characters. (checkpatch.pl
should have shown you that.)
Some typos:
1. fix*es*
2. hang*s*
3. Please add a dot/period at the end of sentences.
If there is a Linux kernel crash, please add the trace.
Please elaborate also. Where and why does the Linux kernel crash, and
how does your patch fix this.
How can the bug be reproduced? On what hardware did you see it?
> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48 ++++++++++++++++++++---
> 1 file changed, 43 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 6b0aeee61b8b..738ea65077ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -41,6 +41,8 @@
> #include "amdgpu.h"
> #include "amdgpu_trace.h"
>
> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000)
> +#define AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
Where did you get the values from, or why did you choose them.
> /*
> * Fences
> * Fences mark an event in the GPUs pipeline and are used
> @@ -104,6 +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring, u32 seq)
> *drv->cpu_addr = cpu_to_le32(seq);
> }
>
> +/**
> + * amdgpu_fence_wait_timeout - get the fence wait timeout
> + *
> + * @ring: ring the fence is associated with
> + *
> + * Returns the value of the fence wait timeout.
> + */
> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring)
See below, but it should be `signed long`.
> +{
> + long tmo_gfx, tmo_mm, tmo;
> + struct amdgpu_device *adev = ring->adev;
> + tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT;
> + if (amdgpu_sriov_vf(adev)) {
> + tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT;
> + }
See coding style. No {} needed for one line statements.
> + if (amdgpu_sriov_runtime(adev)) {
> + tmo_gfx = 8 * AMDGPU_FENCE_TIMEOUT;
So why not define a special SRIOV macro too. But add a comment, why it
should be eight times that.
> + } else if (adev->gmc.xgmi.hive_id) {
> + tmo_gfx = AMDGPU_FENCE_GFX_XGMI_TIMEOUT;
> + }
Ditto.
> + if (ring->funcs->type == AMDGPU_RING_TYPE_UVD ||
> + ring->funcs->type == AMDGPU_RING_TYPE_VCE ||
> + ring->funcs->type == AMDGPU_RING_TYPE_UVD_ENC ||
> + ring->funcs->type == AMDGPU_RING_TYPE_VCN_DEC ||
> + ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC ||
> + ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG)
Align on `ring`?
> + tmo = tmo_mm;
> + else
> + tmo = tmo_gfx;
> + return tmo;
The return could happen in the branches.
> +}
> +
> /**
> * amdgpu_fence_read - read a fence value
> *
> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> rcu_read_unlock();
>
> if (old) {
> - r = dma_fence_wait(old, false);
> + long timeout;
> + timeout = amdgpu_fence_wait_timeout(ring);
> + r = dma_fence_wait_timeout(old, false, timeout);
The signature is `signed long` as far as I see.
> dma_fence_put(old);
> if (r)
> - return r;
> + return r < 0 ? r : 0;
> }
> }
>
> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct amdgpu_ring *ring)
> return 0;
> }
> rcu_read_unlock();
> -
> - r = dma_fence_wait(fence, false);
> +
> + long timeout;
> + timeout = amdgpu_fence_wait_timeout(ring);
> + r = dma_fence_wait_timeout(fence, false, timeout);
> dma_fence_put(fence);
> - return r;
> + return r < 0 ? r : 0;
> }
>
> /**
>
It looks like this is your first patch submission, so welcome to the
Linux kernel community. Please keep in mind, that reviewer time is
precious and scarce, and that formal and simple mistakes should not
happen. No idea, if AMD has some kind of “onboarding” process for their
newly employed Linux kernel contributors.
I am looking forward to your the patch iteration and your next
contributions.
Kind regards,
Paul
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-13 6:36 [PATCH] drm/amdgpu: change the fence ring wait timeout Roy Sun
2021-01-13 7:05 ` Deng, Emily
2021-01-13 7:57 ` Paul Menzel
@ 2021-01-13 14:03 ` Christian König
2021-01-14 2:00 ` Deng, Emily
2 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2021-01-13 14:03 UTC (permalink / raw)
To: Roy Sun, amd-gfx
Am 13.01.21 um 07:36 schrieb Roy Sun:
> This fix bug where when the engine hang, the fence ring will wait without quit and cause kernel crash
NAK, this blocking is intentional unlimited because otherwise we will
cause a memory corruption.
What is the actual bug you are trying to fix here?
Regards,
Christian.
>
> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48 ++++++++++++++++++++---
> 1 file changed, 43 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 6b0aeee61b8b..738ea65077ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -41,6 +41,8 @@
> #include "amdgpu.h"
> #include "amdgpu_trace.h"
>
> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000)
> +#define AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
> /*
> * Fences
> * Fences mark an event in the GPUs pipeline and are used
> @@ -104,6 +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring, u32 seq)
> *drv->cpu_addr = cpu_to_le32(seq);
> }
>
> +/**
> + * amdgpu_fence_wait_timeout - get the fence wait timeout
> + *
> + * @ring: ring the fence is associated with
> + *
> + * Returns the value of the fence wait timeout.
> + */
> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring)
> +{
> + long tmo_gfx, tmo_mm, tmo;
> + struct amdgpu_device *adev = ring->adev;
> + tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT;
> + if (amdgpu_sriov_vf(adev)) {
> + tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT;
> + }
> + if (amdgpu_sriov_runtime(adev)) {
> + tmo_gfx = 8 * AMDGPU_FENCE_TIMEOUT;
> + } else if (adev->gmc.xgmi.hive_id) {
> + tmo_gfx = AMDGPU_FENCE_GFX_XGMI_TIMEOUT;
> + }
> + if (ring->funcs->type == AMDGPU_RING_TYPE_UVD ||
> + ring->funcs->type == AMDGPU_RING_TYPE_VCE ||
> + ring->funcs->type == AMDGPU_RING_TYPE_UVD_ENC ||
> + ring->funcs->type == AMDGPU_RING_TYPE_VCN_DEC ||
> + ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC ||
> + ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG)
> + tmo = tmo_mm;
> + else
> + tmo = tmo_gfx;
> + return tmo;
> +}
> +
> /**
> * amdgpu_fence_read - read a fence value
> *
> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> rcu_read_unlock();
>
> if (old) {
> - r = dma_fence_wait(old, false);
> + long timeout;
> + timeout = amdgpu_fence_wait_timeout(ring);
> + r = dma_fence_wait_timeout(old, false, timeout);
> dma_fence_put(old);
> if (r)
> - return r;
> + return r < 0 ? r : 0;
> }
> }
>
> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct amdgpu_ring *ring)
> return 0;
> }
> rcu_read_unlock();
> -
> - r = dma_fence_wait(fence, false);
> +
> + long timeout;
> + timeout = amdgpu_fence_wait_timeout(ring);
> + r = dma_fence_wait_timeout(fence, false, timeout);
> dma_fence_put(fence);
> - return r;
> + return r < 0 ? r : 0;
> }
>
> /**
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-13 14:03 ` Christian König
@ 2021-01-14 2:00 ` Deng, Emily
2021-01-14 13:49 ` Christian König
0 siblings, 1 reply; 12+ messages in thread
From: Deng, Emily @ 2021-01-14 2:00 UTC (permalink / raw)
To: Koenig, Christian, Sun, Roy, amd-gfx@lists.freedesktop.org
[AMD Official Use Only - Internal Distribution Only]
>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>Christian König
>Sent: Wednesday, January 13, 2021 10:03 PM
>To: Sun, Roy <Roy.Sun@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>
>Am 13.01.21 um 07:36 schrieb Roy Sun:
>> This fix bug where when the engine hang, the fence ring will wait
>> without quit and cause kernel crash
>
>NAK, this blocking is intentional unlimited because otherwise we will cause a
>memory corruption.
>
>What is the actual bug you are trying to fix here?
When some engine hang during initialization, the IB test will fail, and fence will never come back, then never could wait the fence back. Why we need to wait here forever? We'd better not use forever wait which
will cause kernel crash and lockup. And we have amdgpu_fence_driver_force_completion will let memory free. We should remove all those forever wait, and replace them with one timeout, and
do the correct error process if timeout.
>
>Regards,
>Christian.
>
>>
>> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48
>++++++++++++++++++++---
>> 1 file changed, 43 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index 6b0aeee61b8b..738ea65077ea 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -41,6 +41,8 @@
>> #include "amdgpu.h"
>> #include "amdgpu_trace.h"
>>
>> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000) #define
>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
>> /*
>> * Fences
>> * Fences mark an event in the GPUs pipeline and are used @@ -104,6
>> +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring, u32
>seq)
>> *drv->cpu_addr = cpu_to_le32(seq);
>> }
>>
>> +/**
>> + * amdgpu_fence_wait_timeout - get the fence wait timeout
>> + *
>> + * @ring: ring the fence is associated with
>> + *
>> + * Returns the value of the fence wait timeout.
>> + */
>> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring) {
>> +long tmo_gfx, tmo_mm, tmo;
>> +struct amdgpu_device *adev = ring->adev;
>> +tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT;
>> +if (amdgpu_sriov_vf(adev)) {
>> +tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT;
>> +}
>> +if (amdgpu_sriov_runtime(adev)) {
>> +tmo_gfx = 8 * AMDGPU_FENCE_TIMEOUT;
>> +} else if (adev->gmc.xgmi.hive_id) {
>> +tmo_gfx = AMDGPU_FENCE_GFX_XGMI_TIMEOUT;
>> +}
>> +if (ring->funcs->type == AMDGPU_RING_TYPE_UVD ||
>> +ring->funcs->type == AMDGPU_RING_TYPE_VCE ||
>> +ring->funcs->type == AMDGPU_RING_TYPE_UVD_ENC ||
>> +ring->funcs->type == AMDGPU_RING_TYPE_VCN_DEC ||
>> +ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC ||
>> +ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG)
>> +tmo = tmo_mm;
>> +else
>> +tmo = tmo_gfx;
>> +return tmo;
>> +}
>> +
>> /**
>> * amdgpu_fence_read - read a fence value
>> *
>> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring,
>struct dma_fence **f,
>> rcu_read_unlock();
>>
>> if (old) {
>> -r = dma_fence_wait(old, false);
>> +long timeout;
>> +timeout = amdgpu_fence_wait_timeout(ring);
>> +r = dma_fence_wait_timeout(old, false, timeout);
>> dma_fence_put(old);
>> if (r)
>> -return r;
>> +return r < 0 ? r : 0;
>> }
>> }
>>
>> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct
>amdgpu_ring *ring)
>> return 0;
>> }
>> rcu_read_unlock();
>> -
>> -r = dma_fence_wait(fence, false);
>> +
>> +long timeout;
>> +timeout = amdgpu_fence_wait_timeout(ring);
>> +r = dma_fence_wait_timeout(fence, false, timeout);
>> dma_fence_put(fence);
>> -return r;
>> +return r < 0 ? r : 0;
>> }
>>
>> /**
>
>_______________________________________________
>amd-gfx mailing list
>amd-gfx@lists.freedesktop.org
>https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.f
>reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C8b116229938b463
>df87f08d8b7cbf607%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>C637461433936049544%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>ta=HOcLHmmblOUHXATFBl5HC6LOmFq0oXglAh2GFwd6sus%3D&reserve
>d=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-14 2:00 ` Deng, Emily
@ 2021-01-14 13:49 ` Christian König
2021-01-18 2:01 ` Deng, Emily
0 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2021-01-14 13:49 UTC (permalink / raw)
To: Deng, Emily, Sun, Roy, amd-gfx@lists.freedesktop.org
Am 14.01.21 um 03:00 schrieb Deng, Emily:
> [AMD Official Use Only - Internal Distribution Only]
>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Christian König
>> Sent: Wednesday, January 13, 2021 10:03 PM
>> To: Sun, Roy <Roy.Sun@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>
>> Am 13.01.21 um 07:36 schrieb Roy Sun:
>>> This fix bug where when the engine hang, the fence ring will wait
>>> without quit and cause kernel crash
>> NAK, this blocking is intentional unlimited because otherwise we will cause a
>> memory corruption.
>>
>> What is the actual bug you are trying to fix here?
> When some engine hang during initialization, the IB test will fail, and fence will never come back, then never could wait the fence back. Why we need to wait here forever? We'd better not use forever wait which
> will cause kernel crash and lockup. And we have amdgpu_fence_driver_force_completion will let memory free. We should remove all those forever wait, and replace them with one timeout, and
> do the correct error process if timeout.
This wait here is to make sure we never overwrite the software fence
ring buffer. Without it we would not signal all fences in
amdgpu_fence_driver_force_completion() and cause either memory leak or
corruption.
Waiting here forever is the right thing to do even when that means that
the submission thread gets stuck forever, cause that is still better
than memory corruption.
But this should never happen in practice and is only here for
precaution. So do you really see that in practice?
Regards,
Christian.
>
>> Regards,
>> Christian.
>>
>>> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48
>> ++++++++++++++++++++---
>>> 1 file changed, 43 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 6b0aeee61b8b..738ea65077ea 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -41,6 +41,8 @@
>>> #include "amdgpu.h"
>>> #include "amdgpu_trace.h"
>>>
>>> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000) #define
>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
>>> /*
>>> * Fences
>>> * Fences mark an event in the GPUs pipeline and are used @@ -104,6
>>> +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring, u32
>> seq)
>>> *drv->cpu_addr = cpu_to_le32(seq);
>>> }
>>>
>>> +/**
>>> + * amdgpu_fence_wait_timeout - get the fence wait timeout
>>> + *
>>> + * @ring: ring the fence is associated with
>>> + *
>>> + * Returns the value of the fence wait timeout.
>>> + */
>>> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring) {
>>> +long tmo_gfx, tmo_mm, tmo;
>>> +struct amdgpu_device *adev = ring->adev;
>>> +tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT;
>>> +if (amdgpu_sriov_vf(adev)) {
>>> +tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT;
>>> +}
>>> +if (amdgpu_sriov_runtime(adev)) {
>>> +tmo_gfx = 8 * AMDGPU_FENCE_TIMEOUT;
>>> +} else if (adev->gmc.xgmi.hive_id) {
>>> +tmo_gfx = AMDGPU_FENCE_GFX_XGMI_TIMEOUT;
>>> +}
>>> +if (ring->funcs->type == AMDGPU_RING_TYPE_UVD ||
>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCE ||
>>> +ring->funcs->type == AMDGPU_RING_TYPE_UVD_ENC ||
>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCN_DEC ||
>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC ||
>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG)
>>> +tmo = tmo_mm;
>>> +else
>>> +tmo = tmo_gfx;
>>> +return tmo;
>>> +}
>>> +
>>> /**
>>> * amdgpu_fence_read - read a fence value
>>> *
>>> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring,
>> struct dma_fence **f,
>>> rcu_read_unlock();
>>>
>>> if (old) {
>>> -r = dma_fence_wait(old, false);
>>> +long timeout;
>>> +timeout = amdgpu_fence_wait_timeout(ring);
>>> +r = dma_fence_wait_timeout(old, false, timeout);
>>> dma_fence_put(old);
>>> if (r)
>>> -return r;
>>> +return r < 0 ? r : 0;
>>> }
>>> }
>>>
>>> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct
>> amdgpu_ring *ring)
>>> return 0;
>>> }
>>> rcu_read_unlock();
>>> -
>>> -r = dma_fence_wait(fence, false);
>>> +
>>> +long timeout;
>>> +timeout = amdgpu_fence_wait_timeout(ring);
>>> +r = dma_fence_wait_timeout(fence, false, timeout);
>>> dma_fence_put(fence);
>>> -return r;
>>> +return r < 0 ? r : 0;
>>> }
>>>
>>> /**
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.f
>> reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>> gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C8b116229938b463
>> df87f08d8b7cbf607%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>> C637461433936049544%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>> MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>> ta=HOcLHmmblOUHXATFBl5HC6LOmFq0oXglAh2GFwd6sus%3D&reserve
>> d=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-14 13:49 ` Christian König
@ 2021-01-18 2:01 ` Deng, Emily
2021-01-18 7:48 ` Christian König
0 siblings, 1 reply; 12+ messages in thread
From: Deng, Emily @ 2021-01-18 2:01 UTC (permalink / raw)
To: Koenig, Christian, Sun, Roy, amd-gfx@lists.freedesktop.org
[AMD Official Use Only - Internal Distribution Only]
>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Thursday, January 14, 2021 9:50 PM
>To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy <Roy.Sun@amd.com>;
>amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>
>Am 14.01.21 um 03:00 schrieb Deng, Emily:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Christian König
>>> Sent: Wednesday, January 13, 2021 10:03 PM
>>> To: Sun, Roy <Roy.Sun@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>
>>> Am 13.01.21 um 07:36 schrieb Roy Sun:
>>>> This fix bug where when the engine hang, the fence ring will wait
>>>> without quit and cause kernel crash
>>> NAK, this blocking is intentional unlimited because otherwise we will
>>> cause a memory corruption.
>>>
>>> What is the actual bug you are trying to fix here?
>> When some engine hang during initialization, the IB test will fail,
>> and fence will never come back, then never could wait the fence back.
>> Why we need to wait here forever? We'd better not use forever wait which
>will cause kernel crash and lockup. And we have
>amdgpu_fence_driver_force_completion will let memory free. We should
>remove all those forever wait, and replace them with one timeout, and do
>the correct error process if timeout.
>
>This wait here is to make sure we never overwrite the software fence ring
>buffer. Without it we would not signal all fences in
>amdgpu_fence_driver_force_completion() and cause either memory leak or
>corruption.
>
>Waiting here forever is the right thing to do even when that means that the
>submission thread gets stuck forever, cause that is still better than memory
>corruption.
>
>But this should never happen in practice and is only here for precaution. So do
>you really see that in practice?
Yes, we hit the issue when vcn ib test fail. Could you give some suggestions about how to fix this?
[ 958.301685] failed to read reg:1a6c0
[ 959.036645] gmc_v10_0_process_interrupt: 42 callbacks suppressed
[ 959.036653] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
[ 959.038043] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
[ 959.039014] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
[ 959.040202] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
[ 959.041174] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
[ 959.042353] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
[ 959.043325] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
[ 959.044508] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
[ 959.045480] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
[ 959.046659] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
[ 959.047631] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
[ 959.048815] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
[ 959.049787] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
[ 959.050973] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
[ 959.051950] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
[ 959.053123] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
[ 967.208705] amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on vcn_enc0 (-110).
[ 967.209879] [drm:amdgpu_device_init [amdgpu]] *ERROR* ib ring test failed (-110).
[ 1209.384668] INFO: task modprobe:23957 blocked for more than 120 seconds.
[ 1209.385605] Tainted: G OE 5.4.0-58-generic #64~18.04.1-Ubuntu
[ 1209.386451] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1209.387342] modprobe D 0 23957 1221 0x80004006
[ 1209.387344] Call Trace:
[ 1209.387354] __schedule+0x293/0x720
[ 1209.387356] schedule+0x33/0xa0
[ 1209.387357] schedule_timeout+0x1d3/0x320
[ 1209.387359] ? schedule+0x33/0xa0
[ 1209.387360] ? schedule_timeout+0x1d3/0x320
[ 1209.387363] dma_fence_default_wait+0x1b2/0x1e0
[ 1209.387364] ? dma_fence_release+0x130/0x130
[ 1209.387366] dma_fence_wait_timeout+0xfd/0x110
[ 1209.387773] amdgpu_fence_wait_empty+0x90/0xc0 [amdgpu]
[ 1209.387839] amdgpu_fence_driver_fini+0xd6/0x110 [amdgpu]
>
>Regards,
>Christian.
>
>>
>>> Regards,
>>> Christian.
>>>
>>>> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
>>>> ---
>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48
>>> ++++++++++++++++++++---
>>>> 1 file changed, 43 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> index 6b0aeee61b8b..738ea65077ea 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> @@ -41,6 +41,8 @@
>>>> #include "amdgpu.h"
>>>> #include "amdgpu_trace.h"
>>>>
>>>> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000) #define
>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
>>>> /*
>>>> * Fences
>>>> * Fences mark an event in the GPUs pipeline and are used @@
>>>> -104,6
>>>> +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring,
>>>> +u32
>>> seq)
>>>> *drv->cpu_addr = cpu_to_le32(seq);
>>>> }
>>>>
>>>> +/**
>>>> + * amdgpu_fence_wait_timeout - get the fence wait timeout
>>>> + *
>>>> + * @ring: ring the fence is associated with
>>>> + *
>>>> + * Returns the value of the fence wait timeout.
>>>> + */
>>>> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring) { long
>>>> +tmo_gfx, tmo_mm, tmo; struct amdgpu_device *adev = ring->adev;
>>>> +tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT; if
>(amdgpu_sriov_vf(adev))
>>>> +{ tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT; } if
>>>> +(amdgpu_sriov_runtime(adev)) { tmo_gfx = 8 *
>AMDGPU_FENCE_TIMEOUT;
>>>> +} else if (adev->gmc.xgmi.hive_id) { tmo_gfx =
>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT; } if (ring->funcs->type ==
>>>> +AMDGPU_RING_TYPE_UVD ||
>>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCE || type ==
>>>> +ring->funcs->AMDGPU_RING_TYPE_UVD_ENC || type ==
>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_DEC || type ==
>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_ENC || type ==
>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_JPEG)
>>>> +tmo = tmo_mm;
>>>> +else
>>>> +tmo = tmo_gfx;
>>>> +return tmo;
>>>> +}
>>>> +
>>>> /**
>>>> * amdgpu_fence_read - read a fence value
>>>> *
>>>> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring
>>>> *ring,
>>> struct dma_fence **f,
>>>> rcu_read_unlock();
>>>>
>>>> if (old) {
>>>> -r = dma_fence_wait(old, false);
>>>> +long timeout;
>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>> +dma_fence_wait_timeout(old, false, timeout);
>>>> dma_fence_put(old);
>>>> if (r)
>>>> -return r;
>>>> +return r < 0 ? r : 0;
>>>> }
>>>> }
>>>>
>>>> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct
>>> amdgpu_ring *ring)
>>>> return 0;
>>>> }
>>>> rcu_read_unlock();
>>>> -
>>>> -r = dma_fence_wait(fence, false);
>>>> +
>>>> +long timeout;
>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>> +dma_fence_wait_timeout(fence, false, timeout);
>>>> dma_fence_put(fence);
>>>> -return r;
>>>> +return r < 0 ? r : 0;
>>>> }
>>>>
>>>> /**
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>>> ts.f
>>> reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>>>
>gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C8b116229938b463
>>>
>df87f08d8b7cbf607%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>>>
>C637461433936049544%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>>>
>MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>>>
>ta=HOcLHmmblOUHXATFBl5HC6LOmFq0oXglAh2GFwd6sus%3D&reserve
>>> d=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-18 2:01 ` Deng, Emily
@ 2021-01-18 7:48 ` Christian König
2021-01-18 11:56 ` Deng, Emily
0 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2021-01-18 7:48 UTC (permalink / raw)
To: Deng, Emily, Sun, Roy, amd-gfx@lists.freedesktop.org
Mhm, we could change amdgpu_fence_wait_empty() to timeout. But I think
that waiting forever here is intentional and the right thing to do.
What happens is that we wait for the hardware to make sure that nothing
is writing to any memory before we unload the driver.
Now the VCN block has crashed and doesn't respond, but we can't
guarantee that it is not accidentally writing anywhere.
The only alternative we have is to time out and proceed with the driver
unload, risking corrupting the memory we free during that should the
hardware continue to do something.
Regards,
Christian.
Am 18.01.21 um 03:01 schrieb Deng, Emily:
> [AMD Official Use Only - Internal Distribution Only]
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Thursday, January 14, 2021 9:50 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy <Roy.Sun@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>
>> Am 14.01.21 um 03:00 schrieb Deng, Emily:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>>> -----Original Message-----
>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>> Christian König
>>>> Sent: Wednesday, January 13, 2021 10:03 PM
>>>> To: Sun, Roy <Roy.Sun@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>>
>>>> Am 13.01.21 um 07:36 schrieb Roy Sun:
>>>>> This fix bug where when the engine hang, the fence ring will wait
>>>>> without quit and cause kernel crash
>>>> NAK, this blocking is intentional unlimited because otherwise we will
>>>> cause a memory corruption.
>>>>
>>>> What is the actual bug you are trying to fix here?
>>> When some engine hang during initialization, the IB test will fail,
>>> and fence will never come back, then never could wait the fence back.
>>> Why we need to wait here forever? We'd better not use forever wait which
>> will cause kernel crash and lockup. And we have
>> amdgpu_fence_driver_force_completion will let memory free. We should
>> remove all those forever wait, and replace them with one timeout, and do
>> the correct error process if timeout.
>>
>> This wait here is to make sure we never overwrite the software fence ring
>> buffer. Without it we would not signal all fences in
>> amdgpu_fence_driver_force_completion() and cause either memory leak or
>> corruption.
>>
>> Waiting here forever is the right thing to do even when that means that the
>> submission thread gets stuck forever, cause that is still better than memory
>> corruption.
>>
>> But this should never happen in practice and is only here for precaution. So do
>> you really see that in practice?
> Yes, we hit the issue when vcn ib test fail. Could you give some suggestions about how to fix this?
> [ 958.301685] failed to read reg:1a6c0
>
> [ 959.036645] gmc_v10_0_process_interrupt: 42 callbacks suppressed
>
> [ 959.036653] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>
> [ 959.038043] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
>
> [ 959.039014] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>
> [ 959.040202] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
>
> [ 959.041174] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>
> [ 959.042353] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
>
> [ 959.043325] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>
> [ 959.044508] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
>
> [ 959.045480] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>
> [ 959.046659] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
>
> [ 959.047631] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>
> [ 959.048815] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
>
> [ 959.049787] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>
> [ 959.050973] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
>
> [ 959.051950] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0 ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>
> [ 959.053123] amdgpu 0000:00:07.0: in page starting at address 0x0000000000567000 from client 18
>
> [ 967.208705] amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on vcn_enc0 (-110).
>
> [ 967.209879] [drm:amdgpu_device_init [amdgpu]] *ERROR* ib ring test failed (-110).
>
>
>
> [ 1209.384668] INFO: task modprobe:23957 blocked for more than 120 seconds.
>
> [ 1209.385605] Tainted: G OE 5.4.0-58-generic #64~18.04.1-Ubuntu
>
> [ 1209.386451] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>
> [ 1209.387342] modprobe D 0 23957 1221 0x80004006
>
> [ 1209.387344] Call Trace:
>
> [ 1209.387354] __schedule+0x293/0x720
>
> [ 1209.387356] schedule+0x33/0xa0
>
> [ 1209.387357] schedule_timeout+0x1d3/0x320
>
> [ 1209.387359] ? schedule+0x33/0xa0
>
> [ 1209.387360] ? schedule_timeout+0x1d3/0x320
>
> [ 1209.387363] dma_fence_default_wait+0x1b2/0x1e0
>
> [ 1209.387364] ? dma_fence_release+0x130/0x130
>
> [ 1209.387366] dma_fence_wait_timeout+0xfd/0x110
>
> [ 1209.387773] amdgpu_fence_wait_empty+0x90/0xc0 [amdgpu]
>
> [ 1209.387839] amdgpu_fence_driver_fini+0xd6/0x110 [amdgpu]
>> Regards,
>> Christian.
>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
>>>>> ---
>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48
>>>> ++++++++++++++++++++---
>>>>> 1 file changed, 43 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>> index 6b0aeee61b8b..738ea65077ea 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>> @@ -41,6 +41,8 @@
>>>>> #include "amdgpu.h"
>>>>> #include "amdgpu_trace.h"
>>>>>
>>>>> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000) #define
>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
>>>>> /*
>>>>> * Fences
>>>>> * Fences mark an event in the GPUs pipeline and are used @@
>>>>> -104,6
>>>>> +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring *ring,
>>>>> +u32
>>>> seq)
>>>>> *drv->cpu_addr = cpu_to_le32(seq);
>>>>> }
>>>>>
>>>>> +/**
>>>>> + * amdgpu_fence_wait_timeout - get the fence wait timeout
>>>>> + *
>>>>> + * @ring: ring the fence is associated with
>>>>> + *
>>>>> + * Returns the value of the fence wait timeout.
>>>>> + */
>>>>> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring) { long
>>>>> +tmo_gfx, tmo_mm, tmo; struct amdgpu_device *adev = ring->adev;
>>>>> +tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT; if
>> (amdgpu_sriov_vf(adev))
>>>>> +{ tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT; } if
>>>>> +(amdgpu_sriov_runtime(adev)) { tmo_gfx = 8 *
>> AMDGPU_FENCE_TIMEOUT;
>>>>> +} else if (adev->gmc.xgmi.hive_id) { tmo_gfx =
>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT; } if (ring->funcs->type ==
>>>>> +AMDGPU_RING_TYPE_UVD ||
>>>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCE || type ==
>>>>> +ring->funcs->AMDGPU_RING_TYPE_UVD_ENC || type ==
>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_DEC || type ==
>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_ENC || type ==
>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_JPEG)
>>>>> +tmo = tmo_mm;
>>>>> +else
>>>>> +tmo = tmo_gfx;
>>>>> +return tmo;
>>>>> +}
>>>>> +
>>>>> /**
>>>>> * amdgpu_fence_read - read a fence value
>>>>> *
>>>>> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring
>>>>> *ring,
>>>> struct dma_fence **f,
>>>>> rcu_read_unlock();
>>>>>
>>>>> if (old) {
>>>>> -r = dma_fence_wait(old, false);
>>>>> +long timeout;
>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>> +dma_fence_wait_timeout(old, false, timeout);
>>>>> dma_fence_put(old);
>>>>> if (r)
>>>>> -return r;
>>>>> +return r < 0 ? r : 0;
>>>>> }
>>>>> }
>>>>>
>>>>> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct
>>>> amdgpu_ring *ring)
>>>>> return 0;
>>>>> }
>>>>> rcu_read_unlock();
>>>>> -
>>>>> -r = dma_fence_wait(fence, false);
>>>>> +
>>>>> +long timeout;
>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>> +dma_fence_wait_timeout(fence, false, timeout);
>>>>> dma_fence_put(fence);
>>>>> -return r;
>>>>> +return r < 0 ? r : 0;
>>>>> }
>>>>>
>>>>> /**
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flis
>>>> ts.f
>>>> reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>>>>
>> gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C8b116229938b463
>> df87f08d8b7cbf607%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>> C637461433936049544%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>> MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>> ta=HOcLHmmblOUHXATFBl5HC6LOmFq0oXglAh2GFwd6sus%3D&reserve
>>>> d=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-18 7:48 ` Christian König
@ 2021-01-18 11:56 ` Deng, Emily
2021-01-18 16:03 ` Christian König
0 siblings, 1 reply; 12+ messages in thread
From: Deng, Emily @ 2021-01-18 11:56 UTC (permalink / raw)
To: Koenig, Christian, Sun, Roy, amd-gfx@lists.freedesktop.org
[AMD Official Use Only - Internal Distribution Only]
>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Monday, January 18, 2021 3:49 PM
>To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy <Roy.Sun@amd.com>;
>amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>
>Mhm, we could change amdgpu_fence_wait_empty() to timeout. But I think
>that waiting forever here is intentional and the right thing to do.
>
>What happens is that we wait for the hardware to make sure that nothing is
>writing to any memory before we unload the driver.
>
>Now the VCN block has crashed and doesn't respond, but we can't guarantee
>that it is not accidentally writing anywhere.
>
>The only alternative we have is to time out and proceed with the driver unload,
>risking corrupting the memory we free during that should the hardware
>continue to do something.
Hi Christian,
Thanks your suggestion, but still not quite clearly, could you detail the solution to avoid kernel not lockup?
>
>Regards,
>Christian.
>
>Am 18.01.21 um 03:01 schrieb Deng, Emily:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Thursday, January 14, 2021 9:50 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy <Roy.Sun@amd.com>;
>>> amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>
>>> Am 14.01.21 um 03:00 schrieb Deng, Emily:
>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>
>>>>> -----Original Message-----
>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>>> Christian König
>>>>> Sent: Wednesday, January 13, 2021 10:03 PM
>>>>> To: Sun, Roy <Roy.Sun@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>>>
>>>>> Am 13.01.21 um 07:36 schrieb Roy Sun:
>>>>>> This fix bug where when the engine hang, the fence ring will wait
>>>>>> without quit and cause kernel crash
>>>>> NAK, this blocking is intentional unlimited because otherwise we
>>>>> will cause a memory corruption.
>>>>>
>>>>> What is the actual bug you are trying to fix here?
>>>> When some engine hang during initialization, the IB test will fail,
>>>> and fence will never come back, then never could wait the fence back.
>>>> Why we need to wait here forever? We'd better not use forever wait
>>>> which
>>> will cause kernel crash and lockup. And we have
>>> amdgpu_fence_driver_force_completion will let memory free. We should
>>> remove all those forever wait, and replace them with one timeout,
>>> and do the correct error process if timeout.
>>>
>>> This wait here is to make sure we never overwrite the software fence
>>> ring buffer. Without it we would not signal all fences in
>>> amdgpu_fence_driver_force_completion() and cause either memory leak
>>> or corruption.
>>>
>>> Waiting here forever is the right thing to do even when that means
>>> that the submission thread gets stuck forever, cause that is still
>>> better than memory corruption.
>>>
>>> But this should never happen in practice and is only here for
>>> precaution. So do you really see that in practice?
>> Yes, we hit the issue when vcn ib test fail. Could you give some suggestions
>about how to fix this?
>> [ 958.301685] failed to read reg:1a6c0
>>
>> [ 959.036645] gmc_v10_0_process_interrupt: 42 callbacks suppressed
>>
>> [ 959.036653] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>
>> [ 959.038043] amdgpu 0000:00:07.0: in page starting at address
>0x0000000000567000 from client 18
>>
>> [ 959.039014] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>
>> [ 959.040202] amdgpu 0000:00:07.0: in page starting at address
>0x0000000000567000 from client 18
>>
>> [ 959.041174] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>
>> [ 959.042353] amdgpu 0000:00:07.0: in page starting at address
>0x0000000000567000 from client 18
>>
>> [ 959.043325] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>
>> [ 959.044508] amdgpu 0000:00:07.0: in page starting at address
>0x0000000000567000 from client 18
>>
>> [ 959.045480] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>
>> [ 959.046659] amdgpu 0000:00:07.0: in page starting at address
>0x0000000000567000 from client 18
>>
>> [ 959.047631] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>
>> [ 959.048815] amdgpu 0000:00:07.0: in page starting at address
>0x0000000000567000 from client 18
>>
>> [ 959.049787] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>
>> [ 959.050973] amdgpu 0000:00:07.0: in page starting at address
>0x0000000000567000 from client 18
>>
>> [ 959.051950] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>
>> [ 959.053123] amdgpu 0000:00:07.0: in page starting at address
>0x0000000000567000 from client 18
>>
>> [ 967.208705] amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]]
>*ERROR* IB test failed on vcn_enc0 (-110).
>>
>> [ 967.209879] [drm:amdgpu_device_init [amdgpu]] *ERROR* ib ring test
>failed (-110).
>>
>>
>>
>> [ 1209.384668] INFO: task modprobe:23957 blocked for more than 120
>seconds.
>>
>> [ 1209.385605] Tainted: G OE 5.4.0-58-generic #64~18.04.1-
>Ubuntu
>>
>> [ 1209.386451] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
>disables this message.
>>
>> [ 1209.387342] modprobe D 0 23957 1221 0x80004006
>>
>> [ 1209.387344] Call Trace:
>>
>> [ 1209.387354] __schedule+0x293/0x720
>>
>> [ 1209.387356] schedule+0x33/0xa0
>>
>> [ 1209.387357] schedule_timeout+0x1d3/0x320
>>
>> [ 1209.387359] ? schedule+0x33/0xa0
>>
>> [ 1209.387360] ? schedule_timeout+0x1d3/0x320
>>
>> [ 1209.387363] dma_fence_default_wait+0x1b2/0x1e0
>>
>> [ 1209.387364] ? dma_fence_release+0x130/0x130
>>
>> [ 1209.387366] dma_fence_wait_timeout+0xfd/0x110
>>
>> [ 1209.387773] amdgpu_fence_wait_empty+0x90/0xc0 [amdgpu]
>>
>> [ 1209.387839] amdgpu_fence_driver_fini+0xd6/0x110 [amdgpu]
>>> Regards,
>>> Christian.
>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
>>>>>> ---
>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48
>>>>> ++++++++++++++++++++---
>>>>>> 1 file changed, 43 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>> index 6b0aeee61b8b..738ea65077ea 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>> @@ -41,6 +41,8 @@
>>>>>> #include "amdgpu.h"
>>>>>> #include "amdgpu_trace.h"
>>>>>>
>>>>>> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000) #define
>>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
>>>>>> /*
>>>>>> * Fences
>>>>>> * Fences mark an event in the GPUs pipeline and are used @@
>>>>>> -104,6
>>>>>> +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring
>>>>>> +*ring,
>>>>>> +u32
>>>>> seq)
>>>>>> *drv->cpu_addr = cpu_to_le32(seq);
>>>>>> }
>>>>>>
>>>>>> +/**
>>>>>> + * amdgpu_fence_wait_timeout - get the fence wait timeout
>>>>>> + *
>>>>>> + * @ring: ring the fence is associated with
>>>>>> + *
>>>>>> + * Returns the value of the fence wait timeout.
>>>>>> + */
>>>>>> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring) { long
>>>>>> +tmo_gfx, tmo_mm, tmo; struct amdgpu_device *adev = ring->adev;
>>>>>> +tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT; if
>>> (amdgpu_sriov_vf(adev))
>>>>>> +{ tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT; } if
>>>>>> +(amdgpu_sriov_runtime(adev)) { tmo_gfx = 8 *
>>> AMDGPU_FENCE_TIMEOUT;
>>>>>> +} else if (adev->gmc.xgmi.hive_id) { tmo_gfx =
>>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT; } if (ring->funcs->type ==
>>>>>> +AMDGPU_RING_TYPE_UVD ||
>>>>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCE || type ==
>>>>>> +ring->funcs->AMDGPU_RING_TYPE_UVD_ENC || type ==
>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_DEC || type ==
>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_ENC || type ==
>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_JPEG)
>>>>>> +tmo = tmo_mm;
>>>>>> +else
>>>>>> +tmo = tmo_gfx;
>>>>>> +return tmo;
>>>>>> +}
>>>>>> +
>>>>>> /**
>>>>>> * amdgpu_fence_read - read a fence value
>>>>>> *
>>>>>> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring
>>>>>> *ring,
>>>>> struct dma_fence **f,
>>>>>> rcu_read_unlock();
>>>>>>
>>>>>> if (old) {
>>>>>> -r = dma_fence_wait(old, false);
>>>>>> +long timeout;
>>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>>> +dma_fence_wait_timeout(old, false, timeout);
>>>>>> dma_fence_put(old);
>>>>>> if (r)
>>>>>> -return r;
>>>>>> +return r < 0 ? r : 0;
>>>>>> }
>>>>>> }
>>>>>>
>>>>>> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct
>>>>> amdgpu_ring *ring)
>>>>>> return 0;
>>>>>> }
>>>>>> rcu_read_unlock();
>>>>>> -
>>>>>> -r = dma_fence_wait(fence, false);
>>>>>> +
>>>>>> +long timeout;
>>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>>> +dma_fence_wait_timeout(fence, false, timeout);
>>>>>> dma_fence_put(fence);
>>>>>> -return r;
>>>>>> +return r < 0 ? r : 0;
>>>>>> }
>>>>>>
>>>>>> /**
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fl
>>>>> is
>>>>> ts.f
>>>>> reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>>>>>
>>>
>gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C8b116229938b463
>>>
>df87f08d8b7cbf607%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>>>
>C637461433936049544%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>>>
>MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>>>
>ta=HOcLHmmblOUHXATFBl5HC6LOmFq0oXglAh2GFwd6sus%3D&reserve
>>>>> d=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-18 11:56 ` Deng, Emily
@ 2021-01-18 16:03 ` Christian König
2021-01-19 3:23 ` Deng, Emily
0 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2021-01-18 16:03 UTC (permalink / raw)
To: Deng, Emily, Koenig, Christian, Sun, Roy,
amd-gfx@lists.freedesktop.org
Am 18.01.21 um 12:56 schrieb Deng, Emily:
> [AMD Official Use Only - Internal Distribution Only]
>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Monday, January 18, 2021 3:49 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy <Roy.Sun@amd.com>;
>> amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>
>> Mhm, we could change amdgpu_fence_wait_empty() to timeout. But I think
>> that waiting forever here is intentional and the right thing to do.
>>
>> What happens is that we wait for the hardware to make sure that nothing is
>> writing to any memory before we unload the driver.
>>
>> Now the VCN block has crashed and doesn't respond, but we can't guarantee
>> that it is not accidentally writing anywhere.
>>
>> The only alternative we have is to time out and proceed with the driver unload,
>> risking corrupting the memory we free during that should the hardware
>> continue to do something.
> Hi Christian,
> Thanks your suggestion, but still not quite clearly, could you detail the solution to avoid kernel not lockup?
Well as I said that the kernel locks up is intentional here.
Regards,
Christian.
>> Regards,
>> Christian.
>>
>> Am 18.01.21 um 03:01 schrieb Deng, Emily:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Thursday, January 14, 2021 9:50 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy <Roy.Sun@amd.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>>
>>>> Am 14.01.21 um 03:00 schrieb Deng, Emily:
>>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>>>> Christian König
>>>>>> Sent: Wednesday, January 13, 2021 10:03 PM
>>>>>> To: Sun, Roy <Roy.Sun@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>>>>
>>>>>> Am 13.01.21 um 07:36 schrieb Roy Sun:
>>>>>>> This fix bug where when the engine hang, the fence ring will wait
>>>>>>> without quit and cause kernel crash
>>>>>> NAK, this blocking is intentional unlimited because otherwise we
>>>>>> will cause a memory corruption.
>>>>>>
>>>>>> What is the actual bug you are trying to fix here?
>>>>> When some engine hang during initialization, the IB test will fail,
>>>>> and fence will never come back, then never could wait the fence back.
>>>>> Why we need to wait here forever? We'd better not use forever wait
>>>>> which
>>>> will cause kernel crash and lockup. And we have
>>>> amdgpu_fence_driver_force_completion will let memory free. We should
>>>> remove all those forever wait, and replace them with one timeout,
>>>> and do the correct error process if timeout.
>>>>
>>>> This wait here is to make sure we never overwrite the software fence
>>>> ring buffer. Without it we would not signal all fences in
>>>> amdgpu_fence_driver_force_completion() and cause either memory leak
>>>> or corruption.
>>>>
>>>> Waiting here forever is the right thing to do even when that means
>>>> that the submission thread gets stuck forever, cause that is still
>>>> better than memory corruption.
>>>>
>>>> But this should never happen in practice and is only here for
>>>> precaution. So do you really see that in practice?
>>> Yes, we hit the issue when vcn ib test fail. Could you give some suggestions
>> about how to fix this?
>>> [ 958.301685] failed to read reg:1a6c0
>>>
>>> [ 959.036645] gmc_v10_0_process_interrupt: 42 callbacks suppressed
>>>
>>> [ 959.036653] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>
>>> [ 959.038043] amdgpu 0000:00:07.0: in page starting at address
>> 0x0000000000567000 from client 18
>>> [ 959.039014] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>
>>> [ 959.040202] amdgpu 0000:00:07.0: in page starting at address
>> 0x0000000000567000 from client 18
>>> [ 959.041174] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>
>>> [ 959.042353] amdgpu 0000:00:07.0: in page starting at address
>> 0x0000000000567000 from client 18
>>> [ 959.043325] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>
>>> [ 959.044508] amdgpu 0000:00:07.0: in page starting at address
>> 0x0000000000567000 from client 18
>>> [ 959.045480] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>
>>> [ 959.046659] amdgpu 0000:00:07.0: in page starting at address
>> 0x0000000000567000 from client 18
>>> [ 959.047631] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>
>>> [ 959.048815] amdgpu 0000:00:07.0: in page starting at address
>> 0x0000000000567000 from client 18
>>> [ 959.049787] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>
>>> [ 959.050973] amdgpu 0000:00:07.0: in page starting at address
>> 0x0000000000567000 from client 18
>>> [ 959.051950] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>
>>> [ 959.053123] amdgpu 0000:00:07.0: in page starting at address
>> 0x0000000000567000 from client 18
>>> [ 967.208705] amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]]
>> *ERROR* IB test failed on vcn_enc0 (-110).
>>> [ 967.209879] [drm:amdgpu_device_init [amdgpu]] *ERROR* ib ring test
>> failed (-110).
>>>
>>>
>>> [ 1209.384668] INFO: task modprobe:23957 blocked for more than 120
>> seconds.
>>> [ 1209.385605] Tainted: G OE 5.4.0-58-generic #64~18.04.1-
>> Ubuntu
>>> [ 1209.386451] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
>> disables this message.
>>> [ 1209.387342] modprobe D 0 23957 1221 0x80004006
>>>
>>> [ 1209.387344] Call Trace:
>>>
>>> [ 1209.387354] __schedule+0x293/0x720
>>>
>>> [ 1209.387356] schedule+0x33/0xa0
>>>
>>> [ 1209.387357] schedule_timeout+0x1d3/0x320
>>>
>>> [ 1209.387359] ? schedule+0x33/0xa0
>>>
>>> [ 1209.387360] ? schedule_timeout+0x1d3/0x320
>>>
>>> [ 1209.387363] dma_fence_default_wait+0x1b2/0x1e0
>>>
>>> [ 1209.387364] ? dma_fence_release+0x130/0x130
>>>
>>> [ 1209.387366] dma_fence_wait_timeout+0xfd/0x110
>>>
>>> [ 1209.387773] amdgpu_fence_wait_empty+0x90/0xc0 [amdgpu]
>>>
>>> [ 1209.387839] amdgpu_fence_driver_fini+0xd6/0x110 [amdgpu]
>>>> Regards,
>>>> Christian.
>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
>>>>>>> ---
>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48
>>>>>> ++++++++++++++++++++---
>>>>>>> 1 file changed, 43 insertions(+), 5 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>> index 6b0aeee61b8b..738ea65077ea 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>> @@ -41,6 +41,8 @@
>>>>>>> #include "amdgpu.h"
>>>>>>> #include "amdgpu_trace.h"
>>>>>>>
>>>>>>> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000) #define
>>>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
>>>>>>> /*
>>>>>>> * Fences
>>>>>>> * Fences mark an event in the GPUs pipeline and are used @@
>>>>>>> -104,6
>>>>>>> +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring
>>>>>>> +*ring,
>>>>>>> +u32
>>>>>> seq)
>>>>>>> *drv->cpu_addr = cpu_to_le32(seq);
>>>>>>> }
>>>>>>>
>>>>>>> +/**
>>>>>>> + * amdgpu_fence_wait_timeout - get the fence wait timeout
>>>>>>> + *
>>>>>>> + * @ring: ring the fence is associated with
>>>>>>> + *
>>>>>>> + * Returns the value of the fence wait timeout.
>>>>>>> + */
>>>>>>> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring) { long
>>>>>>> +tmo_gfx, tmo_mm, tmo; struct amdgpu_device *adev = ring->adev;
>>>>>>> +tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT; if
>>>> (amdgpu_sriov_vf(adev))
>>>>>>> +{ tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT; } if
>>>>>>> +(amdgpu_sriov_runtime(adev)) { tmo_gfx = 8 *
>>>> AMDGPU_FENCE_TIMEOUT;
>>>>>>> +} else if (adev->gmc.xgmi.hive_id) { tmo_gfx =
>>>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT; } if (ring->funcs->type ==
>>>>>>> +AMDGPU_RING_TYPE_UVD ||
>>>>>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCE || type ==
>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_UVD_ENC || type ==
>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_DEC || type ==
>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_ENC || type ==
>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_JPEG)
>>>>>>> +tmo = tmo_mm;
>>>>>>> +else
>>>>>>> +tmo = tmo_gfx;
>>>>>>> +return tmo;
>>>>>>> +}
>>>>>>> +
>>>>>>> /**
>>>>>>> * amdgpu_fence_read - read a fence value
>>>>>>> *
>>>>>>> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct amdgpu_ring
>>>>>>> *ring,
>>>>>> struct dma_fence **f,
>>>>>>> rcu_read_unlock();
>>>>>>>
>>>>>>> if (old) {
>>>>>>> -r = dma_fence_wait(old, false);
>>>>>>> +long timeout;
>>>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>>>> +dma_fence_wait_timeout(old, false, timeout);
>>>>>>> dma_fence_put(old);
>>>>>>> if (r)
>>>>>>> -return r;
>>>>>>> +return r < 0 ? r : 0;
>>>>>>> }
>>>>>>> }
>>>>>>>
>>>>>>> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct
>>>>>> amdgpu_ring *ring)
>>>>>>> return 0;
>>>>>>> }
>>>>>>> rcu_read_unlock();
>>>>>>> -
>>>>>>> -r = dma_fence_wait(fence, false);
>>>>>>> +
>>>>>>> +long timeout;
>>>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>>>> +dma_fence_wait_timeout(fence, false, timeout);
>>>>>>> dma_fence_put(fence);
>>>>>>> -return r;
>>>>>>> +return r < 0 ? r : 0;
>>>>>>> }
>>>>>>>
>>>>>>> /**
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fl
>>>>>> is
>>>>>> ts.f
>>>>>> reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>>>>>>
>> gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C8b116229938b463
>> df87f08d8b7cbf607%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>> C637461433936049544%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>> MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>> ta=HOcLHmmblOUHXATFBl5HC6LOmFq0oXglAh2GFwd6sus%3D&reserve
>>>>>> d=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-18 16:03 ` Christian König
@ 2021-01-19 3:23 ` Deng, Emily
2021-01-19 8:05 ` Christian König
0 siblings, 1 reply; 12+ messages in thread
From: Deng, Emily @ 2021-01-19 3:23 UTC (permalink / raw)
To: Koenig, Christian, Sun, Roy, amd-gfx@lists.freedesktop.org
[AMD Official Use Only - Internal Distribution Only]
>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>Christian König
>Sent: Tuesday, January 19, 2021 12:04 AM
>To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian
><Christian.Koenig@amd.com>; Sun, Roy <Roy.Sun@amd.com>; amd-
>gfx@lists.freedesktop.org
>Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>
>Am 18.01.21 um 12:56 schrieb Deng, Emily:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Monday, January 18, 2021 3:49 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy <Roy.Sun@amd.com>;
>>> amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>
>>> Mhm, we could change amdgpu_fence_wait_empty() to timeout. But I
>>> think that waiting forever here is intentional and the right thing to do.
>>>
>>> What happens is that we wait for the hardware to make sure that
>>> nothing is writing to any memory before we unload the driver.
>>>
>>> Now the VCN block has crashed and doesn't respond, but we can't
>>> guarantee that it is not accidentally writing anywhere.
>>>
>>> The only alternative we have is to time out and proceed with the
>>> driver unload, risking corrupting the memory we free during that
>>> should the hardware continue to do something.
>> Hi Christian,
>> Thanks your suggestion, but still not quite clearly, could you detail the
>solution to avoid kernel not lockup?
>
>Well as I said that the kernel locks up is intentional here.
So you think the lock up is better than only some memory corruption? Because we could give more time, such as 60s to wait, I don't think the fence won't be signaled within 60s if the engine is good. So
when the engine is ok, it won't cause memory corruption with the timeout. When engine is bad, the fence will never be signaled, so even we force completion, it still won't cause memory corruption. As for sriov, when engine
is bad, we still could do recover, and do driver reload to make the driver works ok again, so we don't want the kernel lockup.
>
>Regards,
>Christian.
>
>>> Regards,
>>> Christian.
>>>
>>> Am 18.01.21 um 03:01 schrieb Deng, Emily:
>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Thursday, January 14, 2021 9:50 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy
><Roy.Sun@amd.com>;
>>>>> amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>>>
>>>>> Am 14.01.21 um 03:00 schrieb Deng, Emily:
>>>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf
>>>>>>> Of Christian König
>>>>>>> Sent: Wednesday, January 13, 2021 10:03 PM
>>>>>>> To: Sun, Roy <Roy.Sun@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait
>>>>>>> timeout
>>>>>>>
>>>>>>> Am 13.01.21 um 07:36 schrieb Roy Sun:
>>>>>>>> This fix bug where when the engine hang, the fence ring will
>>>>>>>> wait without quit and cause kernel crash
>>>>>>> NAK, this blocking is intentional unlimited because otherwise we
>>>>>>> will cause a memory corruption.
>>>>>>>
>>>>>>> What is the actual bug you are trying to fix here?
>>>>>> When some engine hang during initialization, the IB test will
>>>>>> fail, and fence will never come back, then never could wait the fence
>back.
>>>>>> Why we need to wait here forever? We'd better not use forever wait
>>>>>> which
>>>>> will cause kernel crash and lockup. And we have
>>>>> amdgpu_fence_driver_force_completion will let memory free. We
>>>>> should remove all those forever wait, and replace them with one
>>>>> timeout, and do the correct error process if timeout.
>>>>>
>>>>> This wait here is to make sure we never overwrite the software
>>>>> fence ring buffer. Without it we would not signal all fences in
>>>>> amdgpu_fence_driver_force_completion() and cause either memory
>leak
>>>>> or corruption.
>>>>>
>>>>> Waiting here forever is the right thing to do even when that means
>>>>> that the submission thread gets stuck forever, cause that is still
>>>>> better than memory corruption.
>>>>>
>>>>> But this should never happen in practice and is only here for
>>>>> precaution. So do you really see that in practice?
>>>> Yes, we hit the issue when vcn ib test fail. Could you give some
>>>> suggestions
>>> about how to fix this?
>>>> [ 958.301685] failed to read reg:1a6c0
>>>>
>>>> [ 959.036645] gmc_v10_0_process_interrupt: 42 callbacks suppressed
>>>>
>>>> [ 959.036653] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>
>>>> [ 959.038043] amdgpu 0000:00:07.0: in page starting at address
>>> 0x0000000000567000 from client 18
>>>> [ 959.039014] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>
>>>> [ 959.040202] amdgpu 0000:00:07.0: in page starting at address
>>> 0x0000000000567000 from client 18
>>>> [ 959.041174] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>
>>>> [ 959.042353] amdgpu 0000:00:07.0: in page starting at address
>>> 0x0000000000567000 from client 18
>>>> [ 959.043325] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>
>>>> [ 959.044508] amdgpu 0000:00:07.0: in page starting at address
>>> 0x0000000000567000 from client 18
>>>> [ 959.045480] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>
>>>> [ 959.046659] amdgpu 0000:00:07.0: in page starting at address
>>> 0x0000000000567000 from client 18
>>>> [ 959.047631] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>
>>>> [ 959.048815] amdgpu 0000:00:07.0: in page starting at address
>>> 0x0000000000567000 from client 18
>>>> [ 959.049787] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>
>>>> [ 959.050973] amdgpu 0000:00:07.0: in page starting at address
>>> 0x0000000000567000 from client 18
>>>> [ 959.051950] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>
>>>> [ 959.053123] amdgpu 0000:00:07.0: in page starting at address
>>> 0x0000000000567000 from client 18
>>>> [ 967.208705] amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests
>>>> [amdgpu]]
>>> *ERROR* IB test failed on vcn_enc0 (-110).
>>>> [ 967.209879] [drm:amdgpu_device_init [amdgpu]] *ERROR* ib ring
>>>> test
>>> failed (-110).
>>>>
>>>>
>>>> [ 1209.384668] INFO: task modprobe:23957 blocked for more than 120
>>> seconds.
>>>> [ 1209.385605] Tainted: G OE 5.4.0-58-generic #64~18.04.1-
>>> Ubuntu
>>>> [ 1209.386451] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
>>> disables this message.
>>>> [ 1209.387342] modprobe D 0 23957 1221 0x80004006
>>>>
>>>> [ 1209.387344] Call Trace:
>>>>
>>>> [ 1209.387354] __schedule+0x293/0x720
>>>>
>>>> [ 1209.387356] schedule+0x33/0xa0
>>>>
>>>> [ 1209.387357] schedule_timeout+0x1d3/0x320
>>>>
>>>> [ 1209.387359] ? schedule+0x33/0xa0
>>>>
>>>> [ 1209.387360] ? schedule_timeout+0x1d3/0x320
>>>>
>>>> [ 1209.387363] dma_fence_default_wait+0x1b2/0x1e0
>>>>
>>>> [ 1209.387364] ? dma_fence_release+0x130/0x130
>>>>
>>>> [ 1209.387366] dma_fence_wait_timeout+0xfd/0x110
>>>>
>>>> [ 1209.387773] amdgpu_fence_wait_empty+0x90/0xc0 [amdgpu]
>>>>
>>>> [ 1209.387839] amdgpu_fence_driver_fini+0xd6/0x110 [amdgpu]
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
>>>>>>>> ---
>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48
>>>>>>> ++++++++++++++++++++---
>>>>>>>> 1 file changed, 43 insertions(+), 5 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>> index 6b0aeee61b8b..738ea65077ea 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>> @@ -41,6 +41,8 @@
>>>>>>>> #include "amdgpu.h"
>>>>>>>> #include "amdgpu_trace.h"
>>>>>>>>
>>>>>>>> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000)
>#define
>>>>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
>>>>>>>> /*
>>>>>>>> * Fences
>>>>>>>> * Fences mark an event in the GPUs pipeline and are used
>>>>>>>> @@
>>>>>>>> -104,6
>>>>>>>> +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring
>>>>>>>> +*ring,
>>>>>>>> +u32
>>>>>>> seq)
>>>>>>>> *drv->cpu_addr = cpu_to_le32(seq);
>>>>>>>> }
>>>>>>>>
>>>>>>>> +/**
>>>>>>>> + * amdgpu_fence_wait_timeout - get the fence wait timeout
>>>>>>>> + *
>>>>>>>> + * @ring: ring the fence is associated with
>>>>>>>> + *
>>>>>>>> + * Returns the value of the fence wait timeout.
>>>>>>>> + */
>>>>>>>> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring) { long
>>>>>>>> +tmo_gfx, tmo_mm, tmo; struct amdgpu_device *adev = ring->adev;
>>>>>>>> +tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT; if
>>>>> (amdgpu_sriov_vf(adev))
>>>>>>>> +{ tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT; } if
>>>>>>>> +(amdgpu_sriov_runtime(adev)) { tmo_gfx = 8 *
>>>>> AMDGPU_FENCE_TIMEOUT;
>>>>>>>> +} else if (adev->gmc.xgmi.hive_id) { tmo_gfx =
>>>>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT; } if (ring->funcs->type ==
>>>>>>>> +AMDGPU_RING_TYPE_UVD ||
>>>>>>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCE || type ==
>>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_UVD_ENC || type ==
>>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_DEC || type ==
>>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_ENC || type ==
>>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_JPEG)
>>>>>>>> +tmo = tmo_mm;
>>>>>>>> +else
>>>>>>>> +tmo = tmo_gfx;
>>>>>>>> +return tmo;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> /**
>>>>>>>> * amdgpu_fence_read - read a fence value
>>>>>>>> *
>>>>>>>> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct
>amdgpu_ring
>>>>>>>> *ring,
>>>>>>> struct dma_fence **f,
>>>>>>>> rcu_read_unlock();
>>>>>>>>
>>>>>>>> if (old) {
>>>>>>>> -r = dma_fence_wait(old, false);
>>>>>>>> +long timeout;
>>>>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>>>>> +dma_fence_wait_timeout(old, false, timeout);
>>>>>>>> dma_fence_put(old);
>>>>>>>> if (r)
>>>>>>>> -return r;
>>>>>>>> +return r < 0 ? r : 0;
>>>>>>>> }
>>>>>>>> }
>>>>>>>>
>>>>>>>> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct
>>>>>>> amdgpu_ring *ring)
>>>>>>>> return 0;
>>>>>>>> }
>>>>>>>> rcu_read_unlock();
>>>>>>>> -
>>>>>>>> -r = dma_fence_wait(fence, false);
>>>>>>>> +
>>>>>>>> +long timeout;
>>>>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>>>>> +dma_fence_wait_timeout(fence, false, timeout);
>>>>>>>> dma_fence_put(fence);
>>>>>>>> -return r;
>>>>>>>> +return r < 0 ? r : 0;
>>>>>>>> }
>>>>>>>>
>>>>>>>> /**
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>
>https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2
>>>>>>> Fl
>>>>>>> is
>>>>>>> ts.f
>>>>>>> reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>>>>>>>
>>>
>gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C8b116229938b463
>>>
>df87f08d8b7cbf607%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>>>
>C637461433936049544%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>>>
>MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>>>
>ta=HOcLHmmblOUHXATFBl5HC6LOmFq0oXglAh2GFwd6sus%3D&reserve
>>>>>>> d=0
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-
>gfx&data=04%7C01%7CEm
>>
>ily.Deng%40amd.com%7C5a6d3a94dc7d4190477a08d8bbcaa509%7C3dd896
>1fe4884e
>>
>608e11a82d994e183d%7C0%7C0%7C637465826316473967%7CUnknown%7
>CTWFpbGZsb3
>>
>d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%
>3D%7
>>
>C1000&sdata=7XUhFg5H0FTO8RByofukETH7yBz9RBHKkUQmHZP3z34%3
>D&res
>> erved=0
>
>_______________________________________________
>amd-gfx mailing list
>amd-gfx@lists.freedesktop.org
>https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.f
>reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C5a6d3a94dc7d4190
>477a08d8bbcaa509%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>C637465826316473967%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>ta=7XUhFg5H0FTO8RByofukETH7yBz9RBHKkUQmHZP3z34%3D&reserve
>d=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
2021-01-19 3:23 ` Deng, Emily
@ 2021-01-19 8:05 ` Christian König
0 siblings, 0 replies; 12+ messages in thread
From: Christian König @ 2021-01-19 8:05 UTC (permalink / raw)
To: Deng, Emily, Sun, Roy, amd-gfx@lists.freedesktop.org
Am 19.01.21 um 04:23 schrieb Deng, Emily:
> [AMD Official Use Only - Internal Distribution Only]
>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Christian König
>> Sent: Tuesday, January 19, 2021 12:04 AM
>> To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; Sun, Roy <Roy.Sun@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>
>> Am 18.01.21 um 12:56 schrieb Deng, Emily:
>>> [AMD Official Use Only - Internal Distribution Only]
>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Monday, January 18, 2021 3:49 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy <Roy.Sun@amd.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>>
>>>> Mhm, we could change amdgpu_fence_wait_empty() to timeout. But I
>>>> think that waiting forever here is intentional and the right thing to do.
>>>>
>>>> What happens is that we wait for the hardware to make sure that
>>>> nothing is writing to any memory before we unload the driver.
>>>>
>>>> Now the VCN block has crashed and doesn't respond, but we can't
>>>> guarantee that it is not accidentally writing anywhere.
>>>>
>>>> The only alternative we have is to time out and proceed with the
>>>> driver unload, risking corrupting the memory we free during that
>>>> should the hardware continue to do something.
>>> Hi Christian,
>>> Thanks your suggestion, but still not quite clearly, could you detail the
>> solution to avoid kernel not lockup?
>>
>> Well as I said that the kernel locks up is intentional here.
> So you think the lock up is better than only some memory corruption?
Yes exactly.
> Because we could give more time, such as 60s to wait, I don't think the fence won't be signaled within 60s if the engine is good. So
> when the engine is ok, it won't cause memory corruption with the timeout. When engine is bad, the fence will never be signaled, so even we force completion, it still won't cause memory corruption.
Unfortunately it's not that easy. See for example some engines keep
updating statistics (how many bytes written, samples encoded etc...)
even when the engines is stuck in an endless loop.
> As for sriov, when engine
> is bad, we still could do recover, and do driver reload to make the driver works ok again, so we don't want the kernel lockup.
Well that is certainly something you could try, e.g. wait for empty with
a timeout and if that times out triggers do a GPU reset and wait again.
Alternative would be to see if we could disable system memory access
from the device all together with the PCIe config regs if we can't get
it idle again.
Regards,
Christian.
>> Regards,
>> Christian.
>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 18.01.21 um 03:01 schrieb Deng, Emily:
>>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Thursday, January 14, 2021 9:50 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; Sun, Roy
>> <Roy.Sun@amd.com>;
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait timeout
>>>>>>
>>>>>> Am 14.01.21 um 03:00 schrieb Deng, Emily:
>>>>>>> [AMD Official Use Only - Internal Distribution Only]
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf
>>>>>>>> Of Christian König
>>>>>>>> Sent: Wednesday, January 13, 2021 10:03 PM
>>>>>>>> To: Sun, Roy <Roy.Sun@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH] drm/amdgpu: change the fence ring wait
>>>>>>>> timeout
>>>>>>>>
>>>>>>>> Am 13.01.21 um 07:36 schrieb Roy Sun:
>>>>>>>>> This fix bug where when the engine hang, the fence ring will
>>>>>>>>> wait without quit and cause kernel crash
>>>>>>>> NAK, this blocking is intentional unlimited because otherwise we
>>>>>>>> will cause a memory corruption.
>>>>>>>>
>>>>>>>> What is the actual bug you are trying to fix here?
>>>>>>> When some engine hang during initialization, the IB test will
>>>>>>> fail, and fence will never come back, then never could wait the fence
>> back.
>>>>>>> Why we need to wait here forever? We'd better not use forever wait
>>>>>>> which
>>>>>> will cause kernel crash and lockup. And we have
>>>>>> amdgpu_fence_driver_force_completion will let memory free. We
>>>>>> should remove all those forever wait, and replace them with one
>>>>>> timeout, and do the correct error process if timeout.
>>>>>>
>>>>>> This wait here is to make sure we never overwrite the software
>>>>>> fence ring buffer. Without it we would not signal all fences in
>>>>>> amdgpu_fence_driver_force_completion() and cause either memory
>> leak
>>>>>> or corruption.
>>>>>>
>>>>>> Waiting here forever is the right thing to do even when that means
>>>>>> that the submission thread gets stuck forever, cause that is still
>>>>>> better than memory corruption.
>>>>>>
>>>>>> But this should never happen in practice and is only here for
>>>>>> precaution. So do you really see that in practice?
>>>>> Yes, we hit the issue when vcn ib test fail. Could you give some
>>>>> suggestions
>>>> about how to fix this?
>>>>> [ 958.301685] failed to read reg:1a6c0
>>>>>
>>>>> [ 959.036645] gmc_v10_0_process_interrupt: 42 callbacks suppressed
>>>>>
>>>>> [ 959.036653] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>>
>>>>> [ 959.038043] amdgpu 0000:00:07.0: in page starting at address
>>>> 0x0000000000567000 from client 18
>>>>> [ 959.039014] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>>
>>>>> [ 959.040202] amdgpu 0000:00:07.0: in page starting at address
>>>> 0x0000000000567000 from client 18
>>>>> [ 959.041174] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>>
>>>>> [ 959.042353] amdgpu 0000:00:07.0: in page starting at address
>>>> 0x0000000000567000 from client 18
>>>>> [ 959.043325] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>>
>>>>> [ 959.044508] amdgpu 0000:00:07.0: in page starting at address
>>>> 0x0000000000567000 from client 18
>>>>> [ 959.045480] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>>
>>>>> [ 959.046659] amdgpu 0000:00:07.0: in page starting at address
>>>> 0x0000000000567000 from client 18
>>>>> [ 959.047631] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>>
>>>>> [ 959.048815] amdgpu 0000:00:07.0: in page starting at address
>>>> 0x0000000000567000 from client 18
>>>>> [ 959.049787] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>>
>>>>> [ 959.050973] amdgpu 0000:00:07.0: in page starting at address
>>>> 0x0000000000567000 from client 18
>>>>> [ 959.051950] amdgpu 0000:00:07.0: [mmhub] page fault (src_id:0
>>>>> ring:0 vmid:0 pasid:0, for process pid 0 thread pid 0)
>>>>>
>>>>> [ 959.053123] amdgpu 0000:00:07.0: in page starting at address
>>>> 0x0000000000567000 from client 18
>>>>> [ 967.208705] amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests
>>>>> [amdgpu]]
>>>> *ERROR* IB test failed on vcn_enc0 (-110).
>>>>> [ 967.209879] [drm:amdgpu_device_init [amdgpu]] *ERROR* ib ring
>>>>> test
>>>> failed (-110).
>>>>>
>>>>> [ 1209.384668] INFO: task modprobe:23957 blocked for more than 120
>>>> seconds.
>>>>> [ 1209.385605] Tainted: G OE 5.4.0-58-generic #64~18.04.1-
>>>> Ubuntu
>>>>> [ 1209.386451] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
>>>> disables this message.
>>>>> [ 1209.387342] modprobe D 0 23957 1221 0x80004006
>>>>>
>>>>> [ 1209.387344] Call Trace:
>>>>>
>>>>> [ 1209.387354] __schedule+0x293/0x720
>>>>>
>>>>> [ 1209.387356] schedule+0x33/0xa0
>>>>>
>>>>> [ 1209.387357] schedule_timeout+0x1d3/0x320
>>>>>
>>>>> [ 1209.387359] ? schedule+0x33/0xa0
>>>>>
>>>>> [ 1209.387360] ? schedule_timeout+0x1d3/0x320
>>>>>
>>>>> [ 1209.387363] dma_fence_default_wait+0x1b2/0x1e0
>>>>>
>>>>> [ 1209.387364] ? dma_fence_release+0x130/0x130
>>>>>
>>>>> [ 1209.387366] dma_fence_wait_timeout+0xfd/0x110
>>>>>
>>>>> [ 1209.387773] amdgpu_fence_wait_empty+0x90/0xc0 [amdgpu]
>>>>>
>>>>> [ 1209.387839] amdgpu_fence_driver_fini+0xd6/0x110 [amdgpu]
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>>> Signed-off-by: Roy Sun <Roy.Sun@amd.com>
>>>>>>>>> ---
>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 48
>>>>>>>> ++++++++++++++++++++---
>>>>>>>>> 1 file changed, 43 insertions(+), 5 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>>> index 6b0aeee61b8b..738ea65077ea 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>>> @@ -41,6 +41,8 @@
>>>>>>>>> #include "amdgpu.h"
>>>>>>>>> #include "amdgpu_trace.h"
>>>>>>>>>
>>>>>>>>> +#define AMDGPU_FENCE_TIMEOUT msecs_to_jiffies(1000)
>> #define
>>>>>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT msecs_to_jiffies(2000)
>>>>>>>>> /*
>>>>>>>>> * Fences
>>>>>>>>> * Fences mark an event in the GPUs pipeline and are used
>>>>>>>>> @@
>>>>>>>>> -104,6
>>>>>>>>> +106,38 @@ static void amdgpu_fence_write(struct amdgpu_ring
>>>>>>>>> +*ring,
>>>>>>>>> +u32
>>>>>>>> seq)
>>>>>>>>> *drv->cpu_addr = cpu_to_le32(seq);
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> +/**
>>>>>>>>> + * amdgpu_fence_wait_timeout - get the fence wait timeout
>>>>>>>>> + *
>>>>>>>>> + * @ring: ring the fence is associated with
>>>>>>>>> + *
>>>>>>>>> + * Returns the value of the fence wait timeout.
>>>>>>>>> + */
>>>>>>>>> +long amdgpu_fence_wait_timeout(struct amdgpu_ring *ring) { long
>>>>>>>>> +tmo_gfx, tmo_mm, tmo; struct amdgpu_device *adev = ring->adev;
>>>>>>>>> +tmo_mm = tmo_gfx = AMDGPU_FENCE_TIMEOUT; if
>>>>>> (amdgpu_sriov_vf(adev))
>>>>>>>>> +{ tmo_mm = 8 * AMDGPU_FENCE_TIMEOUT; } if
>>>>>>>>> +(amdgpu_sriov_runtime(adev)) { tmo_gfx = 8 *
>>>>>> AMDGPU_FENCE_TIMEOUT;
>>>>>>>>> +} else if (adev->gmc.xgmi.hive_id) { tmo_gfx =
>>>>>>>>> +AMDGPU_FENCE_GFX_XGMI_TIMEOUT; } if (ring->funcs->type ==
>>>>>>>>> +AMDGPU_RING_TYPE_UVD ||
>>>>>>>>> +ring->funcs->type == AMDGPU_RING_TYPE_VCE || type ==
>>>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_UVD_ENC || type ==
>>>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_DEC || type ==
>>>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_ENC || type ==
>>>>>>>>> +ring->funcs->AMDGPU_RING_TYPE_VCN_JPEG)
>>>>>>>>> +tmo = tmo_mm;
>>>>>>>>> +else
>>>>>>>>> +tmo = tmo_gfx;
>>>>>>>>> +return tmo;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>> /**
>>>>>>>>> * amdgpu_fence_read - read a fence value
>>>>>>>>> *
>>>>>>>>> @@ -166,10 +200,12 @@ int amdgpu_fence_emit(struct
>> amdgpu_ring
>>>>>>>>> *ring,
>>>>>>>> struct dma_fence **f,
>>>>>>>>> rcu_read_unlock();
>>>>>>>>>
>>>>>>>>> if (old) {
>>>>>>>>> -r = dma_fence_wait(old, false);
>>>>>>>>> +long timeout;
>>>>>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>>>>>> +dma_fence_wait_timeout(old, false, timeout);
>>>>>>>>> dma_fence_put(old);
>>>>>>>>> if (r)
>>>>>>>>> -return r;
>>>>>>>>> +return r < 0 ? r : 0;
>>>>>>>>> }
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> @@ -343,10 +379,12 @@ int amdgpu_fence_wait_empty(struct
>>>>>>>> amdgpu_ring *ring)
>>>>>>>>> return 0;
>>>>>>>>> }
>>>>>>>>> rcu_read_unlock();
>>>>>>>>> -
>>>>>>>>> -r = dma_fence_wait(fence, false);
>>>>>>>>> +
>>>>>>>>> +long timeout;
>>>>>>>>> +timeout = amdgpu_fence_wait_timeout(ring); r =
>>>>>>>>> +dma_fence_wait_timeout(fence, false, timeout);
>>>>>>>>> dma_fence_put(fence);
>>>>>>>>> -return r;
>>>>>>>>> +return r < 0 ? r : 0;
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> /**
>>>>>>>> _______________________________________________
>>>>>>>> amd-gfx mailing list
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>>
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2
>>>>>>>> Fl
>>>>>>>> is
>>>>>>>> ts.f
>>>>>>>> reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>>>>>>>>
>> gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C8b116229938b463
>> df87f08d8b7cbf607%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>> C637461433936049544%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>> MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>> ta=HOcLHmmblOUHXATFBl5HC6LOmFq0oXglAh2GFwd6sus%3D&reserve
>>>>>>>> d=0
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
>>> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-
>> gfx&data=04%7C01%7CEm
>> ily.Deng%40amd.com%7C5a6d3a94dc7d4190477a08d8bbcaa509%7C3dd896
>> 1fe4884e
>> 608e11a82d994e183d%7C0%7C0%7C637465826316473967%7CUnknown%7
>> CTWFpbGZsb3
>> d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%
>> 3D%7
>> C1000&sdata=7XUhFg5H0FTO8RByofukETH7yBz9RBHKkUQmHZP3z34%3
>> D&res
>>> erved=0
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.f
>> reedesktop.org%2Fmailman%2Flistinfo%2Famd-
>> gfx&data=04%7C01%7CEmily.Deng%40amd.com%7C5a6d3a94dc7d4190
>> 477a08d8bbcaa509%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7
>> C637465826316473967%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAw
>> MDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sda
>> ta=7XUhFg5H0FTO8RByofukETH7yBz9RBHKkUQmHZP3z34%3D&reserve
>> d=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2021-01-19 8:06 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-01-13 6:36 [PATCH] drm/amdgpu: change the fence ring wait timeout Roy Sun
2021-01-13 7:05 ` Deng, Emily
2021-01-13 7:57 ` Paul Menzel
2021-01-13 14:03 ` Christian König
2021-01-14 2:00 ` Deng, Emily
2021-01-14 13:49 ` Christian König
2021-01-18 2:01 ` Deng, Emily
2021-01-18 7:48 ` Christian König
2021-01-18 11:56 ` Deng, Emily
2021-01-18 16:03 ` Christian König
2021-01-19 3:23 ` Deng, Emily
2021-01-19 8:05 ` Christian König
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.