* [PATCH 1/5] drm/amdgpu: Add work pool to reset domain
2023-08-11 6:02 [PATCH 0/5] Add work pool to reset domain Lijo Lazar
@ 2023-08-11 6:02 ` Lijo Lazar
2023-08-11 6:02 ` [PATCH 2/5] drm/amdgpu: Move to reset_schedule_work Lijo Lazar
` (4 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Lijo Lazar @ 2023-08-11 6:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Asad.Kamal, Hawking.Zhang
Add a work pool to reset domain. The work pool will be used to schedule
any task in the reset domain. If on successful reset of the domain
indicated by a flag in reset context, all work that are queued will be
drained. Their work handlers won't be executed.
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 104 +++++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 22 +++++
2 files changed, 125 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 02d874799c16..713362a60c9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -117,6 +117,51 @@ void amdgpu_reset_destroy_reset_domain(struct kref *ref)
kvfree(reset_domain);
}
+static void amdgpu_reset_domain_cancel_all_work(struct work_struct *work)
+{
+ struct amdgpu_reset_domain *reset_domain =
+ container_of(work, struct amdgpu_reset_domain, clear);
+ int i;
+
+ for (i = 0; i < AMDGPU_MAX_RESET_WORK; ++i)
+ if (atomic_cmpxchg(&reset_domain->work[i].in_use, 1, 0))
+ cancel_work(&reset_domain->work[i].work);
+
+ drain_workqueue(reset_domain->wq);
+ reset_domain->drain = false;
+}
+
+static void amdgpu_reset_work_handler(struct work_struct *work)
+{
+ struct amdgpu_reset_work *reset_work =
+ container_of(work, struct amdgpu_reset_work, work);
+
+ /* Don't do anything if reset domain is in drain mode */
+ if (reset_work->domain->drain)
+ return;
+
+ reset_work->handler(&reset_work->context);
+ if (reset_work->context.flags & (1U << AMDGPU_RESET_CANCEL_ALL)) {
+ reset_work->domain->drain = true;
+ schedule_work(&reset_work->domain->clear);
+ }
+
+ atomic_set(&reset_work->in_use, 0);
+}
+
+static void
+amdgpu_reset_init_work_pool(struct amdgpu_reset_domain *reset_domain)
+{
+ int i;
+
+ for (i = 0; i < AMDGPU_MAX_RESET_WORK; ++i) {
+ INIT_WORK(&reset_domain->work[i].work,
+ amdgpu_reset_work_handler);
+ atomic_set(&reset_domain->work[i].in_use, 0);
+ reset_domain->work[i].domain = reset_domain;
+ }
+}
+
struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
char *wq_name)
{
@@ -139,6 +184,8 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
}
+ INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work);
+ amdgpu_reset_init_work_pool(reset_domain);
atomic_set(&reset_domain->in_gpu_reset, 0);
atomic_set(&reset_domain->reset_res, 0);
init_rwsem(&reset_domain->sem);
@@ -152,12 +199,67 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
down_write(&reset_domain->sem);
}
-
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
{
atomic_set(&reset_domain->in_gpu_reset, 0);
up_write(&reset_domain->sem);
}
+static int
+amdgpu_reset_domain_get_work(struct amdgpu_reset_domain *reset_domain,
+ struct amdgpu_reset_work **reset_work)
+{
+ int i;
+ if (!reset_work)
+ return -EINVAL;
+
+ *reset_work = NULL;
+ for (i = 0; i < AMDGPU_MAX_RESET_WORK; ++i) {
+ if (!atomic_cmpxchg(&reset_domain->work[i].in_use, 0, 1)) {
+ *reset_work = &reset_domain->work[i];
+ return 0;
+ }
+ }
+ /* All resources occupied */
+
+ return -EBUSY;
+}
+
+static void amdgpu_reset_init_work(struct amdgpu_reset_work *reset_work,
+ struct amdgpu_reset_context *reset_context,
+ amdgpu_reset_work_func_t reset_work_handler)
+{
+ memcpy(&reset_work->context, reset_context, sizeof(*reset_context));
+ reset_work->handler = reset_work_handler;
+}
+
+int amdgpu_reset_schedule_work(struct amdgpu_device *adev,
+ struct amdgpu_reset_context *reset_context,
+ amdgpu_reset_work_func_t reset_work_handler)
+{
+ struct amdgpu_reset_work *reset_work;
+ int ret;
+
+ if (!reset_context || !reset_context->reset_req_dev ||
+ !reset_work_handler)
+ return -EINVAL;
+
+ ret = amdgpu_reset_domain_get_work(adev->reset_domain, &reset_work);
+
+ if (ret)
+ return ret;
+
+ if (!ret) {
+ amdgpu_reset_init_work(reset_work, reset_context,
+ reset_work_handler);
+
+ queue_work(adev->reset_domain->wq, &reset_work->work);
+
+ if (reset_context->flags & (1U << AMDGPU_RESET_SCHEDULE_NOW))
+ flush_work(&reset_work->work);
+ }
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 471d789b33a5..d1393050d3ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -27,12 +27,16 @@
#include "amdgpu.h"
#define AMDGPU_RESET_MAX_HANDLERS 5
+#define AMDGPU_MAX_RESET_WORK 5
enum AMDGPU_RESET_FLAGS {
AMDGPU_NEED_FULL_RESET = 0,
AMDGPU_SKIP_HW_RESET = 1,
AMDGPU_RESET_FOR_DEVICE_REMOVE = 2,
+ AMDGPU_RESET_XCP = 3,
+ AMDGPU_RESET_SCHEDULE_NOW = 4,
+ AMDGPU_RESET_CANCEL_ALL = 5,
};
struct amdgpu_reset_context {
@@ -80,13 +84,28 @@ enum amdgpu_reset_domain_type {
XGMI_HIVE
};
+typedef void (*amdgpu_reset_work_func_t)(
+ struct amdgpu_reset_context *reset_context);
+
+struct amdgpu_reset_work {
+ struct work_struct work;
+ struct amdgpu_reset_context context;
+ struct amdgpu_reset_domain *domain;
+ atomic_t in_use;
+
+ amdgpu_reset_work_func_t handler;
+};
+
struct amdgpu_reset_domain {
struct kref refcount;
struct workqueue_struct *wq;
enum amdgpu_reset_domain_type type;
+ struct amdgpu_reset_work work[AMDGPU_MAX_RESET_WORK];
+ struct work_struct clear;
struct rw_semaphore sem;
atomic_t in_gpu_reset;
atomic_t reset_res;
+ bool drain;
};
@@ -129,6 +148,9 @@ static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma
void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+int amdgpu_reset_schedule_work(struct amdgpu_device *adev,
+ struct amdgpu_reset_context *reset_context,
+ amdgpu_reset_work_func_t handler);
#define for_each_handler(i, handler, reset_ctl) \
for (i = 0; (i < AMDGPU_RESET_MAX_HANDLERS) && \
--
2.25.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 2/5] drm/amdgpu: Move to reset_schedule_work
2023-08-11 6:02 [PATCH 0/5] Add work pool to reset domain Lijo Lazar
2023-08-11 6:02 ` [PATCH 1/5] drm/amdgpu: " Lijo Lazar
@ 2023-08-11 6:02 ` Lijo Lazar
2023-08-11 6:02 ` [PATCH 3/5] drm/amdgpu: Set flags to cancel all pending resets Lijo Lazar
` (3 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Lijo Lazar @ 2023-08-11 6:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Asad.Kamal, Hawking.Zhang
Move recovery handlers to schedule reset work. Make use of the workpool
in the reset domain and delete the individual work items.
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 32 +++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 -
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 -----
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 40 ++++++------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 71 +++++++++++-----------
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 -
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 38 ++++++------
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 44 ++++++--------
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 33 +++++-----
10 files changed, 118 insertions(+), 159 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2e3c7c15cb8e..4186d8342a15 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1096,8 +1096,6 @@ struct amdgpu_device {
bool scpm_enabled;
uint32_t scpm_status;
- struct work_struct reset_work;
-
bool job_hang;
bool dc_enabled;
/* Mask of active clusters */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 629ca1ad75a8..e4c5e8f68843 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -120,21 +120,10 @@ static void amdgpu_doorbell_get_kfd_info(struct amdgpu_device *adev,
}
}
-
-static void amdgpu_amdkfd_reset_work(struct work_struct *work)
+static void amdgpu_amdkfd_reset_work(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
- kfd.reset_work);
-
- struct amdgpu_reset_context reset_context;
-
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+ amdgpu_device_gpu_recover(reset_context->reset_req_dev, NULL,
+ reset_context);
}
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
@@ -200,7 +189,6 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
amdgpu_amdkfd_total_mem_size += adev->gmc.real_vram_size;
- INIT_WORK(&adev->kfd.reset_work, amdgpu_amdkfd_reset_work);
}
}
@@ -268,9 +256,17 @@ int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev)
void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
{
- if (amdgpu_device_should_recover_gpu(adev))
- amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->kfd.reset_work);
+ struct amdgpu_reset_context reset_context;
+
+ if (amdgpu_device_should_recover_gpu(adev)) {
+ memset(&reset_context, 0, sizeof(reset_context));
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+ amdgpu_reset_schedule_work(adev, &reset_context,
+ amdgpu_amdkfd_reset_work);
+ }
}
int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index b34418e3e006..c36501f9ae0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -102,7 +102,6 @@ struct amdgpu_kfd_dev {
int64_t vram_used[MAX_XCP];
uint64_t vram_used_aligned[MAX_XCP];
bool init_complete;
- struct work_struct reset_work;
/* HMM page migration MEMORY_DEVICE_PRIVATE mapping */
struct dev_pagemap pgmap;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9061d79cd387..3e56ccb742bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5152,21 +5152,6 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
-#if defined(CONFIG_DEBUG_FS)
- if (!amdgpu_sriov_vf(adev))
- cancel_work(&adev->reset_work);
-#endif
-
- if (adev->kfd.dev)
- cancel_work(&adev->kfd.reset_work);
-
- if (amdgpu_sriov_vf(adev))
- cancel_work(&adev->virt.flr_work);
-
- if (con && adev->ras_enabled)
- cancel_work(&con->recovery_work);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index c694b41f6461..40786b135f4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -899,6 +899,14 @@ static int amdgpu_debugfs_fence_info_show(struct seq_file *m, void *unused)
return 0;
}
+static void
+amdgpu_debugfs_reset_work(struct amdgpu_reset_context *reset_context)
+{
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
+
+ amdgpu_device_gpu_recover(adev, NULL, reset_context);
+}
+
/*
* amdgpu_debugfs_gpu_recover - manually trigger a gpu reset & recover
*
@@ -908,6 +916,7 @@ static int gpu_recover_get(void *data, u64 *val)
{
struct amdgpu_device *adev = (struct amdgpu_device *)data;
struct drm_device *dev = adev_to_drm(adev);
+ struct amdgpu_reset_context reset_context;
int r;
r = pm_runtime_get_sync(dev->dev);
@@ -916,8 +925,14 @@ static int gpu_recover_get(void *data, u64 *val)
return 0;
}
- if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev->reset_work))
- flush_work(&adev->reset_work);
+ memset(&reset_context, 0, sizeof(reset_context));
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ set_bit(AMDGPU_RESET_SCHEDULE_NOW, &reset_context.flags);
+
+ amdgpu_reset_schedule_work(adev, &reset_context,
+ amdgpu_debugfs_reset_work);
*val = atomic_read(&adev->reset_domain->reset_res);
@@ -931,22 +946,6 @@ DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, gpu_recover_get, NULL,
"%lld\n");
-static void amdgpu_debugfs_reset_work(struct work_struct *work)
-{
- struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
- reset_work);
-
- struct amdgpu_reset_context reset_context;
-
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
-}
-
#endif
void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
@@ -958,12 +957,9 @@ void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
debugfs_create_file("amdgpu_fence_info", 0444, root, adev,
&amdgpu_debugfs_fence_info_fops);
- if (!amdgpu_sriov_vf(adev)) {
-
- INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
+ if (!amdgpu_sriov_vf(adev))
debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
&amdgpu_debugfs_gpu_recover_fops);
- }
#endif
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7689395e44fd..9e8e904434f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2011,12 +2011,11 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
return ret;
}
-static void amdgpu_ras_do_recovery(struct work_struct *work)
+static void amdgpu_ras_do_recovery(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_ras *ras =
- container_of(work, struct amdgpu_ras, recovery_work);
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
struct amdgpu_device *remote_adev = NULL;
- struct amdgpu_device *adev = ras->adev;
struct list_head device_list, *device_list_handle = NULL;
if (!ras->disable_ras_err_cnt_harvest) {
@@ -2040,37 +2039,9 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
amdgpu_put_xgmi_hive(hive);
}
- if (amdgpu_device_should_recover_gpu(ras->adev)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
-
- /* Perform full reset in fatal error mode */
- if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
- set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- else {
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
- ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
- reset_context.method = AMD_RESET_METHOD_MODE2;
- }
-
- /* Fatal error occurs in poison mode, mode1 reset is used to
- * recover gpu.
- */
- if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
- ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
- set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- psp_fatal_error_recovery_quirk(&adev->psp);
- }
- }
+ if (amdgpu_device_should_recover_gpu(ras->adev))
+ amdgpu_device_gpu_recover(ras->adev, NULL, reset_context);
- amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
- }
atomic_set(&ras->in_recovery, 0);
}
@@ -2313,7 +2284,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
}
mutex_init(&con->recovery_lock);
- INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
con->eeprom_control.bad_channel_bitmap = 0;
@@ -3160,9 +3130,38 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev,
int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
{
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+
+ /* Perform full reset in fatal error mode */
+ if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) {
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ }
+ else {
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+ ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ reset_context.method = AMD_RESET_METHOD_MODE2;
+ }
+
+ /* Fatal error occurs in poison mode, mode1 reset is used to
+ * recover gpu.
+ */
+ if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
+ ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ psp_fatal_error_recovery_quirk(&adev->psp);
+ }
+ }
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
- amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
+ amdgpu_reset_schedule_work(ras->adev, &reset_context,
+ amdgpu_ras_do_recovery);
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index fabb83e9d9ae..87e0a8b918df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -237,7 +237,6 @@ struct amdgpu_virt {
uint32_t reg_val_offs;
struct amdgpu_irq_src ack_irq;
struct amdgpu_irq_src rcv_irq;
- struct work_struct flr_work;
struct amdgpu_mm_table mm_table;
const struct amdgpu_virt_ops *ops;
struct amdgpu_vf_error_buffer vf_errors;
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 63725b2ebc03..53fdf6e70ad2 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -249,10 +249,9 @@ static int xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
return 0;
}
-static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
+static void xgpu_ai_mailbox_flr_work(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
- struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
@@ -281,18 +280,10 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
up_write(&adev->reset_domain->sem);
/* Trigger recovery for world switch failure if no TDR */
- if (amdgpu_device_should_recover_gpu(adev)
- && (!amdgpu_device_has_job_running(adev) ||
- adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
- }
+ if (amdgpu_device_should_recover_gpu(adev) &&
+ (!amdgpu_device_has_job_running(adev) ||
+ adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
+ amdgpu_device_gpu_recover(adev, NULL, reset_context);
}
static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -314,14 +305,21 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
enum idh_event event = xgpu_ai_mailbox_peek_msg(adev);
+ struct amdgpu_reset_context reset_context;
switch (event) {
case IDH_FLR_NOTIFICATION:
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
- WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->virt.flr_work),
- "Failed to queue work! at %s",
- __func__);
+ WARN_ONCE(!amdgpu_reset_schedule_work(
+ adev, &reset_context,
+ xgpu_ai_mailbox_flr_work),
+ "Failed to queue work! at %s", __func__);
break;
case IDH_QUERY_ALIVE:
xgpu_ai_mailbox_send_ack(adev);
@@ -388,8 +386,6 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
return r;
}
- INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
-
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 6a68ee946f1c..171fe3e84ddf 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -271,10 +271,9 @@ static int xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
return 0;
}
-static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
+static void xgpu_nv_mailbox_flr_work(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
- struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
@@ -303,21 +302,13 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
up_write(&adev->reset_domain->sem);
/* Trigger recovery for world switch failure if no TDR */
- if (amdgpu_device_should_recover_gpu(adev)
- && (!amdgpu_device_has_job_running(adev) ||
- adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
- adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
- adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
- adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
- }
+ if (amdgpu_device_should_recover_gpu(adev) &&
+ (!amdgpu_device_has_job_running(adev) ||
+ adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
+ adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
+ adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
+ adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
+ amdgpu_device_gpu_recover(adev, NULL, reset_context);
}
static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -342,14 +333,21 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
enum idh_event event = xgpu_nv_mailbox_peek_msg(adev);
+ struct amdgpu_reset_context reset_context;
switch (event) {
case IDH_FLR_NOTIFICATION:
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
- WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->virt.flr_work),
- "Failed to queue work! at %s",
- __func__);
+ WARN_ONCE(!amdgpu_reset_schedule_work(
+ adev, &reset_context,
+ xgpu_nv_mailbox_flr_work),
+ "Failed to queue work! at %s", __func__);
break;
/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
* it byfar since that polling thread will handle it,
@@ -413,8 +411,6 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
return r;
}
- INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
-
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 59f53c743362..a39805bc69c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -510,10 +510,9 @@ static int xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
return 0;
}
-static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
+static void xgpu_vi_mailbox_flr_work(struct amdgpu_reset_context *reset_context)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
- struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+ struct amdgpu_device *adev = reset_context->reset_req_dev;
/* wait until RCV_MSG become 3 */
if (xgpu_vi_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL)) {
@@ -522,16 +521,8 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
}
/* Trigger recovery due to world switch failure */
- if (amdgpu_device_should_recover_gpu(adev)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- amdgpu_device_gpu_recover(adev, NULL, &reset_context);
- }
+ if (amdgpu_device_should_recover_gpu(adev))
+ amdgpu_device_gpu_recover(adev, NULL, reset_context);
}
static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
@@ -553,18 +544,24 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
int r;
+ struct amdgpu_reset_context reset_context;
/* trigger gpu-reset by hypervisor only if TDR disabled */
if (!amdgpu_gpu_recovery) {
/* see what event we get */
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
/* only handle FLR_NOTIFY now */
if (!r && !amdgpu_in_reset(adev))
- WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->virt.flr_work),
- "Failed to queue work! at %s",
- __func__);
+ WARN_ONCE(!amdgpu_reset_schedule_work(
+ adev, &reset_context,
+ xgpu_vi_mailbox_flr_work),
+ "Failed to queue work! at %s", __func__);
}
return 0;
@@ -618,8 +615,6 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
return r;
}
- INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
-
return 0;
}
--
2.25.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 5/5] drm/amdgpu: Add TDR queue for ring
2023-08-11 6:02 [PATCH 0/5] Add work pool to reset domain Lijo Lazar
` (3 preceding siblings ...)
2023-08-11 6:02 ` [PATCH 4/5] drm/amdgpu: Add API to queue and do reset work Lijo Lazar
@ 2023-08-11 6:02 ` Lijo Lazar
2023-08-12 8:23 ` [PATCH 0/5] Add work pool to reset domain Christian König
5 siblings, 0 replies; 9+ messages in thread
From: Lijo Lazar @ 2023-08-11 6:02 UTC (permalink / raw)
To: amd-gfx; +Cc: Alexander.Deucher, Asad.Kamal, Hawking.Zhang
Add a TDR queue for rings to handle job timeouts. Ring's scheduler will
use this queue to for running job timeout handlers. Timeout handler will
then use the appropriate reset domain to handle recovery.
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 16 ++++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 5 +++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
4 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4aee867ec59f..78db74b7a49b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2307,7 +2307,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
ring->num_hw_submission, 0,
- timeout, adev->reset_domain->wq,
+ timeout, ring->tdr_queue,
ring->sched_score, ring->name,
adev->dev);
if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 78476bc75b4e..e081c0056a60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -31,6 +31,16 @@
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"
+static void amdgpu_job_recover(struct amdgpu_reset_context *reset_context)
+{
+ int r;
+
+ r = amdgpu_device_gpu_recover(reset_context->reset_req_dev,
+ reset_context->job, reset_context);
+ if (r)
+ DRM_ERROR("GPU Recovery Failed: %d\n", r);
+}
+
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
@@ -38,7 +48,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
struct amdgpu_task_info ti;
struct amdgpu_device *adev = ring->adev;
int idx;
- int r;
if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
@@ -75,9 +84,8 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
- if (r)
- DRM_ERROR("GPU Recovery Failed: %d\n", r);
+ amdgpu_reset_exec_work(ring->adev, &reset_context,
+ amdgpu_job_recover);
} else {
drm_sched_suspend_timeout(&ring->sched);
if (amdgpu_sriov_vf(adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 80d6e132e409..9b2e5e6e9388 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -357,6 +357,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
num_sched = &adev->gpu_sched[hw_ip][hw_prio].num_scheds;
adev->gpu_sched[hw_ip][hw_prio].sched[(*num_sched)++] =
&ring->sched;
+
+ ring->tdr_queue = create_singlethread_workqueue(ring->name);
}
return 0;
@@ -399,6 +401,9 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring)
if (!ring->is_mes_queue)
ring->adev->rings[ring->idx] = NULL;
+
+ if (ring->tdr_queue)
+ destroy_workqueue(ring->tdr_queue);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index e3f98554bb3c..3c60af19ad05 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -299,6 +299,7 @@ struct amdgpu_ring {
bool is_sw_ring;
unsigned int entry_index;
+ struct workqueue_struct *tdr_queue;
};
#define amdgpu_ring_parse_cs(r, p, job, ib) ((r)->funcs->parse_cs((p), (job), (ib)))
--
2.25.1
^ permalink raw reply related [flat|nested] 9+ messages in thread