* [PATCH 1/2] drm/amdkfd: add an interface to query whether is KFD is active
@ 2024-10-28 14:52 Alex Deucher
2024-10-28 14:52 ` [PATCH 2/2] drm/amdgpu: fix fairness in enforce isolation handling Alex Deucher
2024-10-28 15:19 ` [PATCH 1/2] drm/amdkfd: add an interface to query whether is KFD is active SRINIVASAN SHANMUGAM
0 siblings, 2 replies; 3+ messages in thread
From: Alex Deucher @ 2024-10-28 14:52 UTC (permalink / raw)
To: amd-gfx; +Cc: Alex Deucher
Add an interface to query whether KFD has any active queues.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 9 ++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 ++++++
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 25 ++++++++++++++++++++++
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
4 files changed, 42 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index b545940e512b..82f1c6a19d79 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -890,6 +890,15 @@ int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id)
return kgd2kfd_start_sched(adev->kfd.dev, node_id);
}
+/*
+ * Check whether KFD currently has compute work active.
+ *
+ * Returns false when KFD has not finished initializing; otherwise returns
+ * whatever kgd2kfd_compute_active() reports for @node_id.
+ */
+bool amdgpu_amdkfd_compute_active(struct amdgpu_device *adev, uint32_t node_id)
+{
+	if (!adev->kfd.init_complete)
+		return false;
+
+	return kgd2kfd_compute_active(adev->kfd.dev, node_id);
+}
+
/* Config CGTT_SQ_CLK_CTRL */
int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id,
bool core_override_enable, bool reg_override_enable, bool perfmon_override_enable)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 7e0a22072536..f47f4555437b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -268,6 +268,7 @@ int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id);
int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id);
int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id,
bool core_override_enable, bool reg_override_enable, bool perfmon_override_enable);
+bool amdgpu_amdkfd_compute_active(struct amdgpu_device *adev, uint32_t node_id);
/* Read user wptr from a specified user address space with page fault
@@ -431,6 +432,7 @@ int kgd2kfd_check_and_lock_kfd(void);
void kgd2kfd_unlock_kfd(void);
int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
+bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
#else
static inline int kgd2kfd_init(void)
{
@@ -511,5 +513,10 @@ static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
{
return 0;
}
+
+/*
+ * Fallback stub for the #else configuration: always reports no KFD activity.
+ * It has to be static inline like the neighbouring stubs, because a plain
+ * definition in a header is emitted by every file that includes it.
+ */
+static inline bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
+{
+	return false;
+}
#endif
#endif /* AMDGPU_AMDKFD_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index b1ce73c7a9ab..9d8bdead6e65 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1404,6 +1404,13 @@ void kfd_dec_compute_active(struct kfd_node *node)
WARN_ONCE(count < 0, "Compute profile ref. count error");
}
+/* Report whether the compute_profile count of the node's kfd_dev is non-zero. */
+bool kfd_compute_active(struct kfd_node *node)
+{
+	return atomic_read(&node->kfd->compute_profile) != 0;
+}
+
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask)
{
/*
@@ -1497,6 +1504,24 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
return node->dqm->ops.halt(node->dqm);
}
+/*
+ * Report whether KFD has compute work active, validating @node_id first.
+ *
+ * Returns false if KFD init has not completed or if @node_id is out of
+ * range; the out-of-range case also logs a warning.
+ */
+bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
+{
+	if (!kfd->init_complete)
+		return false;
+
+	if (node_id < kfd->num_nodes)
+		return kfd_compute_active(kfd->nodes[node_id]);
+
+	dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n",
+		 node_id, kfd->num_nodes - 1);
+	return false;
+}
+
#if defined(CONFIG_DEBUG_FS)
/* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 9e5ca0b93b2a..f9140fdb8132 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1518,6 +1518,7 @@ bool kfd_is_locked(void);
/* Compute profile */
void kfd_inc_compute_active(struct kfd_node *dev);
void kfd_dec_compute_active(struct kfd_node *dev);
+bool kfd_compute_active(struct kfd_node *dev);
/* Cgroup Support */
/* Check with device cgroup if @kfd device is accessible */
--
2.47.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH 2/2] drm/amdgpu: fix fairness in enforce isolation handling
2024-10-28 14:52 [PATCH 1/2] drm/amdkfd: add an interface to query whether is KFD is active Alex Deucher
@ 2024-10-28 14:52 ` Alex Deucher
2024-10-28 15:19 ` [PATCH 1/2] drm/amdkfd: add an interface to query whether is KFD is active SRINIVASAN SHANMUGAM
1 sibling, 0 replies; 3+ messages in thread
From: Alex Deucher @ 2024-10-28 14:52 UTC (permalink / raw)
To: amd-gfx; +Cc: Alex Deucher
Make sure KFD gets a turn when serializing access to
the GC IP. Currently non-KFD jobs can starve KFD if they
submit often enough. This patch prevents that by stalling
non-KFD if its time period has elapsed.
v2: fix units
v3: check enablement properly
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 53 ++++++++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 +
3 files changed, 54 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index ab5524b7a259..2f381848c849 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -119,7 +119,7 @@
#define MAX_GPU_INSTANCE 64
-#define GFX_SLICE_PERIOD msecs_to_jiffies(250)
+#define GFX_SLICE_PERIOD_MS 250
struct amdgpu_gpu_instance {
struct amdgpu_device *adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e96984c53e72..b8cc4b146bdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1752,7 +1752,7 @@ static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx,
if (adev->gfx.kfd_sch_req_count[idx] == 0 &&
adev->gfx.kfd_sch_inactive[idx]) {
schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
- GFX_SLICE_PERIOD);
+ msecs_to_jiffies(adev->gfx.enforce_isolation_time[idx]));
}
} else {
if (adev->gfx.kfd_sch_req_count[idx] == 0) {
@@ -1807,8 +1807,9 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
fences += amdgpu_fence_count_emitted(&adev->gfx.compute_ring[i]);
}
if (fences) {
+ /* we've already had our timeslice, so let's wrap this up */
schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
- GFX_SLICE_PERIOD);
+ msecs_to_jiffies(1));
} else {
/* Tell KFD to resume the runqueue */
if (adev->kfd.init_complete) {
@@ -1821,6 +1822,51 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
mutex_unlock(&adev->enforce_isolation_mutex);
}
+/*
+ * Throttle the caller so that KFD gets a turn on partition @idx.
+ *
+ * Does nothing unless adev->enforce_isolation[idx] is set. Under
+ * enforce_isolation_mutex it maintains two per-partition values:
+ * enforce_isolation_jiffies[idx], the jiffies stamp the current slice is
+ * measured from, and enforce_isolation_time[idx], a period in milliseconds
+ * (it is passed to msecs_to_jiffies() where the delayed work is scheduled).
+ * If KFD reports active compute work and at least GFX_SLICE_PERIOD_MS has
+ * elapsed since the stamp, the caller sleeps for GFX_SLICE_PERIOD_MS after
+ * the mutex has been released.
+ */
+static void
+amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev,
+ u32 idx)
+{
+ unsigned long cjiffies;
+ bool wait = false;
+
+ mutex_lock(&adev->enforce_isolation_mutex);
+ if (adev->enforce_isolation[idx]) {
+ /* set the initial values if nothing is set (a zero stamp counts as unset) */
+ if (!adev->gfx.enforce_isolation_jiffies[idx]) {
+ adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
+ adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS;
+ }
+ /* Make sure KFD gets a chance to run */
+ if (amdgpu_amdkfd_compute_active(adev, idx)) {
+ cjiffies = jiffies;
+ if (time_after(cjiffies, adev->gfx.enforce_isolation_jiffies[idx])) {
+ /* from here on cjiffies holds the elapsed jiffies, not a timestamp */
+ cjiffies -= adev->gfx.enforce_isolation_jiffies[idx];
+ if ((jiffies_to_msecs(cjiffies) >= GFX_SLICE_PERIOD_MS)) {
+ /* if our time is up, let KGD work drain before scheduling more */
+ wait = true;
+ /* reset the timer period */
+ adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS;
+ /*
+ * NOTE(review): enforce_isolation_jiffies[idx] is not refreshed on
+ * this path, only on the branches below - confirm this is intended.
+ */
+ } else {
+ /* set the timer period to what's left in our time slice */
+ adev->gfx.enforce_isolation_time[idx] =
+ GFX_SLICE_PERIOD_MS - jiffies_to_msecs(cjiffies);
+ }
+ } else {
+ /* if jiffies wrap around we will just wait a little longer */
+ /* reached whenever time_after() is false; the slice restarts from now */
+ adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
+ }
+ } else {
+ /* if there is no KFD work, then set the full slice period */
+ adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
+ adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS;
+ }
+ }
+ mutex_unlock(&adev->enforce_isolation_mutex);
+
+ /* sleep only after dropping the mutex so other paths taking it are not blocked */
+ if (wait)
+ msleep(GFX_SLICE_PERIOD_MS);
+}
+
void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
@@ -1837,6 +1883,9 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
if (idx >= MAX_XCP)
return;
+ /* Don't submit more work until KFD has had some time */
+ amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx);
+
mutex_lock(&adev->enforce_isolation_mutex);
if (adev->enforce_isolation[idx]) {
if (adev->kfd.init_complete)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index f710178a21bc..af9dbd760fee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -472,6 +472,8 @@ struct amdgpu_gfx {
struct mutex kfd_sch_mutex;
u64 kfd_sch_req_count[MAX_XCP];
bool kfd_sch_inactive[MAX_XCP];
+ unsigned long enforce_isolation_jiffies[MAX_XCP];
+ unsigned long enforce_isolation_time[MAX_XCP];
};
struct amdgpu_gfx_ras_reg_entry {
--
2.47.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH 1/2] drm/amdkfd: add an interface to query whether is KFD is active
2024-10-28 14:52 [PATCH 1/2] drm/amdkfd: add an interface to query whether is KFD is active Alex Deucher
2024-10-28 14:52 ` [PATCH 2/2] drm/amdgpu: fix fairness in enforce isolation handling Alex Deucher
@ 2024-10-28 15:19 ` SRINIVASAN SHANMUGAM
1 sibling, 0 replies; 3+ messages in thread
From: SRINIVASAN SHANMUGAM @ 2024-10-28 15:19 UTC (permalink / raw)
To: Alex Deucher, amd-gfx
The series is:
Acked-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
On 10/28/2024 8:22 PM, Alex Deucher wrote:
> Add an interface to query whether KFD has any active queues.
>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 9 ++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 ++++++
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 25 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
> 4 files changed, 42 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index b545940e512b..82f1c6a19d79 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -890,6 +890,15 @@ int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id)
> return kgd2kfd_start_sched(adev->kfd.dev, node_id);
> }
>
> +/* check if there are KFD queues active */
> +bool amdgpu_amdkfd_compute_active(struct amdgpu_device *adev, uint32_t node_id)
> +{
> + if (!adev->kfd.init_complete)
> + return 0;
> +
> + return kgd2kfd_compute_active(adev->kfd.dev, node_id);
> +}
> +
> /* Config CGTT_SQ_CLK_CTRL */
> int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id,
> bool core_override_enable, bool reg_override_enable, bool perfmon_override_enable)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 7e0a22072536..f47f4555437b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -268,6 +268,7 @@ int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id);
> int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id);
> int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id,
> bool core_override_enable, bool reg_override_enable, bool perfmon_override_enable);
> +bool amdgpu_amdkfd_compute_active(struct amdgpu_device *adev, uint32_t node_id);
>
>
> /* Read user wptr from a specified user address space with page fault
> @@ -431,6 +432,7 @@ int kgd2kfd_check_and_lock_kfd(void);
> void kgd2kfd_unlock_kfd(void);
> int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
> int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
> +bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
> #else
> static inline int kgd2kfd_init(void)
> {
> @@ -511,5 +513,10 @@ static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
> {
> return 0;
> }
> +
> +bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
> +{
> + return false;
> +}
> #endif
> #endif /* AMDGPU_AMDKFD_H_INCLUDED */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index b1ce73c7a9ab..9d8bdead6e65 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -1404,6 +1404,13 @@ void kfd_dec_compute_active(struct kfd_node *node)
> WARN_ONCE(count < 0, "Compute profile ref. count error");
> }
>
> +bool kfd_compute_active(struct kfd_node *node)
> +{
> + if (atomic_read(&node->kfd->compute_profile))
> + return true;
> + return false;
> +}
> +
> void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask)
> {
> /*
> @@ -1497,6 +1504,24 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
> return node->dqm->ops.halt(node->dqm);
> }
>
> +bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
> +{
> + struct kfd_node *node;
> +
> + if (!kfd->init_complete)
> + return false;
> +
> + if (node_id >= kfd->num_nodes) {
> + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n",
> + node_id, kfd->num_nodes - 1);
> + return false;
> + }
> +
> + node = kfd->nodes[node_id];
> +
> + return kfd_compute_active(node);
> +}
> +
> #if defined(CONFIG_DEBUG_FS)
>
> /* This function will send a package to HIQ to hang the HWS
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 9e5ca0b93b2a..f9140fdb8132 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1518,6 +1518,7 @@ bool kfd_is_locked(void);
> /* Compute profile */
> void kfd_inc_compute_active(struct kfd_node *dev);
> void kfd_dec_compute_active(struct kfd_node *dev);
> +bool kfd_compute_active(struct kfd_node *dev);
>
> /* Cgroup Support */
> /* Check with device cgroup if @kfd device is accessible */
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2024-10-28 15:19 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-10-28 14:52 [PATCH 1/2] drm/amdkfd: add an interface to query whether is KFD is active Alex Deucher
2024-10-28 14:52 ` [PATCH 2/2] drm/amdgpu: fix fairness in enforce isolation handling Alex Deucher
2024-10-28 15:19 ` [PATCH 1/2] drm/amdkfd: add an interface to query whether is KFD is active SRINIVASAN SHANMUGAM
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox