On Wed, Sep 24, 2025 at 11:29:22AM +0200, Michal Wajdeczko wrote:I'll let Tomasz respond to most of the commennts as it is his patch but replying to last comment.
ok, will answer.
On 9/24/2025 3:15 AM, Matthew Brost wrote:From: Tomasz Lis <tomasz.lis@intel.com> Protect access to GGTT config as this is non-static information. Signed-off-by: Matthew Brost <matthew.brost@intel.com> Signed-off-by: Tomasz Lis <tomasz.lis@intel.com> --- drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 96 ++++++++++++++++++----- drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h | 3 + drivers/gpu/drm/xe/xe_sriov_vf.c | 6 ++ 3 files changed, 84 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c index 0461d5513487..016c867e5e2b 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c @@ -440,18 +440,21 @@ static int vf_get_ggtt_info(struct xe_gt *gt) xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); + down_write(&config->lock);do we need to protect also VF2GUC communication ..+ err = guc_action_query_single_klv64(guc, GUC_KLV_VF_CFG_GGTT_START_KEY, &start); if (unlikely(err)) - return err; + goto out; err = guc_action_query_single_klv64(guc, GUC_KLV_VF_CFG_GGTT_SIZE_KEY, &size); if (unlikely(err)) - return err; + goto out;.. or just RW access to the data itself from here?
The protection must start before we receive GGTT start. It would make little sense to protect just manipulation of data which we've already received.
The post-migration recovery must have a clear state: either we got the GGTT already, or not. It cannot have a state "it's not written everywhere yet, but GuC may have sent it already, who knows".
The GGTT address must be protected from before the moment GuC reads it in order to write to VF G2H CTB.
if (config->ggtt_size && config->ggtt_size != size) { xe_gt_sriov_err(gt, "Unexpected GGTT reassignment: %lluK != %lluK\n", size / SZ_1K, config->ggtt_size / SZ_1K); - return -EREMCHG; + err = -EREMCHG; + goto out; } xe_gt_sriov_dbg_verbose(gt, "GGTT %#llx-%#llx = %lluK\n", @@ -460,8 +463,11 @@ static int vf_get_ggtt_info(struct xe_gt *gt) config->ggtt_shift = start - (s64)config->ggtt_base; config->ggtt_base = start; config->ggtt_size = size; + err = config->ggtt_size ? 0 : -ENODATA; - return config->ggtt_size ? 0 : -ENODATA; +out: + up_write(&config->lock); + return err; } static int vf_get_lmem_info(struct xe_gt *gt) @@ -474,22 +480,28 @@ static int vf_get_lmem_info(struct xe_gt *gt) xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); + down_write(&config->lock); + err = guc_action_query_single_klv64(guc, GUC_KLV_VF_CFG_LMEM_SIZE_KEY, &size); if (unlikely(err)) - return err; + goto out; if (config->lmem_size && config->lmem_size != size) { xe_gt_sriov_err(gt, "Unexpected LMEM reassignment: %lluM != %lluM\n", size / SZ_1M, config->lmem_size / SZ_1M); - return -EREMCHG; + err = -EREMCHG; + goto out; } string_get_size(size, 1, STRING_UNITS_2, size_str, sizeof(size_str)); xe_gt_sriov_dbg_verbose(gt, "LMEM %lluM %s\n", size / SZ_1M, size_str); config->lmem_size = size; + err = config->lmem_size ? 0 : -ENODATA; - return config->lmem_size ? 0 : -ENODATA; +out: + up_write(&config->lock); + return err; } static int vf_get_submission_cfg(struct xe_gt *gt) @@ -501,23 +513,27 @@ static int vf_get_submission_cfg(struct xe_gt *gt) xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); + down_write(&config->lock); + err = guc_action_query_single_klv32(guc, GUC_KLV_VF_CFG_NUM_CONTEXTS_KEY, &num_ctxs); if (unlikely(err)) - return err; + goto out; err = guc_action_query_single_klv32(guc, GUC_KLV_VF_CFG_NUM_DOORBELLS_KEY, &num_dbs); if (unlikely(err)) - return err; + goto out; if (config->num_ctxs && config->num_ctxs != num_ctxs) { xe_gt_sriov_err(gt, "Unexpected CTXs reassignment: %u != %u\n", num_ctxs, config->num_ctxs); - return -EREMCHG; + err = -EREMCHG; + goto out; } if (config->num_dbs && config->num_dbs != num_dbs) { xe_gt_sriov_err(gt, "Unexpected DBs reassignment: %u != %u\n", num_dbs, config->num_dbs); - return -EREMCHG; + err = -EREMCHG; + goto out; } xe_gt_sriov_dbg_verbose(gt, "CTXs %u DBs %u\n", num_ctxs, num_dbs); @@ -525,7 +541,11 @@ static int vf_get_submission_cfg(struct xe_gt *gt) config->num_ctxs = num_ctxs; config->num_dbs = num_dbs; - return config->num_ctxs ? 0 : -ENODATA; + err = config->num_ctxs ? 0 : -ENODATA; + +out: + up_write(&config->lock); + return err; } static void vf_cache_gmdid(struct xe_gt *gt) @@ -579,11 +599,18 @@ int xe_gt_sriov_vf_query_config(struct xe_gt *gt) */ u16 xe_gt_sriov_vf_guc_ids(struct xe_gt *gt) { + struct xe_gt_sriov_vf_selfconfig *config = >->sriov.vf.self_config; + u16 val; + xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); xe_gt_assert(gt, gt->sriov.vf.guc_version.major); - xe_gt_assert(gt, gt->sriov.vf.self_config.num_ctxs);the purpose of this assert was to make sure no part of the VF driver is trying to get num_ctxs before we finish querying it from the GuC ..
We now know this will never happen _before_we_finish_querying_, as there's a lock. Read side will wait for the write side to finish.
So unless you're worried about a situation before we _start_ querying, it's ok to even remove it completely.
sure, can do that.- return gt->sriov.vf.self_config.num_ctxs; + down_read(&config->lock); + xe_gt_assert(gt, config->num_ctxs);.. instead of moving this assert in the middle of the lock ..+ val = config->num_ctxs; + up_read(&config->lock); +.. for code clarity you can add assert here as: xe_gt_assert(gt, val);
+ return val; } /** @@ -596,11 +623,18 @@ u16 xe_gt_sriov_vf_guc_ids(struct xe_gt *gt) */ u64 xe_gt_sriov_vf_lmem(struct xe_gt *gt) { + struct xe_gt_sriov_vf_selfconfig *config = >->sriov.vf.self_config; + u64 val; + xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); xe_gt_assert(gt, gt->sriov.vf.guc_version.major); - xe_gt_assert(gt, gt->sriov.vf.self_config.lmem_size);ditto
ack, though whether this stays at all, depends on further discussion in v3 of this series.
-Tomasz
- return gt->sriov.vf.self_config.lmem_size; + down_read(&config->lock); + xe_gt_assert(gt, config->lmem_size); + val = config->lmem_size; + up_read(&config->lock); + + return val; } /** @@ -613,11 +647,17 @@ u64 xe_gt_sriov_vf_lmem(struct xe_gt *gt) */ u64 xe_gt_sriov_vf_ggtt(struct xe_gt *gt) { + struct xe_gt_sriov_vf_selfconfig *config = >->sriov.vf.self_config; + u64 val; + xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); xe_gt_assert(gt, gt->sriov.vf.guc_version.major); - xe_gt_assert(gt, gt->sriov.vf.self_config.ggtt_size); - return gt->sriov.vf.self_config.ggtt_size; + down_read(&config->lock); + val = config->ggtt_size; + up_read(&config->lock); + + return val; } /** @@ -630,11 +670,18 @@ u64 xe_gt_sriov_vf_ggtt(struct xe_gt *gt) */ u64 xe_gt_sriov_vf_ggtt_base(struct xe_gt *gt) { + struct xe_gt_sriov_vf_selfconfig *config = >->sriov.vf.self_config; + u64 val; + xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); xe_gt_assert(gt, gt->sriov.vf.guc_version.major); - xe_gt_assert(gt, gt->sriov.vf.self_config.ggtt_size); - return gt->sriov.vf.self_config.ggtt_base; + down_read(&config->lock); + xe_gt_assert(gt, config->ggtt_size); + val = config->ggtt_base; + up_read(&config->lock); + + return val; } /** @@ -648,11 +695,16 @@ u64 xe_gt_sriov_vf_ggtt_base(struct xe_gt *gt) s64 xe_gt_sriov_vf_ggtt_shift(struct xe_gt *gt) { struct xe_gt_sriov_vf_selfconfig *config = >->sriov.vf.self_config; + s64 val; xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); xe_gt_assert(gt, xe_gt_is_main_type(gt)); - return config->ggtt_shift; + down_read(&config->lock); + val = config->ggtt_shift; + up_read(&config->lock); + + return val; } static int relay_action_handshake(struct xe_gt *gt, u32 *major, u32 *minor) @@ -1044,6 +1096,7 @@ void xe_gt_sriov_vf_print_config(struct xe_gt *gt, struct drm_printer *p) xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt))); + down_read(&config->lock); drm_printf(p, "GGTT range:\t%#llx-%#llx\n", config->ggtt_base, config->ggtt_base + config->ggtt_size - 1); @@ -1060,6 +1113,7 @@ void xe_gt_sriov_vf_print_config(struct xe_gt *gt, struct drm_printer *p) drm_printf(p, "GuC contexts:\t%u\n", config->num_ctxs); drm_printf(p, "GuC doorbells:\t%u\n", config->num_dbs); + up_read(&config->lock); } /** diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h index 298dedf4b009..d95857bd789b 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf_types.h @@ -6,6 +6,7 @@ #ifndef _XE_GT_SRIOV_VF_TYPES_H_ #define _XE_GT_SRIOV_VF_TYPES_H_ +#include <linux/rwsem.h> #include <linux/types.h> #include "xe_uc_fw_types.h" @@ -25,6 +26,8 @@ struct xe_gt_sriov_vf_selfconfig { u16 num_ctxs; /** @num_dbs: assigned number of GuC doorbells IDs. */ u16 num_dbs; + /** @lock: lock for protecting access to all selfconfig fields. */ + struct rw_semaphore lock; }; /** diff --git a/drivers/gpu/drm/xe/xe_sriov_vf.c b/drivers/gpu/drm/xe/xe_sriov_vf.c index cdd9f8e78b2a..d6e2ed9b9bbc 100644 --- a/drivers/gpu/drm/xe/xe_sriov_vf.c +++ b/drivers/gpu/drm/xe/xe_sriov_vf.c @@ -197,6 +197,12 @@ static void vf_migration_init_early(struct xe_device *xe) */ void xe_sriov_vf_init_early(struct xe_device *xe) { + struct xe_gt *gt; + unsigned int id; + + for_each_gt(gt, xe, id) + init_rwsem(>->sriov.vf.self_config.lock);this is a wrong place - VF's per-GT data shall be initialized in the per-GT code somewhere in xe_gt_sriov_vf.cYes, it is moved to the proper place in [1]. Matt [1] https://patchwork.freedesktop.org/patch/676383/?series=154627&rev=2+ vf_migration_init_early(xe); }