Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH AUTOSEL 6.12 183/486] drm/xe: Nuke VM's mapping upon close
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
@ 2025-05-05 22:34 ` Sasha Levin
  2025-05-05 22:34 ` [PATCH AUTOSEL 6.12 184/486] drm/xe: Retry BO allocation Sasha Levin
                   ` (14 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:34 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Matthew Brost, Thomas Hellström, Sasha Levin, lucas.demarchi,
	rodrigo.vivi, airlied, simona, intel-xe, dri-devel

From: Matthew Brost <matthew.brost@intel.com>

[ Upstream commit 074e40d9c2a84939fe28d7121d3469db50f34a3d ]

Clear root PT entry and invalidate entire VM's address space when
closing the VM. Will prevent the GPU from accessing any of the VM's
memory after closing.

v2:
 - s/vma/vm in kernel doc (CI)
 - Don't nuke migration VM as this occur at driver unload (CI)
v3:
 - Rebase and pull into SVM series (Thomas)
 - Wait for pending binds (Thomas)
v5:
 - Remove xe_gt_tlb_invalidation_fence_fini in error case (Matt Auld)
 - Drop local migration bool (Thomas)
v7:
 - Add drm_dev_enter/exit protecting invalidation (CI, Matt Auld)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250306012657.3505757-12-matthew.brost@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 22 ++++++++++++++
 drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h |  2 ++
 drivers/gpu/drm/xe/xe_pt.c                  | 14 +++++++++
 drivers/gpu/drm/xe/xe_pt.h                  |  3 ++
 drivers/gpu/drm/xe/xe_vm.c                  | 32 +++++++++++++++++++++
 5 files changed, 73 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
index 98a450271f5ce..3155825fa46ad 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
@@ -406,6 +406,28 @@ int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
 	return send_tlb_invalidation(&gt->uc.guc, fence, action, len);
 }
 
+/**
+ * xe_gt_tlb_invalidation_vm - Issue a TLB invalidation on this GT for a VM
+ * @gt: graphics tile
+ * @vm: VM to invalidate
+ *
+ * Invalidate entire VM's address space
+ */
+void xe_gt_tlb_invalidation_vm(struct xe_gt *gt, struct xe_vm *vm)
+{
+	struct xe_gt_tlb_invalidation_fence fence;
+	u64 range = 1ull << vm->xe->info.va_bits;
+	int ret;
+
+	xe_gt_tlb_invalidation_fence_init(gt, &fence, true);
+
+	ret = xe_gt_tlb_invalidation_range(gt, &fence, 0, range, vm->usm.asid);
+	if (ret < 0)
+		return;
+
+	xe_gt_tlb_invalidation_fence_wait(&fence);
+}
+
 /**
  * xe_gt_tlb_invalidation_vma - Issue a TLB invalidation on this GT for a VMA
  * @gt: graphics tile
diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
index 672acfcdf0d70..abe9b03d543e6 100644
--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
@@ -12,6 +12,7 @@
 
 struct xe_gt;
 struct xe_guc;
+struct xe_vm;
 struct xe_vma;
 
 int xe_gt_tlb_invalidation_init_early(struct xe_gt *gt);
@@ -21,6 +22,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt);
 int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
 			       struct xe_gt_tlb_invalidation_fence *fence,
 			       struct xe_vma *vma);
+void xe_gt_tlb_invalidation_vm(struct xe_gt *gt, struct xe_vm *vm);
 int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
 				 struct xe_gt_tlb_invalidation_fence *fence,
 				 u64 start, u64 end, u32 asid);
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index 230cf47fb9c5e..fb94ff55c7361 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -217,6 +217,20 @@ void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred)
 	xe_pt_free(pt);
 }
 
+/**
+ * xe_pt_clear() - Clear a page-table.
+ * @xe: xe device.
+ * @pt: The page-table.
+ *
+ * Clears page-table by setting to zero.
+ */
+void xe_pt_clear(struct xe_device *xe, struct xe_pt *pt)
+{
+	struct iosys_map *map = &pt->bo->vmap;
+
+	xe_map_memset(xe, map, 0, 0, SZ_4K);
+}
+
 /**
  * DOC: Pagetable building
  *
diff --git a/drivers/gpu/drm/xe/xe_pt.h b/drivers/gpu/drm/xe/xe_pt.h
index 9ab386431cadd..8e43912ae8e94 100644
--- a/drivers/gpu/drm/xe/xe_pt.h
+++ b/drivers/gpu/drm/xe/xe_pt.h
@@ -13,6 +13,7 @@ struct dma_fence;
 struct xe_bo;
 struct xe_device;
 struct xe_exec_queue;
+struct xe_svm_range;
 struct xe_sync_entry;
 struct xe_tile;
 struct xe_vm;
@@ -35,6 +36,8 @@ void xe_pt_populate_empty(struct xe_tile *tile, struct xe_vm *vm,
 
 void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred);
 
+void xe_pt_clear(struct xe_device *xe, struct xe_pt *pt);
+
 int xe_pt_update_ops_prepare(struct xe_tile *tile, struct xe_vma_ops *vops);
 struct dma_fence *xe_pt_update_ops_run(struct xe_tile *tile,
 				       struct xe_vma_ops *vops);
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 872de052d670f..de257a032225f 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -8,6 +8,7 @@
 #include <linux/dma-fence-array.h>
 #include <linux/nospec.h>
 
+#include <drm/drm_drv.h>
 #include <drm/drm_exec.h>
 #include <drm/drm_print.h>
 #include <drm/ttm/ttm_execbuf_util.h>
@@ -1581,9 +1582,40 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
 
 static void xe_vm_close(struct xe_vm *vm)
 {
+	struct xe_device *xe = vm->xe;
+	bool bound;
+	int idx;
+
+	bound = drm_dev_enter(&xe->drm, &idx);
+
 	down_write(&vm->lock);
+
 	vm->size = 0;
+
+	if (!((vm->flags & XE_VM_FLAG_MIGRATION))) {
+		struct xe_tile *tile;
+		struct xe_gt *gt;
+		u8 id;
+
+		/* Wait for pending binds */
+		dma_resv_wait_timeout(xe_vm_resv(vm),
+				      DMA_RESV_USAGE_BOOKKEEP,
+				      false, MAX_SCHEDULE_TIMEOUT);
+
+		if (bound) {
+			for_each_tile(tile, xe, id)
+				if (vm->pt_root[id])
+					xe_pt_clear(xe, vm->pt_root[id]);
+
+			for_each_gt(gt, xe, id)
+				xe_gt_tlb_invalidation_vm(gt, vm);
+		}
+	}
+
 	up_write(&vm->lock);
+
+	if (bound)
+		drm_dev_exit(idx);
 }
 
 void xe_vm_close_and_put(struct xe_vm *vm)
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 184/486] drm/xe: Retry BO allocation
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
  2025-05-05 22:34 ` [PATCH AUTOSEL 6.12 183/486] drm/xe: Nuke VM's mapping upon close Sasha Levin
@ 2025-05-05 22:34 ` Sasha Levin
  2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 236/486] drm/xe/vf: Retry sending MMIO request to GUC on timeout error Sasha Levin
                   ` (13 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:34 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Matthew Brost, Gwan-gyeong Mun, Stuart Summers, Sasha Levin,
	lucas.demarchi, thomas.hellstrom, rodrigo.vivi, airlied, simona,
	intel-xe, dri-devel

From: Matthew Brost <matthew.brost@intel.com>

[ Upstream commit 1d724a2f1b2c3f0cba4975784a808482e0631adf ]

TTM doesn't support fair eviction via WW locking, this mitigated in by
using retry loops in exec and preempt rebind worker. Extend this retry
loop to BO allocation. Once TTM supports fair eviction this patch can be
reverted.

v4:
 - Keep line break (Stuart)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
Reviewed-by: Stuart Summers <stuart.summers@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250306012657.3505757-2-matthew.brost@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_bo.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 84e327b569252..35a8242a9f541 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -1975,6 +1975,7 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
 	struct xe_file *xef = to_xe_file(file);
 	struct drm_xe_gem_create *args = data;
 	struct xe_vm *vm = NULL;
+	ktime_t end = 0;
 	struct xe_bo *bo;
 	unsigned int bo_flags;
 	u32 handle;
@@ -2047,6 +2048,10 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
 		vm = xe_vm_lookup(xef, args->vm_id);
 		if (XE_IOCTL_DBG(xe, !vm))
 			return -ENOENT;
+	}
+
+retry:
+	if (vm) {
 		err = xe_vm_lock(vm, true);
 		if (err)
 			goto out_vm;
@@ -2060,6 +2065,8 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
 
 	if (IS_ERR(bo)) {
 		err = PTR_ERR(bo);
+		if (xe_vm_validate_should_retry(NULL, err, &end))
+			goto retry;
 		goto out_vm;
 	}
 
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 236/486] drm/xe/vf: Retry sending MMIO request to GUC on timeout error
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
  2025-05-05 22:34 ` [PATCH AUTOSEL 6.12 183/486] drm/xe: Nuke VM's mapping upon close Sasha Levin
  2025-05-05 22:34 ` [PATCH AUTOSEL 6.12 184/486] drm/xe: Retry BO allocation Sasha Levin
@ 2025-05-05 22:35 ` Sasha Levin
  2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 237/486] drm/xe/pf: Create a link between PF and VF devices Sasha Levin
                   ` (12 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:35 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Satyanarayana K V P, Michał Wajdeczko, Michał Winiarski,
	Piotr Piórkowski, Sasha Levin, lucas.demarchi,
	thomas.hellstrom, rodrigo.vivi, airlied, simona, intel-xe,
	dri-devel

From: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>

[ Upstream commit ba757a65d2a28d46a8ccf50538f4f05036983f1b ]

Add support to allow retrying the sending of MMIO requests
from the VF to the GUC in the event of an error. During the
suspend/resume process, VFs begin resuming only after the PF has
resumed. Although the PF resumes, the GUC reset and provisioning
occur later in a separate worker process.

When there are a large number of VFs, some may attempt to resume
before the PF has completed its provisioning. Therefore, if a
MMIO request from a VF fails during this period, we will retry
sending the request up to GUC_RESET_VF_STATE_RETRY_MAX times,
which is set to a maximum of 10 attempts.

Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Michał Wajdeczko <michal.wajdeczko@intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Piotr Piórkowski <piotr.piorkowski@intel.com>
Reviewed-by: Piotr Piorkowski <piotr.piorkowski@intel.com>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250224102807.11065-3-satyanarayana.k.v.p@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
index f982d6f9f218d..7ddbfeaf494ac 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
@@ -46,12 +46,19 @@ static int guc_action_vf_reset(struct xe_guc *guc)
 	return ret > 0 ? -EPROTO : ret;
 }
 
+#define GUC_RESET_VF_STATE_RETRY_MAX	10
 static int vf_reset_guc_state(struct xe_gt *gt)
 {
+	unsigned int retry = GUC_RESET_VF_STATE_RETRY_MAX;
 	struct xe_guc *guc = &gt->uc.guc;
 	int err;
 
-	err = guc_action_vf_reset(guc);
+	do {
+		err = guc_action_vf_reset(guc);
+		if (!err || err != -ETIMEDOUT)
+			break;
+	} while (--retry);
+
 	if (unlikely(err))
 		xe_gt_sriov_err(gt, "Failed to reset GuC state (%pe)\n", ERR_PTR(err));
 	return err;
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 237/486] drm/xe/pf: Create a link between PF and VF devices
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (2 preceding siblings ...)
  2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 236/486] drm/xe/vf: Retry sending MMIO request to GUC on timeout error Sasha Levin
@ 2025-05-05 22:35 ` Sasha Levin
  2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 242/486] drm/xe: xe_gen_wa_oob: replace program_invocation_short_name Sasha Levin
                   ` (11 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:35 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Satyanarayana K V P, Michał Wajdeczko, Michał Winiarski,
	Piotr Piórkowski, Sasha Levin, lucas.demarchi,
	thomas.hellstrom, rodrigo.vivi, airlied, simona, intel-xe,
	dri-devel

From: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>

[ Upstream commit 8c0aff7d92e2be25717669eb65a81a89740a24f2 ]

When both PF and VF devices are enabled on the host, they
resume simultaneously during system resume.

However, the PF must finish provisioning the VF before any
VFs can successfully resume.

Establish a parent-child device link between the PF and VF
devices to ensure the correct order of resumption.

V4 -> V5:
- Added missing break in the error condition.
V3 -> V4:
- Made xe_pci_pf_get_vf_dev() as a static function and updated
  input parameter types.
- Updated xe_sriov_warn() to xe_sriov_abort() when VF device
  cannot be found.
V2 -> V3:
- Added function documentation for xe_pci_pf_get_vf_dev().
- Added assertion if not called from PF.
V1 -> V2:
- Added a helper function to get VF pci_dev.
- Updated xe_sriov_notice() to xe_sriov_warn() if vf pci_dev
  is not found.

Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Michał Wajdeczko <michal.wajdeczko@intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Piotr Piórkowski <piotr.piorkowski@intel.com>
Reviewed-by: Piotr Piorkowski <piotr.piorkowski@intel.com>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250224102807.11065-2-satyanarayana.k.v.p@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_pci_sriov.c | 51 +++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_pci_sriov.c b/drivers/gpu/drm/xe/xe_pci_sriov.c
index aaceee748287e..09ee8a06fe2ed 100644
--- a/drivers/gpu/drm/xe/xe_pci_sriov.c
+++ b/drivers/gpu/drm/xe/xe_pci_sriov.c
@@ -62,6 +62,55 @@ static void pf_reset_vfs(struct xe_device *xe, unsigned int num_vfs)
 			xe_gt_sriov_pf_control_trigger_flr(gt, n);
 }
 
+static struct pci_dev *xe_pci_pf_get_vf_dev(struct xe_device *xe, unsigned int vf_id)
+{
+	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+
+	xe_assert(xe, IS_SRIOV_PF(xe));
+
+	/* caller must use pci_dev_put() */
+	return pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
+			pdev->bus->number,
+			pci_iov_virtfn_devfn(pdev, vf_id));
+}
+
+static void pf_link_vfs(struct xe_device *xe, int num_vfs)
+{
+	struct pci_dev *pdev_pf = to_pci_dev(xe->drm.dev);
+	struct device_link *link;
+	struct pci_dev *pdev_vf;
+	unsigned int n;
+
+	/*
+	 * When both PF and VF devices are enabled on the host, during system
+	 * resume they are resuming in parallel.
+	 *
+	 * But PF has to complete the provision of VF first to allow any VFs to
+	 * successfully resume.
+	 *
+	 * Create a parent-child device link between PF and VF devices that will
+	 * enforce correct resume order.
+	 */
+	for (n = 1; n <= num_vfs; n++) {
+		pdev_vf = xe_pci_pf_get_vf_dev(xe, n - 1);
+
+		/* unlikely, something weird is happening, abort */
+		if (!pdev_vf) {
+			xe_sriov_err(xe, "Cannot find VF%u device, aborting link%s creation!\n",
+				     n, str_plural(num_vfs));
+			break;
+		}
+
+		link = device_link_add(&pdev_vf->dev, &pdev_pf->dev,
+				       DL_FLAG_AUTOREMOVE_CONSUMER);
+		/* unlikely and harmless, continue with other VFs */
+		if (!link)
+			xe_sriov_notice(xe, "Failed linking VF%u\n", n);
+
+		pci_dev_put(pdev_vf);
+	}
+}
+
 static int pf_enable_vfs(struct xe_device *xe, int num_vfs)
 {
 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
@@ -92,6 +141,8 @@ static int pf_enable_vfs(struct xe_device *xe, int num_vfs)
 	if (err < 0)
 		goto failed;
 
+	pf_link_vfs(xe, num_vfs);
+
 	xe_sriov_info(xe, "Enabled %u of %u VF%s\n",
 		      num_vfs, total_vfs, str_plural(total_vfs));
 	return num_vfs;
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 242/486] drm/xe: xe_gen_wa_oob: replace program_invocation_short_name
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (3 preceding siblings ...)
  2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 237/486] drm/xe/pf: Create a link between PF and VF devices Sasha Levin
@ 2025-05-05 22:35 ` Sasha Levin
  2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 275/486] drm/xe/oa: Ensure that polled read returns latest data Sasha Levin
                   ` (10 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:35 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Daniel Gomez, Masahiro Yamada, Lucas De Marchi, Sasha Levin,
	thomas.hellstrom, rodrigo.vivi, airlied, simona, intel-xe,
	dri-devel

From: Daniel Gomez <da.gomez@samsung.com>

[ Upstream commit 89eb42b5539f6ae6a0cabcb39e5b6fcc83c106a1 ]

program_invocation_short_name may not be available in other systems.
Instead, replace it with the argv[0] to pass the executable name.

Fixes build error when program_invocation_short_name is not available:

drivers/gpu/drm/xe/xe_gen_wa_oob.c:34:3: error: use of
undeclared identifier 'program_invocation_short_name'    34 |
program_invocation_short_name);       |                 ^ 1 error
generated.

Suggested-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250224-macos-build-support-xe-v3-1-d2c9ed3a27cc@samsung.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_gen_wa_oob.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gen_wa_oob.c b/drivers/gpu/drm/xe/xe_gen_wa_oob.c
index 904cf47925aa1..ed9183599e31c 100644
--- a/drivers/gpu/drm/xe/xe_gen_wa_oob.c
+++ b/drivers/gpu/drm/xe/xe_gen_wa_oob.c
@@ -28,10 +28,10 @@
 	"\n" \
 	"#endif\n"
 
-static void print_usage(FILE *f)
+static void print_usage(FILE *f, const char *progname)
 {
 	fprintf(f, "usage: %s <input-rule-file> <generated-c-source-file> <generated-c-header-file>\n",
-		program_invocation_short_name);
+		progname);
 }
 
 static void print_parse_error(const char *err_msg, const char *line,
@@ -144,7 +144,7 @@ int main(int argc, const char *argv[])
 
 	if (argc < 3) {
 		fprintf(stderr, "ERROR: wrong arguments\n");
-		print_usage(stderr);
+		print_usage(stderr, argv[0]);
 		return 1;
 	}
 
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 275/486] drm/xe/oa: Ensure that polled read returns latest data
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (4 preceding siblings ...)
  2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 242/486] drm/xe: xe_gen_wa_oob: replace program_invocation_short_name Sasha Levin
@ 2025-05-05 22:35 ` Sasha Levin
  2025-05-05 22:36 ` [PATCH AUTOSEL 6.12 343/486] drm/xe: Stop ignoring errors from xe_ttm_stolen_mgr_init() Sasha Levin
                   ` (9 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:35 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Umesh Nerlige Ramappa, Ashutosh Dixit, Sasha Levin,
	lucas.demarchi, thomas.hellstrom, rodrigo.vivi, airlied, simona,
	intel-xe, dri-devel

From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>

[ Upstream commit 98c9d27ab30aa9c6451d3a34e6e297171f273e51 ]

In polled mode, user calls poll() for read data to be available before
performing a read(). In the duration between these 2 calls, there may be
new data available in the OA buffer. To ensure user reads all available
data, check for latest data in the OA buffer in polled read.

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250212010255.1423343-1-umesh.nerlige.ramappa@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_oa.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
index 448766033690c..d306ed0a04434 100644
--- a/drivers/gpu/drm/xe/xe_oa.c
+++ b/drivers/gpu/drm/xe/xe_oa.c
@@ -535,6 +535,7 @@ static ssize_t xe_oa_read(struct file *file, char __user *buf,
 			mutex_unlock(&stream->stream_lock);
 		} while (!offset && !ret);
 	} else {
+		xe_oa_buffer_check_unlocked(stream);
 		mutex_lock(&stream->stream_lock);
 		ret = __xe_oa_read(stream, buf, count, &offset);
 		mutex_unlock(&stream->stream_lock);
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 343/486] drm/xe: Stop ignoring errors from xe_ttm_stolen_mgr_init()
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (5 preceding siblings ...)
  2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 275/486] drm/xe/oa: Ensure that polled read returns latest data Sasha Levin
@ 2025-05-05 22:36 ` Sasha Levin
  2025-05-05 22:37 ` [PATCH AUTOSEL 6.12 344/486] drm/xe: Fix xe_tile_init_noalloc() error propagation Sasha Levin
                   ` (8 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:36 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Lucas De Marchi, Francois Dugast, Himal Prasad Ghimiray,
	Sasha Levin, thomas.hellstrom, rodrigo.vivi, airlied, simona,
	intel-xe, dri-devel

From: Lucas De Marchi <lucas.demarchi@intel.com>

[ Upstream commit ff57025c358603555f1e0ae0d50282a460433594 ]

Make sure to differentiate normal behavior, e.g. there's no stolen, from
allocation errors or failure to initialize lower layers.

Reviewed-by: Francois Dugast <francois.dugast@intel.com>
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250213192909.996148-5-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_device.c         |  4 +++-
 drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c | 17 +++++++++--------
 drivers/gpu/drm/xe/xe_ttm_stolen_mgr.h |  2 +-
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index bb85208cf1a94..5c37bed3c948f 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -694,7 +694,9 @@ int xe_device_probe(struct xe_device *xe)
 	}
 
 	/* Allocate and map stolen after potential VRAM resize */
-	xe_ttm_stolen_mgr_init(xe);
+	err = xe_ttm_stolen_mgr_init(xe);
+	if (err)
+		return err;
 
 	/*
 	 * Now that GT is initialized (TTM in particular),
diff --git a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c
index f7113cf6109d5..ef84fa757b26f 100644
--- a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c
@@ -201,17 +201,16 @@ static u64 detect_stolen(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr)
 #endif
 }
 
-void xe_ttm_stolen_mgr_init(struct xe_device *xe)
+int xe_ttm_stolen_mgr_init(struct xe_device *xe)
 {
-	struct xe_ttm_stolen_mgr *mgr = drmm_kzalloc(&xe->drm, sizeof(*mgr), GFP_KERNEL);
 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+	struct xe_ttm_stolen_mgr *mgr;
 	u64 stolen_size, io_size;
 	int err;
 
-	if (!mgr) {
-		drm_dbg_kms(&xe->drm, "Stolen mgr init failed\n");
-		return;
-	}
+	mgr = drmm_kzalloc(&xe->drm, sizeof(*mgr), GFP_KERNEL);
+	if (!mgr)
+		return -ENOMEM;
 
 	if (IS_SRIOV_VF(xe))
 		stolen_size = 0;
@@ -224,7 +223,7 @@ void xe_ttm_stolen_mgr_init(struct xe_device *xe)
 
 	if (!stolen_size) {
 		drm_dbg_kms(&xe->drm, "No stolen memory support\n");
-		return;
+		return 0;
 	}
 
 	/*
@@ -240,7 +239,7 @@ void xe_ttm_stolen_mgr_init(struct xe_device *xe)
 				     io_size, PAGE_SIZE);
 	if (err) {
 		drm_dbg_kms(&xe->drm, "Stolen mgr init failed: %i\n", err);
-		return;
+		return err;
 	}
 
 	drm_dbg_kms(&xe->drm, "Initialized stolen memory support with %llu bytes\n",
@@ -248,6 +247,8 @@ void xe_ttm_stolen_mgr_init(struct xe_device *xe)
 
 	if (io_size)
 		mgr->mapping = devm_ioremap_wc(&pdev->dev, mgr->io_base, io_size);
+
+	return 0;
 }
 
 u64 xe_ttm_stolen_io_offset(struct xe_bo *bo, u32 offset)
diff --git a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.h b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.h
index 1777245ff8101..8e877d1e839bd 100644
--- a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.h
+++ b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.h
@@ -12,7 +12,7 @@ struct ttm_resource;
 struct xe_bo;
 struct xe_device;
 
-void xe_ttm_stolen_mgr_init(struct xe_device *xe);
+int xe_ttm_stolen_mgr_init(struct xe_device *xe);
 int xe_ttm_stolen_io_mem_reserve(struct xe_device *xe, struct ttm_resource *mem);
 bool xe_ttm_stolen_cpu_access_needs_ggtt(struct xe_device *xe);
 u64 xe_ttm_stolen_io_offset(struct xe_bo *bo, u32 offset);
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 344/486] drm/xe: Fix xe_tile_init_noalloc() error propagation
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (6 preceding siblings ...)
  2025-05-05 22:36 ` [PATCH AUTOSEL 6.12 343/486] drm/xe: Stop ignoring errors from xe_ttm_stolen_mgr_init() Sasha Levin
@ 2025-05-05 22:37 ` Sasha Levin
  2025-05-05 22:37 ` [PATCH AUTOSEL 6.12 347/486] drm/xe/debugfs: fixed the return value of wedged_mode_set Sasha Levin
                   ` (7 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Lucas De Marchi, Francois Dugast, Himal Prasad Ghimiray,
	Sasha Levin, thomas.hellstrom, rodrigo.vivi, airlied, simona,
	intel-xe, dri-devel

From: Lucas De Marchi <lucas.demarchi@intel.com>

[ Upstream commit 0bcf41171c64234e79eb3552d00f0aad8a47e8d3 ]

Propagate the error to the caller so initialization properly stops if
sysfs creation fails.

Reviewed-by: Francois Dugast <francois.dugast@intel.com>
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250213192909.996148-4-lucas.demarchi@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_tile.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
index dda5268507d8e..349beddf9b383 100644
--- a/drivers/gpu/drm/xe/xe_tile.c
+++ b/drivers/gpu/drm/xe/xe_tile.c
@@ -173,9 +173,7 @@ int xe_tile_init_noalloc(struct xe_tile *tile)
 
 	xe_wa_apply_tile_workarounds(tile);
 
-	err = xe_tile_sysfs_init(tile);
-
-	return 0;
+	return xe_tile_sysfs_init(tile);
 }
 
 void xe_tile_migrate_wait(struct xe_tile *tile)
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 347/486] drm/xe/debugfs: fixed the return value of wedged_mode_set
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (7 preceding siblings ...)
  2025-05-05 22:37 ` [PATCH AUTOSEL 6.12 344/486] drm/xe: Fix xe_tile_init_noalloc() error propagation Sasha Levin
@ 2025-05-05 22:37 ` Sasha Levin
  2025-05-05 22:37 ` [PATCH AUTOSEL 6.12 348/486] drm/xe/debugfs: Add missing xe_pm_runtime_put in wedge_mode_set Sasha Levin
                   ` (6 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Xin Wang, Rodrigo Vivi, Fei Yang, Shuicheng Lin, Sasha Levin,
	lucas.demarchi, thomas.hellstrom, airlied, simona, intel-xe,
	dri-devel

From: Xin Wang <x.wang@intel.com>

[ Upstream commit 6884d2051011f4db9e2f0b85709c79a8ced13bd6 ]

It is generally expected that the write() function should return a
positive value indicating the number of bytes written or a negative
error code if an error occurs. Returning 0 is unusual and can lead
to unexpected behavior.

When the user program writes the same value to wedged_mode twice in
a row, a lockup will occur, because the value expected to be
returned by the write() function inside the program should be equal
to the actual written value instead of 0.

To reproduce the issue:
echo 1 > /sys/kernel/debug/dri/0/wedged_mode
echo 1 > /sys/kernel/debug/dri/0/wedged_mode   <- lockup here

Signed-off-by: Xin Wang <x.wang@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Fei Yang <fei.yang@intel.com>
Cc: Shuicheng Lin <shuicheng.lin@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250213223615.2327367-1-x.wang@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index fe4319eb13fdf..051a37e477f4c 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -147,7 +147,7 @@ static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
 		return -EINVAL;
 
 	if (xe->wedged.mode == wedged_mode)
-		return 0;
+		return size;
 
 	xe->wedged.mode = wedged_mode;
 
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 348/486] drm/xe/debugfs: Add missing xe_pm_runtime_put in wedge_mode_set
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (8 preceding siblings ...)
  2025-05-05 22:37 ` [PATCH AUTOSEL 6.12 347/486] drm/xe/debugfs: fixed the return value of wedged_mode_set Sasha Levin
@ 2025-05-05 22:37 ` Sasha Levin
  2025-05-05 22:38 ` [PATCH AUTOSEL 6.12 439/486] drm/xe/relay: Don't use GFP_KERNEL for new transactions Sasha Levin
                   ` (5 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:37 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Shuicheng Lin, Rodrigo Vivi, Sasha Levin, lucas.demarchi,
	thomas.hellstrom, airlied, simona, intel-xe, dri-devel

From: Shuicheng Lin <shuicheng.lin@intel.com>

[ Upstream commit b31e668d3111b100d16fd7db8db335328ce8c6d5 ]

xe_pm_runtime_put is missed in the failure path.

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Shuicheng Lin <shuicheng.lin@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250213230322.1180621-1-shuicheng.lin@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_debugfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index 051a37e477f4c..278ce1c37b395 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -156,6 +156,7 @@ static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
 		ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads);
 		if (ret) {
 			xe_gt_err(gt, "Failed to update GuC ADS scheduler policy. GuC may still cause engine reset even with wedged_mode=2\n");
+			xe_pm_runtime_put(xe);
 			return -EIO;
 		}
 	}
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 439/486] drm/xe/relay: Don't use GFP_KERNEL for new transactions
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (9 preceding siblings ...)
  2025-05-05 22:37 ` [PATCH AUTOSEL 6.12 348/486] drm/xe/debugfs: Add missing xe_pm_runtime_put in wedge_mode_set Sasha Levin
@ 2025-05-05 22:38 ` Sasha Levin
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 469/486] drm/xe/pf: Reset GuC VF config when unprovisioning critical resource Sasha Levin
                   ` (4 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:38 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Michal Wajdeczko, Marcin Bernatowicz, Sasha Levin, lucas.demarchi,
	thomas.hellstrom, rodrigo.vivi, airlied, simona, intel-xe,
	dri-devel

From: Michal Wajdeczko <michal.wajdeczko@intel.com>

[ Upstream commit 78d5d1e20d1de9422f013338a0f2311448588ba7 ]

VFs use a relay transaction during the resume/reset flow and use
of the GFP_KERNEL flag may conflict with the reclaim:

     -> #0 (fs_reclaim){+.+.}-{0:0}:
 [ ]        __lock_acquire+0x1874/0x2bc0
 [ ]        lock_acquire+0xd2/0x310
 [ ]        fs_reclaim_acquire+0xc5/0x100
 [ ]        mempool_alloc_noprof+0x5c/0x1b0
 [ ]        __relay_get_transaction+0xdc/0xa10 [xe]
 [ ]        relay_send_to+0x251/0xe50 [xe]
 [ ]        xe_guc_relay_send_to_pf+0x79/0x3a0 [xe]
 [ ]        xe_gt_sriov_vf_connect+0x90/0x4d0 [xe]
 [ ]        xe_uc_init_hw+0x157/0x3b0 [xe]
 [ ]        do_gt_restart+0x1ae/0x650 [xe]
 [ ]        xe_gt_resume+0xb6/0x120 [xe]
 [ ]        xe_pm_runtime_resume+0x15b/0x370 [xe]
 [ ]        xe_pci_runtime_resume+0x73/0x90 [xe]
 [ ]        pci_pm_runtime_resume+0xa0/0x100
 [ ]        __rpm_callback+0x4d/0x170
 [ ]        rpm_callback+0x64/0x70
 [ ]        rpm_resume+0x594/0x790
 [ ]        __pm_runtime_resume+0x4e/0x90
 [ ]        xe_pm_runtime_get_ioctl+0x9c/0x160 [xe]

Since we have a preallocated pool of relay transactions, which
should cover all our normal relay use cases, we may use the
GFP_NOWAIT flag when allocating new outgoing transactions.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Tested-by: Marcin Bernatowicz <marcin.bernatowicz@linux.intel.com>
Reviewed-by: Marcin Bernatowicz <marcin.bernatowicz@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250131153713.808-1-michal.wajdeczko@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_guc_relay.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
index ade6162dc2598..0e5b43e1518ea 100644
--- a/drivers/gpu/drm/xe/xe_guc_relay.c
+++ b/drivers/gpu/drm/xe/xe_guc_relay.c
@@ -224,7 +224,7 @@ __relay_get_transaction(struct xe_guc_relay *relay, bool incoming, u32 remote, u
 	 * with CTB lock held which is marked as used in the reclaim path.
 	 * Btw, that's one of the reason why we use mempool here!
 	 */
-	txn = mempool_alloc(&relay->pool, incoming ? GFP_ATOMIC : GFP_KERNEL);
+	txn = mempool_alloc(&relay->pool, incoming ? GFP_ATOMIC : GFP_NOWAIT);
 	if (!txn)
 		return ERR_PTR(-ENOMEM);
 
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 469/486] drm/xe/pf: Reset GuC VF config when unprovisioning critical resource
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (10 preceding siblings ...)
  2025-05-05 22:38 ` [PATCH AUTOSEL 6.12 439/486] drm/xe/relay: Don't use GFP_KERNEL for new transactions Sasha Levin
@ 2025-05-05 22:39 ` Sasha Levin
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 477/486] drm/xe: Move suballocator init to after display init Sasha Levin
                   ` (3 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:39 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Michal Wajdeczko, Michał Winiarski, Sasha Levin,
	lucas.demarchi, thomas.hellstrom, rodrigo.vivi, airlied, simona,
	intel-xe, dri-devel

From: Michal Wajdeczko <michal.wajdeczko@intel.com>

[ Upstream commit 33f17e2cbd930a2a00eb007d9b241b6db010a880 ]

GuC firmware counts received VF configuration KLVs and may start
validation of the complete VF config even if some resources where
unprovisioned in the meantime, leading to unexpected errors like:

 $ echo 1 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/contexts_quota
 $ echo 0 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/contexts_quota
 $ echo 1 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/doorbells_quota
 $ echo 0 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/doorbells_quota
 $ echo 1 | sudo tee /sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/ggtt_quota
 tee: '/sys/kernel/debug/dri/0000:00:02.0/gt0/vf1/ggtt_quota': Input/output error

To mitigate this problem trigger explicit VF config reset after
unprovisioning any of the critical resources (GGTT, context or
doorbell IDs) that GuC is monitoring.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250129195947.764-3-michal.wajdeczko@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 37 +++++++++++++++++++---
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
index c9ed996b9cb0c..786f0dba41437 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
@@ -323,6 +323,26 @@ static int pf_push_full_vf_config(struct xe_gt *gt, unsigned int vfid)
 	return err;
 }
 
+static int pf_push_vf_cfg(struct xe_gt *gt, unsigned int vfid, bool reset)
+{
+	int err = 0;
+
+	xe_gt_assert(gt, vfid);
+	lockdep_assert_held(xe_gt_sriov_pf_master_mutex(gt));
+
+	if (reset)
+		err = pf_send_vf_cfg_reset(gt, vfid);
+	if (!err)
+		err = pf_push_full_vf_config(gt, vfid);
+
+	return err;
+}
+
+static int pf_refresh_vf_cfg(struct xe_gt *gt, unsigned int vfid)
+{
+	return pf_push_vf_cfg(gt, vfid, true);
+}
+
 static u64 pf_get_ggtt_alignment(struct xe_gt *gt)
 {
 	struct xe_device *xe = gt_to_xe(gt);
@@ -419,6 +439,10 @@ static int pf_provision_vf_ggtt(struct xe_gt *gt, unsigned int vfid, u64 size)
 			return err;
 
 		pf_release_vf_config_ggtt(gt, config);
+
+		err = pf_refresh_vf_cfg(gt, vfid);
+		if (unlikely(err))
+			return err;
 	}
 	xe_gt_assert(gt, !xe_ggtt_node_allocated(config->ggtt_region));
 
@@ -744,6 +768,10 @@ static int pf_provision_vf_ctxs(struct xe_gt *gt, unsigned int vfid, u32 num_ctx
 			return ret;
 
 		pf_release_config_ctxs(gt, config);
+
+		ret = pf_refresh_vf_cfg(gt, vfid);
+		if (unlikely(ret))
+			return ret;
 	}
 
 	if (!num_ctxs)
@@ -1041,6 +1069,10 @@ static int pf_provision_vf_dbs(struct xe_gt *gt, unsigned int vfid, u32 num_dbs)
 			return ret;
 
 		pf_release_config_dbs(gt, config);
+
+		ret = pf_refresh_vf_cfg(gt, vfid);
+		if (unlikely(ret))
+			return ret;
 	}
 
 	if (!num_dbs)
@@ -2003,10 +2035,7 @@ int xe_gt_sriov_pf_config_push(struct xe_gt *gt, unsigned int vfid, bool refresh
 	xe_gt_assert(gt, vfid);
 
 	mutex_lock(xe_gt_sriov_pf_master_mutex(gt));
-	if (refresh)
-		err = pf_send_vf_cfg_reset(gt, vfid);
-	if (!err)
-		err = pf_push_full_vf_config(gt, vfid);
+	err = pf_push_vf_cfg(gt, vfid, refresh);
 	mutex_unlock(xe_gt_sriov_pf_master_mutex(gt));
 
 	if (unlikely(err)) {
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 477/486] drm/xe: Move suballocator init to after display init
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (11 preceding siblings ...)
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 469/486] drm/xe/pf: Reset GuC VF config when unprovisioning critical resource Sasha Levin
@ 2025-05-05 22:39 ` Sasha Levin
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 478/486] drm/xe: Do not attempt to bootstrap VF in execlists mode Sasha Levin
                   ` (2 subsequent siblings)
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:39 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Maarten Lankhorst, Rodrigo Vivi, Sasha Levin, lucas.demarchi,
	thomas.hellstrom, airlied, simona, intel-xe, dri-devel

From: Maarten Lankhorst <dev@lankhorst.se>

[ Upstream commit 380b0cdaa76bc8f5c16db16eaf48751e792ff041 ]

No allocations should be done before we have had a chance to preserve
the display fb.

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241210083111.230484-4-dev@lankhorst.se
Signed-off-by: Maarten Lankhorst <dev@lankhorst.se>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_device.c |  6 ++++++
 drivers/gpu/drm/xe/xe_tile.c   | 12 ++++++++----
 drivers/gpu/drm/xe/xe_tile.h   |  1 +
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 5c37bed3c948f..23e02372a49db 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -708,6 +708,12 @@ int xe_device_probe(struct xe_device *xe)
 	if (err)
 		goto err;
 
+	for_each_tile(tile, xe, id) {
+		err = xe_tile_init(tile);
+		if (err)
+			goto err;
+	}
+
 	for_each_gt(gt, xe, id) {
 		last_gt = id;
 
diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
index 349beddf9b383..36c87d7c72fbc 100644
--- a/drivers/gpu/drm/xe/xe_tile.c
+++ b/drivers/gpu/drm/xe/xe_tile.c
@@ -167,15 +167,19 @@ int xe_tile_init_noalloc(struct xe_tile *tile)
 	if (err)
 		return err;
 
-	tile->mem.kernel_bb_pool = xe_sa_bo_manager_init(tile, SZ_1M, 16);
-	if (IS_ERR(tile->mem.kernel_bb_pool))
-		return PTR_ERR(tile->mem.kernel_bb_pool);
-
 	xe_wa_apply_tile_workarounds(tile);
 
 	return xe_tile_sysfs_init(tile);
 }
 
+int xe_tile_init(struct xe_tile *tile)
+{
+	tile->mem.kernel_bb_pool = xe_sa_bo_manager_init(tile, SZ_1M, 16);
+	if (IS_ERR(tile->mem.kernel_bb_pool))
+		return PTR_ERR(tile->mem.kernel_bb_pool);
+
+	return 0;
+}
 void xe_tile_migrate_wait(struct xe_tile *tile)
 {
 	xe_migrate_wait(tile->migrate);
diff --git a/drivers/gpu/drm/xe/xe_tile.h b/drivers/gpu/drm/xe/xe_tile.h
index 1c9e42ade6b05..eb939316d55b0 100644
--- a/drivers/gpu/drm/xe/xe_tile.h
+++ b/drivers/gpu/drm/xe/xe_tile.h
@@ -12,6 +12,7 @@ struct xe_tile;
 
 int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id);
 int xe_tile_init_noalloc(struct xe_tile *tile);
+int xe_tile_init(struct xe_tile *tile);
 
 void xe_tile_migrate_wait(struct xe_tile *tile);
 
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 478/486] drm/xe: Do not attempt to bootstrap VF in execlists mode
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (12 preceding siblings ...)
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 477/486] drm/xe: Move suballocator init to after display init Sasha Levin
@ 2025-05-05 22:39 ` Sasha Levin
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 480/486] drm/xe/sa: Always call drm_suballoc_manager_fini() Sasha Levin
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 481/486] drm/xe: Reject BO eviction if BO is bound to current VM Sasha Levin
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:39 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Maarten Lankhorst, Lucas De Marchi, Sasha Levin, thomas.hellstrom,
	rodrigo.vivi, airlied, simona, intel-xe, dri-devel

From: Maarten Lankhorst <dev@lankhorst.se>

[ Upstream commit f3b59457808f61d88178b0afa67cbd017d7ce79e ]

It was mentioned in a review that there is a possibility of choosing
to load the module with VF in execlists mode.

Of course this doesn't work, just bomb out as hard as possible.

Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241210083111.230484-12-dev@lankhorst.se
Signed-off-by: Maarten Lankhorst <dev@lankhorst.se>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
index 7ddbfeaf494ac..29badbd829ab6 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c
@@ -235,6 +235,9 @@ int xe_gt_sriov_vf_bootstrap(struct xe_gt *gt)
 {
 	int err;
 
+	if (!xe_device_uc_enabled(gt_to_xe(gt)))
+		return -ENODEV;
+
 	err = vf_reset_guc_state(gt);
 	if (unlikely(err))
 		return err;
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 480/486] drm/xe/sa: Always call drm_suballoc_manager_fini()
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (13 preceding siblings ...)
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 478/486] drm/xe: Do not attempt to bootstrap VF in execlists mode Sasha Levin
@ 2025-05-05 22:39 ` Sasha Levin
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 481/486] drm/xe: Reject BO eviction if BO is bound to current VM Sasha Levin
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:39 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Michal Wajdeczko, Matthew Brost, Sasha Levin, lucas.demarchi,
	thomas.hellstrom, rodrigo.vivi, airlied, simona, intel-xe,
	dri-devel

From: Michal Wajdeczko <michal.wajdeczko@intel.com>

[ Upstream commit 9cd3f4efc870463f17f6c29114c61fb6bfcaa291 ]

After successful call to drm_suballoc_manager_init() we should
make sure to call drm_suballoc_manager_fini() as it may include
some cleanup code even if we didn't start using it for real.

As we can abort init() early due to kvzalloc() failure, we should
either explicitly call drm_suballoc_manager_fini() or, even better,
postpone drm_suballoc_manager_init() once we finish all other
preparation steps, so we can rely on fini() that will do cleanup.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241220194205.995-2-michal.wajdeczko@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_sa.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_sa.c b/drivers/gpu/drm/xe/xe_sa.c
index fe2cb2a96f788..1d42547590067 100644
--- a/drivers/gpu/drm/xe/xe_sa.c
+++ b/drivers/gpu/drm/xe/xe_sa.c
@@ -57,8 +57,6 @@ struct xe_sa_manager *xe_sa_bo_manager_init(struct xe_tile *tile, u32 size, u32
 	}
 	sa_manager->bo = bo;
 	sa_manager->is_iomem = bo->vmap.is_iomem;
-
-	drm_suballoc_manager_init(&sa_manager->base, managed_size, align);
 	sa_manager->gpu_addr = xe_bo_ggtt_addr(bo);
 
 	if (bo->vmap.is_iomem) {
@@ -72,6 +70,7 @@ struct xe_sa_manager *xe_sa_bo_manager_init(struct xe_tile *tile, u32 size, u32
 		memset(sa_manager->cpu_ptr, 0, bo->ttm.base.size);
 	}
 
+	drm_suballoc_manager_init(&sa_manager->base, managed_size, align);
 	ret = drmm_add_action_or_reset(&xe->drm, xe_sa_bo_manager_fini,
 				       sa_manager);
 	if (ret)
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH AUTOSEL 6.12 481/486] drm/xe: Reject BO eviction if BO is bound to current VM
       [not found] <20250505223922.2682012-1-sashal@kernel.org>
                   ` (14 preceding siblings ...)
  2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 480/486] drm/xe/sa: Always call drm_suballoc_manager_fini() Sasha Levin
@ 2025-05-05 22:39 ` Sasha Levin
  15 siblings, 0 replies; 16+ messages in thread
From: Sasha Levin @ 2025-05-05 22:39 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Oak Zeng, Thomas Hellström, Sasha Levin, lucas.demarchi,
	rodrigo.vivi, airlied, simona, intel-xe, dri-devel

From: Oak Zeng <oak.zeng@intel.com>

[ Upstream commit 0af944f0e3082ff517958b1cea76fb9b8cb379dd ]

This is a follow up fix for
https://patchwork.freedesktop.org/patch/msgid/20241203021929.1919730-1-oak.zeng@intel.com
The overall goal is to fail vm_bind when there is memory pressure. See more
details in the commit message of above patch. Abbove patch fixes the issue
when user pass in a vm_id parameter during gem_create. If user doesn't pass
in a vm_id during gem_create, above patch doesn't help.

This patch further reject BO eviction (which could be triggered by bo validation)
if BO is bound to the current VM. vm_bind could fail due to the eviction failure.
The BO to VM reverse mapping structure is used to determine whether BO is bound
to VM.

v2:
Move vm_bo definition from function scope to if(evict) clause (Thomas)
Further constraint the condition by adding ctx->resv (Thomas)
Add a short comment describe the change.

Suggested-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Oak Zeng <oak.zeng@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250110210137.3181576-1-oak.zeng@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/gpu/drm/xe/xe_bo.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 35a8242a9f541..8acc4640f0a28 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -702,6 +702,21 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
 		goto out;
 	}
 
+	/* Reject BO eviction if BO is bound to current VM. */
+	if (evict && ctx->resv) {
+		struct drm_gpuvm_bo *vm_bo;
+
+		drm_gem_for_each_gpuvm_bo(vm_bo, &bo->ttm.base) {
+			struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
+
+			if (xe_vm_resv(vm) == ctx->resv &&
+			    xe_vm_in_preempt_fence_mode(vm)) {
+				ret = -EBUSY;
+				goto out;
+			}
+		}
+	}
+
 	/*
 	 * Failed multi-hop where the old_mem is still marked as
 	 * TTM_PL_FLAG_TEMPORARY, should just be a dummy move.
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2025-05-05 22:56 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20250505223922.2682012-1-sashal@kernel.org>
2025-05-05 22:34 ` [PATCH AUTOSEL 6.12 183/486] drm/xe: Nuke VM's mapping upon close Sasha Levin
2025-05-05 22:34 ` [PATCH AUTOSEL 6.12 184/486] drm/xe: Retry BO allocation Sasha Levin
2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 236/486] drm/xe/vf: Retry sending MMIO request to GUC on timeout error Sasha Levin
2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 237/486] drm/xe/pf: Create a link between PF and VF devices Sasha Levin
2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 242/486] drm/xe: xe_gen_wa_oob: replace program_invocation_short_name Sasha Levin
2025-05-05 22:35 ` [PATCH AUTOSEL 6.12 275/486] drm/xe/oa: Ensure that polled read returns latest data Sasha Levin
2025-05-05 22:36 ` [PATCH AUTOSEL 6.12 343/486] drm/xe: Stop ignoring errors from xe_ttm_stolen_mgr_init() Sasha Levin
2025-05-05 22:37 ` [PATCH AUTOSEL 6.12 344/486] drm/xe: Fix xe_tile_init_noalloc() error propagation Sasha Levin
2025-05-05 22:37 ` [PATCH AUTOSEL 6.12 347/486] drm/xe/debugfs: fixed the return value of wedged_mode_set Sasha Levin
2025-05-05 22:37 ` [PATCH AUTOSEL 6.12 348/486] drm/xe/debugfs: Add missing xe_pm_runtime_put in wedge_mode_set Sasha Levin
2025-05-05 22:38 ` [PATCH AUTOSEL 6.12 439/486] drm/xe/relay: Don't use GFP_KERNEL for new transactions Sasha Levin
2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 469/486] drm/xe/pf: Reset GuC VF config when unprovisioning critical resource Sasha Levin
2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 477/486] drm/xe: Move suballocator init to after display init Sasha Levin
2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 478/486] drm/xe: Do not attempt to bootstrap VF in execlists mode Sasha Levin
2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 480/486] drm/xe/sa: Always call drm_suballoc_manager_fini() Sasha Levin
2025-05-05 22:39 ` [PATCH AUTOSEL 6.12 481/486] drm/xe: Reject BO eviction if BO is bound to current VM Sasha Levin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox