[Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

Intel-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
@ 2023-06-07 19:03 Zhanjun Dong
  2023-06-07 20:17 ` John Harrison
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Zhanjun Dong @ 2023-06-07 19:03 UTC (permalink / raw)
  To: intel-gfx, dri-devel

This attempts to avoid circular locking dependency between flush delayed work and intel_gt_reset.
Switched from cancel_delayed_work_sync to cancel_delayed_work, the non-sync version for reset path, it is safe as the worker has the trylock code to handle the lock; Meanwhile keep the sync version for park/fini to ensure the worker is not still running during suspend or shutdown.

WARNING: possible circular locking dependency detected
6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
------------------------------------------------------
kms_pipe_crc_ba/6415 is trying to acquire lock:
ffff88813e6cc640 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: __flush_work+0x42/0x530

but task is already holding lock:
ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (&gt->reset.mutex){+.+.}-{3:3}:
        lock_acquire+0xd8/0x2d0
        i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
        intel_gt_init_reset+0x65/0x80 [i915]
        intel_gt_common_init_early+0xe1/0x170 [i915]
        intel_root_gt_init_early+0x48/0x60 [i915]
        i915_driver_probe+0x671/0xcb0 [i915]
        i915_pci_probe+0xdc/0x210 [i915]
        pci_device_probe+0x95/0x120
        really_probe+0x164/0x3c0
        __driver_probe_device+0x73/0x160
        driver_probe_device+0x19/0xa0
        __driver_attach+0xb6/0x180
        bus_for_each_dev+0x77/0xd0
        bus_add_driver+0x114/0x210
        driver_register+0x5b/0x110
        __pfx_vgem_open+0x3/0x10 [vgem]
        do_one_initcall+0x57/0x270
        do_init_module+0x5f/0x220
        load_module+0x1ca4/0x1f00
        __do_sys_finit_module+0xb4/0x130
        do_syscall_64+0x3c/0x90
        entry_SYSCALL_64_after_hwframe+0x72/0xdc

-> #2 (fs_reclaim){+.+.}-{0:0}:
        lock_acquire+0xd8/0x2d0
        fs_reclaim_acquire+0xac/0xe0
        kmem_cache_alloc+0x32/0x260
        i915_vma_instance+0xb2/0xc60 [i915]
        i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
        vm_fault_gtt+0x22d/0xf60 [i915]
        __do_fault+0x2f/0x1d0
        do_pte_missing+0x4a/0xd20
        __handle_mm_fault+0x5b0/0x790
        handle_mm_fault+0xa2/0x230
        do_user_addr_fault+0x3ea/0xa10
        exc_page_fault+0x68/0x1a0
        asm_exc_page_fault+0x26/0x30

-> #1 (&gt->reset.backoff_srcu){++++}-{0:0}:
        lock_acquire+0xd8/0x2d0
        _intel_gt_reset_lock+0x57/0x330 [i915]
        guc_timestamp_ping+0x35/0x130 [i915]
        process_one_work+0x250/0x510
        worker_thread+0x4f/0x3a0
        kthread+0xff/0x130
        ret_from_fork+0x29/0x50

-> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
        check_prev_add+0x90/0xc60
        __lock_acquire+0x1998/0x2590
        lock_acquire+0xd8/0x2d0
        __flush_work+0x74/0x530
        __cancel_work_timer+0x14f/0x1f0
        intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
        intel_uc_reset_prepare+0x9c/0x120 [i915]
        reset_prepare+0x21/0x60 [i915]
        intel_gt_reset+0x1dd/0x470 [i915]
        intel_gt_reset_global+0xfb/0x170 [i915]
        intel_gt_handle_error+0x368/0x420 [i915]
        intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
        i915_wedged_set+0x29/0x40 [i915]
        simple_attr_write_xsigned.constprop.0+0xb4/0x110
        full_proxy_write+0x52/0x80
        vfs_write+0xc5/0x4f0
        ksys_write+0x64/0xe0
        do_syscall_64+0x3c/0x90
        entry_SYSCALL_64_after_hwframe+0x72/0xdc

other info that might help us debug this:
 Chain exists of:
  (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> &gt->reset.mutex
  Possible unsafe locking scenario:
        CPU0                    CPU1
        ----                    ----
   lock(&gt->reset.mutex);
                                lock(fs_reclaim);
                                lock(&gt->reset.mutex);
   lock((work_completion)(&(&guc->timestamp.work)->work));

 *** DEADLOCK ***
 3 locks held by kms_pipe_crc_ba/6415:
  #0: ffff888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
  #1: ffff888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: simple_attr_write_xsigned.constprop.0+0x47/0x110
  #2: ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]

Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..cca6960d3490 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1357,9 +1357,12 @@ static void guc_enable_busyness_worker(struct intel_guc *guc)
 	mod_delayed_work(system_highpri_wq, &guc->timestamp.work, guc->timestamp.ping_delay);
 }
 
-static void guc_cancel_busyness_worker(struct intel_guc *guc)
+static void guc_cancel_busyness_worker(struct intel_guc *guc, bool sync)
 {
-	cancel_delayed_work_sync(&guc->timestamp.work);
+	if (sync)
+		cancel_delayed_work_sync(&guc->timestamp.work);
+	else
+		cancel_delayed_work(&guc->timestamp.work);
 }
 
 static void __reset_guc_busyness_stats(struct intel_guc *guc)
@@ -1370,7 +1373,7 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc)
 	unsigned long flags;
 	ktime_t unused;
 
-	guc_cancel_busyness_worker(guc);
+	guc_cancel_busyness_worker(guc, 0);
 
 	spin_lock_irqsave(&guc->timestamp.lock, flags);
 
@@ -1485,7 +1488,7 @@ static int guc_init_engine_stats(struct intel_guc *guc)
 
 static void guc_fini_engine_stats(struct intel_guc *guc)
 {
-	guc_cancel_busyness_worker(guc);
+	guc_cancel_busyness_worker(guc, 1);
 }
 
 void intel_guc_busyness_park(struct intel_gt *gt)
@@ -1500,7 +1503,7 @@ void intel_guc_busyness_park(struct intel_gt *gt)
 	 * and causes an unclaimed register access warning. Cancel the worker
 	 * synchronously here.
 	 */
-	guc_cancel_busyness_worker(guc);
+	guc_cancel_busyness_worker(guc, 1);
 
 	/*
 	 * Before parking, we should sample engine busyness stats if we need to.
@@ -4503,7 +4506,7 @@ int intel_guc_submission_enable(struct intel_guc *guc)
 /* Note: By the time we're here, GuC may have already been reset */
 void intel_guc_submission_disable(struct intel_guc *guc)
 {
-	guc_cancel_busyness_worker(guc);
+	guc_cancel_busyness_worker(guc, 0);
 
 	/* Semaphore interrupt disable and route to host */
 	guc_route_semaphores(guc, false);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
  2023-06-07 19:03 [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset Zhanjun Dong
@ 2023-06-07 20:17 ` John Harrison
  2023-06-08 21:31   ` Dong, Zhanjun
  2023-06-08  0:19 ` Andi Shyti
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: John Harrison @ 2023-06-07 20:17 UTC (permalink / raw)
  To: Zhanjun Dong, intel-gfx, dri-devel, Umesh Nerlige Ramappa,
	Daniele Ceraolo Spurio

[-- Attachment #1: Type: text/plain, Size: 9637 bytes --]

On 6/7/2023 12:03, Zhanjun Dong wrote:
> This attempts to avoid circular locking dependency between flush delayed work and intel_gt_reset.
> Switched from cancel_delayed_work_sync to cancel_delayed_work, the non-sync version for reset path, it is safe as the worker has the trylock code to handle the lock; Meanwhile keep the sync version for park/fini to ensure the worker is not still running during suspend or shutdown.
>
> WARNING: possible circular locking dependency detected
> 6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
> ------------------------------------------------------
> kms_pipe_crc_ba/6415 is trying to acquire lock:
> ffff88813e6cc640 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: __flush_work+0x42/0x530
>
> but task is already holding lock:
> ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
>
> which lock already depends on the new lock.
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (&gt->reset.mutex){+.+.}-{3:3}:
>          lock_acquire+0xd8/0x2d0
>          i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
>          intel_gt_init_reset+0x65/0x80 [i915]
>          intel_gt_common_init_early+0xe1/0x170 [i915]
>          intel_root_gt_init_early+0x48/0x60 [i915]
>          i915_driver_probe+0x671/0xcb0 [i915]
>          i915_pci_probe+0xdc/0x210 [i915]
>          pci_device_probe+0x95/0x120
>          really_probe+0x164/0x3c0
>          __driver_probe_device+0x73/0x160
>          driver_probe_device+0x19/0xa0
>          __driver_attach+0xb6/0x180
>          bus_for_each_dev+0x77/0xd0
>          bus_add_driver+0x114/0x210
>          driver_register+0x5b/0x110
>          __pfx_vgem_open+0x3/0x10 [vgem]
>          do_one_initcall+0x57/0x270
>          do_init_module+0x5f/0x220
>          load_module+0x1ca4/0x1f00
>          __do_sys_finit_module+0xb4/0x130
>          do_syscall_64+0x3c/0x90
>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
> -> #2 (fs_reclaim){+.+.}-{0:0}:
>          lock_acquire+0xd8/0x2d0
>          fs_reclaim_acquire+0xac/0xe0
>          kmem_cache_alloc+0x32/0x260
>          i915_vma_instance+0xb2/0xc60 [i915]
>          i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
>          vm_fault_gtt+0x22d/0xf60 [i915]
>          __do_fault+0x2f/0x1d0
>          do_pte_missing+0x4a/0xd20
>          __handle_mm_fault+0x5b0/0x790
>          handle_mm_fault+0xa2/0x230
>          do_user_addr_fault+0x3ea/0xa10
>          exc_page_fault+0x68/0x1a0
>          asm_exc_page_fault+0x26/0x30
>
> -> #1 (&gt->reset.backoff_srcu){++++}-{0:0}:
>          lock_acquire+0xd8/0x2d0
>          _intel_gt_reset_lock+0x57/0x330 [i915]
>          guc_timestamp_ping+0x35/0x130 [i915]
>          process_one_work+0x250/0x510
>          worker_thread+0x4f/0x3a0
>          kthread+0xff/0x130
>          ret_from_fork+0x29/0x50
>
> -> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
>          check_prev_add+0x90/0xc60
>          __lock_acquire+0x1998/0x2590
>          lock_acquire+0xd8/0x2d0
>          __flush_work+0x74/0x530
>          __cancel_work_timer+0x14f/0x1f0
>          intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
>          intel_uc_reset_prepare+0x9c/0x120 [i915]
>          reset_prepare+0x21/0x60 [i915]
>          intel_gt_reset+0x1dd/0x470 [i915]
>          intel_gt_reset_global+0xfb/0x170 [i915]
>          intel_gt_handle_error+0x368/0x420 [i915]
>          intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
>          i915_wedged_set+0x29/0x40 [i915]
>          simple_attr_write_xsigned.constprop.0+0xb4/0x110
>          full_proxy_write+0x52/0x80
>          vfs_write+0xc5/0x4f0
>          ksys_write+0x64/0xe0
>          do_syscall_64+0x3c/0x90
>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
> other info that might help us debug this:
>   Chain exists of:
>    (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> &gt->reset.mutex
>    Possible unsafe locking scenario:
>          CPU0                    CPU1
>          ----                    ----
>     lock(&gt->reset.mutex);
>                                  lock(fs_reclaim);
>                                  lock(&gt->reset.mutex);
>     lock((work_completion)(&(&guc->timestamp.work)->work));
>
>   *** DEADLOCK ***
>   3 locks held by kms_pipe_crc_ba/6415:
>    #0: ffff888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
>    #1: ffff888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: simple_attr_write_xsigned.constprop.0+0x47/0x110
>    #2: ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
>
> Signed-off-by: Zhanjun Dong<zhanjun.dong@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 15 +++++++++------
>   1 file changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index a0e3ef1c65d2..cca6960d3490 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1357,9 +1357,12 @@ static void guc_enable_busyness_worker(struct intel_guc *guc)
>   	mod_delayed_work(system_highpri_wq, &guc->timestamp.work, guc->timestamp.ping_delay);
>   }
>   
> -static void guc_cancel_busyness_worker(struct intel_guc *guc)
> +static void guc_cancel_busyness_worker(struct intel_guc *guc, bool sync)
>   {
> -	cancel_delayed_work_sync(&guc->timestamp.work);
> +	if (sync)
> +		cancel_delayed_work_sync(&guc->timestamp.work);
> +	else
> +		cancel_delayed_work(&guc->timestamp.work);
>   }
>   
>   static void __reset_guc_busyness_stats(struct intel_guc *guc)
> @@ -1370,7 +1373,7 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc)
>   	unsigned long flags;
>   	ktime_t unused;
>   
> -	guc_cancel_busyness_worker(guc);
> +	guc_cancel_busyness_worker(guc, 0);
Should use true/false rather than 1/0 for bool values.

Also, this needs a comment actually in the code, not just in the patch 
description. E.g.:

    Attempting a synchronous cancel within the reset path leads to a
    circular mutex locking complaint by lockdep. However, it is safe to
    use an asynchronous cancel here. If the worker does actually run
    concurrently with a reset then it will early exit due to the
    mutex_trylock call rather than block.


>   
>   	spin_lock_irqsave(&guc->timestamp.lock, flags);
>   
> @@ -1485,7 +1488,7 @@ static int guc_init_engine_stats(struct intel_guc *guc)
>   
>   static void guc_fini_engine_stats(struct intel_guc *guc)
>   {
> -	guc_cancel_busyness_worker(guc);
> +	guc_cancel_busyness_worker(guc, 1);
>   }
>   
>   void intel_guc_busyness_park(struct intel_gt *gt)
> @@ -1500,7 +1503,7 @@ void intel_guc_busyness_park(struct intel_gt *gt)
>   	 * and causes an unclaimed register access warning. Cancel the worker
>   	 * synchronously here.
>   	 */
> -	guc_cancel_busyness_worker(guc);
> +	guc_cancel_busyness_worker(guc, 1);
>   
>   	/*
>   	 * Before parking, we should sample engine busyness stats if we need to.
> @@ -4503,7 +4506,7 @@ int intel_guc_submission_enable(struct intel_guc *guc)
>   /* Note: By the time we're here, GuC may have already been reset */
>   void intel_guc_submission_disable(struct intel_guc *guc)
>   {
> -	guc_cancel_busyness_worker(guc);
> +	guc_cancel_busyness_worker(guc, 0);
Hmm. I think this is going to need breaking up further back in the stack.

We definitely want to be doing a synchronous cancel in the general case 
of disabling submission (e.g. due to suspend or driver unload). But if 
this is happening as part of a reset call stack, then it is a problem.

AFAICT, the only way _submission_disable would be called within a reset 
is if __uc_init_hw() failed for some reason. So one option would be to 
add the sync/async flag to _submission_disable() as well and just make 
the init failure case async with other callers being sync.

A better option might be to add an 'are busyness stats enabled' boolean 
to the guc structure. And inside the cancel function, early exit if the 
worker is not actually enabled (and set the flag anywhere and everywhere 
that does a mod_work to enable it, which is just 
guc_enable_busyness_worker() I think?). That would mean that multiple 
disables do nothing. So e.g. a failed reset will cancel the worker 
asynchronously in reset prepare but then not try to cancel it again 
synchronously in submission disable. Hmm, except that init_hw has 
already enabled the worker by that point :(.

FYC: Umesh and Daniele... any thoughts?

I would be tempted to say is there any way we can just add a lockdep 
annotation to ignore this issue? The lockdep splat described in the 
patch description above seems like a false positive to me. Sure the 
reset lock is held by the reset code which is now trying to 
synchronously flush the busyness worker thread which also takes the 
reset lock. But the busyness worker thread does a trylock and will abort 
if the lock is already held. So no issue...

However. I think we do have a genuine issue with the internal delayed 
worker lock itself, which has maybe shown up in other lockdep splat 
reports. For example, if a worker thread triggers a reset (e.g. anything 
reset related coming in via a G2H) then the reset code is running inside 
a worker thread. Which is maybe holding an internal kernel worker thread 
lock? So if the reset path does a synchronous cancel of another worker 
thread, that also requires taking the worker thread lock and thus a 
deadlock occurs.

John.


>   
>   	/* Semaphore interrupt disable and route to host */
>   	guc_route_semaphores(guc, false);

[-- Attachment #2: Type: text/html, Size: 10361 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
  2023-06-07 20:17 ` John Harrison
@ 2023-06-08 21:31   ` Dong, Zhanjun
  0 siblings, 0 replies; 12+ messages in thread
From: Dong, Zhanjun @ 2023-06-08 21:31 UTC (permalink / raw)
  To: Harrison, John C, intel-gfx@lists.freedesktop.org,
	dri-devel@lists.freedesktop.org, Nerlige Ramappa, Umesh,
	Ceraolo Spurio, Daniele

[-- Attachment #1: Type: text/plain, Size: 10628 bytes --]

John mentioned 2 options:

  1.  “add the sync/async flag to _submission_disable()”

Consider to be a small change

  1.  add an 'are busyness stats enabled' boolean to the guc structure

Seems effected area among the flow and is much more than option 1.

I would like to discuss a bit more before moving forward.

Regards,
Zhanjun Dong

From: Harrison, John C <john.c.harrison@intel.com>
Sent: June 7, 2023 4:17 PM
To: Dong, Zhanjun <zhanjun.dong@intel.com>; intel-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org; Nerlige Ramappa, Umesh <umesh.nerlige.ramappa@intel.com>; Ceraolo Spurio, Daniele <daniele.ceraolospurio@intel.com>
Subject: Re: [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

On 6/7/2023 12:03, Zhanjun Dong wrote:


This attempts to avoid circular locking dependency between flush delayed work and intel_gt_reset.

Switched from cancel_delayed_work_sync to cancel_delayed_work, the non-sync version for reset path, it is safe as the worker has the trylock code to handle the lock; Meanwhile keep the sync version for park/fini to ensure the worker is not still running during suspend or shutdown.



WARNING: possible circular locking dependency detected

6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted

------------------------------------------------------

kms_pipe_crc_ba/6415 is trying to acquire lock:

ffff88813e6cc640 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: __flush_work+0x42/0x530



but task is already holding lock:

ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]



which lock already depends on the new lock.



the existing dependency chain (in reverse order) is:



-> #3 (&gt->reset.mutex){+.+.}-{3:3}:

        lock_acquire+0xd8/0x2d0

        i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]

        intel_gt_init_reset+0x65/0x80 [i915]

        intel_gt_common_init_early+0xe1/0x170 [i915]

        intel_root_gt_init_early+0x48/0x60 [i915]

        i915_driver_probe+0x671/0xcb0 [i915]

        i915_pci_probe+0xdc/0x210 [i915]

        pci_device_probe+0x95/0x120

        really_probe+0x164/0x3c0

        __driver_probe_device+0x73/0x160

        driver_probe_device+0x19/0xa0

        __driver_attach+0xb6/0x180

        bus_for_each_dev+0x77/0xd0

        bus_add_driver+0x114/0x210

        driver_register+0x5b/0x110

        __pfx_vgem_open+0x3/0x10 [vgem]

        do_one_initcall+0x57/0x270

        do_init_module+0x5f/0x220

        load_module+0x1ca4/0x1f00

        __do_sys_finit_module+0xb4/0x130

        do_syscall_64+0x3c/0x90

        entry_SYSCALL_64_after_hwframe+0x72/0xdc



-> #2 (fs_reclaim){+.+.}-{0:0}:

        lock_acquire+0xd8/0x2d0

        fs_reclaim_acquire+0xac/0xe0

        kmem_cache_alloc+0x32/0x260

        i915_vma_instance+0xb2/0xc60 [i915]

        i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]

        vm_fault_gtt+0x22d/0xf60 [i915]

        __do_fault+0x2f/0x1d0

        do_pte_missing+0x4a/0xd20

        __handle_mm_fault+0x5b0/0x790

        handle_mm_fault+0xa2/0x230

        do_user_addr_fault+0x3ea/0xa10

        exc_page_fault+0x68/0x1a0

        asm_exc_page_fault+0x26/0x30



-> #1 (&gt->reset.backoff_srcu){++++}-{0:0}:

        lock_acquire+0xd8/0x2d0

        _intel_gt_reset_lock+0x57/0x330 [i915]

        guc_timestamp_ping+0x35/0x130 [i915]

        process_one_work+0x250/0x510

        worker_thread+0x4f/0x3a0

        kthread+0xff/0x130

        ret_from_fork+0x29/0x50



-> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:

        check_prev_add+0x90/0xc60

        __lock_acquire+0x1998/0x2590

        lock_acquire+0xd8/0x2d0

        __flush_work+0x74/0x530

        __cancel_work_timer+0x14f/0x1f0

        intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]

        intel_uc_reset_prepare+0x9c/0x120 [i915]

        reset_prepare+0x21/0x60 [i915]

        intel_gt_reset+0x1dd/0x470 [i915]

        intel_gt_reset_global+0xfb/0x170 [i915]

        intel_gt_handle_error+0x368/0x420 [i915]

        intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]

        i915_wedged_set+0x29/0x40 [i915]

        simple_attr_write_xsigned.constprop.0+0xb4/0x110

        full_proxy_write+0x52/0x80

        vfs_write+0xc5/0x4f0

        ksys_write+0x64/0xe0

        do_syscall_64+0x3c/0x90

        entry_SYSCALL_64_after_hwframe+0x72/0xdc



other info that might help us debug this:

 Chain exists of:

  (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> &gt->reset.mutex

  Possible unsafe locking scenario:

        CPU0                    CPU1

        ----                    ----

   lock(&gt->reset.mutex);

                                lock(fs_reclaim);

                                lock(&gt->reset.mutex);

   lock((work_completion)(&(&guc->timestamp.work)->work));



 *** DEADLOCK ***

 3 locks held by kms_pipe_crc_ba/6415:

  #0: ffff888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0

  #1: ffff888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: simple_attr_write_xsigned.constprop.0+0x47/0x110

  #2: ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]



Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com><mailto:zhanjun.dong@intel.com>

---

 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 15 +++++++++------

 1 file changed, 9 insertions(+), 6 deletions(-)



diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

index a0e3ef1c65d2..cca6960d3490 100644

--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

@@ -1357,9 +1357,12 @@ static void guc_enable_busyness_worker(struct intel_guc *guc)

        mod_delayed_work(system_highpri_wq, &guc->timestamp.work, guc->timestamp.ping_delay);

 }



-static void guc_cancel_busyness_worker(struct intel_guc *guc)

+static void guc_cancel_busyness_worker(struct intel_guc *guc, bool sync)

 {

- cancel_delayed_work_sync(&guc->timestamp.work);

+       if (sync)

+        cancel_delayed_work_sync(&guc->timestamp.work);

+       else

+        cancel_delayed_work(&guc->timestamp.work);

 }



 static void __reset_guc_busyness_stats(struct intel_guc *guc)

@@ -1370,7 +1373,7 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc)

        unsigned long flags;

        ktime_t unused;



-       guc_cancel_busyness_worker(guc);

+       guc_cancel_busyness_worker(guc, 0);
Should use true/false rather than 1/0 for bool values.

Also, this needs a comment actually in the code, not just in the patch description. E.g.:
Attempting a synchronous cancel within the reset path leads to a circular mutex locking complaint by lockdep. However, it is safe to use an asynchronous cancel here. If the worker does actually run concurrently with a reset then it will early exit due to the mutex_trylock call rather than block.







  spin_lock_irqsave(&guc->timestamp.lock, flags);



@@ -1485,7 +1488,7 @@ static int guc_init_engine_stats(struct intel_guc *guc)



 static void guc_fini_engine_stats(struct intel_guc *guc)

 {

-       guc_cancel_busyness_worker(guc);

+       guc_cancel_busyness_worker(guc, 1);

 }



 void intel_guc_busyness_park(struct intel_gt *gt)

@@ -1500,7 +1503,7 @@ void intel_guc_busyness_park(struct intel_gt *gt)

         * and causes an unclaimed register access warning. Cancel the worker

         * synchronously here.

         */

-       guc_cancel_busyness_worker(guc);

+       guc_cancel_busyness_worker(guc, 1);



        /*

         * Before parking, we should sample engine busyness stats if we need to.

@@ -4503,7 +4506,7 @@ int intel_guc_submission_enable(struct intel_guc *guc)

 /* Note: By the time we're here, GuC may have already been reset */

 void intel_guc_submission_disable(struct intel_guc *guc)

 {

-       guc_cancel_busyness_worker(guc);

+       guc_cancel_busyness_worker(guc, 0);
Hmm. I think this is going to need breaking up further back in the stack.

We definitely want to be doing a synchronous cancel in the general case of disabling submission (e.g. due to suspend or driver unload). But if this is happening as part of a reset call stack, then it is a problem.

AFAICT, the only way _submission_disable would be called within a reset is if __uc_init_hw() failed for some reason. So one option would be to add the sync/async flag to _submission_disable() as well and just make the init failure case async with other callers being sync.

A better option might be to add an 'are busyness stats enabled' boolean to the guc structure. And inside the cancel function, early exit if the worker is not actually enabled (and set the flag anywhere and everywhere that does a mod_work to enable it, which is just guc_enable_busyness_worker() I think?). That would mean that multiple disables do nothing. So e.g. a failed reset will cancel the worker asynchronously in reset prepare but then not try to cancel it again synchronously in submission disable. Hmm, except that init_hw has already enabled the worker by that point :(.

FYC: Umesh and Daniele... any thoughts?

I would be tempted to say is there any way we can just add a lockdep annotation to ignore this issue? The lockdep splat described in the patch description above seems like a false positive to me. Sure the reset lock is held by the reset code which is now trying to synchronously flush the busyness worker thread which also takes the reset lock. But the busyness worker thread does a trylock and will abort if the lock is already held. So no issue...

However. I think we do have a genuine issue with the internal delayed worker lock itself, which has maybe shown up in other lockdep splat reports. For example, if a worker thread triggers a reset (e.g. anything reset related coming in via a G2H) then the reset code is running inside a worker thread. Which is maybe holding an internal kernel worker thread lock? So if the reset path does a synchronous cancel of another worker thread, that also requires taking the worker thread lock and thus a deadlock occurs.

John.








        /* Semaphore interrupt disable and route to host */

        guc_route_semaphores(guc, false);


[-- Attachment #2: Type: text/html, Size: 22985 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
  2023-06-07 19:03 [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset Zhanjun Dong
  2023-06-07 20:17 ` John Harrison
@ 2023-06-08  0:19 ` Andi Shyti
  2023-06-08 21:17   ` Dong, Zhanjun
  2023-06-08  2:34 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Avoid circular locking dependency when flush delayed work on gt reset (rev2) Patchwork
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Andi Shyti @ 2023-06-08  0:19 UTC (permalink / raw)
  To: Zhanjun Dong; +Cc: intel-gfx, dri-devel

Hi Dong,

On Wed, Jun 07, 2023 at 12:03:50PM -0700, Zhanjun Dong wrote:
> This attempts to avoid circular locking dependency between flush delayed work and intel_gt_reset.
> Switched from cancel_delayed_work_sync to cancel_delayed_work, the non-sync version for reset path, it is safe as the worker has the trylock code to handle the lock; Meanwhile keep the sync version for park/fini to ensure the worker is not still running during suspend or shutdown.
> 
> WARNING: possible circular locking dependency detected
> 6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
> ------------------------------------------------------
> kms_pipe_crc_ba/6415 is trying to acquire lock:
> ffff88813e6cc640 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: __flush_work+0x42/0x530
> 
> but task is already holding lock:
> ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
> 
> which lock already depends on the new lock.
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #3 (&gt->reset.mutex){+.+.}-{3:3}:
>         lock_acquire+0xd8/0x2d0
>         i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
>         intel_gt_init_reset+0x65/0x80 [i915]
>         intel_gt_common_init_early+0xe1/0x170 [i915]
>         intel_root_gt_init_early+0x48/0x60 [i915]
>         i915_driver_probe+0x671/0xcb0 [i915]
>         i915_pci_probe+0xdc/0x210 [i915]
>         pci_device_probe+0x95/0x120
>         really_probe+0x164/0x3c0
>         __driver_probe_device+0x73/0x160
>         driver_probe_device+0x19/0xa0
>         __driver_attach+0xb6/0x180
>         bus_for_each_dev+0x77/0xd0
>         bus_add_driver+0x114/0x210
>         driver_register+0x5b/0x110
>         __pfx_vgem_open+0x3/0x10 [vgem]
>         do_one_initcall+0x57/0x270
>         do_init_module+0x5f/0x220
>         load_module+0x1ca4/0x1f00
>         __do_sys_finit_module+0xb4/0x130
>         do_syscall_64+0x3c/0x90
>         entry_SYSCALL_64_after_hwframe+0x72/0xdc
> 
> -> #2 (fs_reclaim){+.+.}-{0:0}:
>         lock_acquire+0xd8/0x2d0
>         fs_reclaim_acquire+0xac/0xe0
>         kmem_cache_alloc+0x32/0x260
>         i915_vma_instance+0xb2/0xc60 [i915]
>         i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
>         vm_fault_gtt+0x22d/0xf60 [i915]
>         __do_fault+0x2f/0x1d0
>         do_pte_missing+0x4a/0xd20
>         __handle_mm_fault+0x5b0/0x790
>         handle_mm_fault+0xa2/0x230
>         do_user_addr_fault+0x3ea/0xa10
>         exc_page_fault+0x68/0x1a0
>         asm_exc_page_fault+0x26/0x30
> 
> -> #1 (&gt->reset.backoff_srcu){++++}-{0:0}:
>         lock_acquire+0xd8/0x2d0
>         _intel_gt_reset_lock+0x57/0x330 [i915]
>         guc_timestamp_ping+0x35/0x130 [i915]
>         process_one_work+0x250/0x510
>         worker_thread+0x4f/0x3a0
>         kthread+0xff/0x130
>         ret_from_fork+0x29/0x50
> 
> -> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
>         check_prev_add+0x90/0xc60
>         __lock_acquire+0x1998/0x2590
>         lock_acquire+0xd8/0x2d0
>         __flush_work+0x74/0x530
>         __cancel_work_timer+0x14f/0x1f0
>         intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
>         intel_uc_reset_prepare+0x9c/0x120 [i915]
>         reset_prepare+0x21/0x60 [i915]
>         intel_gt_reset+0x1dd/0x470 [i915]
>         intel_gt_reset_global+0xfb/0x170 [i915]
>         intel_gt_handle_error+0x368/0x420 [i915]
>         intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
>         i915_wedged_set+0x29/0x40 [i915]
>         simple_attr_write_xsigned.constprop.0+0xb4/0x110
>         full_proxy_write+0x52/0x80
>         vfs_write+0xc5/0x4f0
>         ksys_write+0x64/0xe0
>         do_syscall_64+0x3c/0x90
>         entry_SYSCALL_64_after_hwframe+0x72/0xdc
> 
> other info that might help us debug this:
>  Chain exists of:
>   (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> &gt->reset.mutex
>   Possible unsafe locking scenario:
>         CPU0                    CPU1
>         ----                    ----
>    lock(&gt->reset.mutex);
>                                 lock(fs_reclaim);
>                                 lock(&gt->reset.mutex);
>    lock((work_completion)(&(&guc->timestamp.work)->work));
> 
>  *** DEADLOCK ***
>  3 locks held by kms_pipe_crc_ba/6415:
>   #0: ffff888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
>   #1: ffff888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: simple_attr_write_xsigned.constprop.0+0x47/0x110
>   #2: ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
> 
> Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>

Andrzej's r-b is missing here.

> ---

Please add a version to your patch and a changelog.

Thanks,
Andi

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
  2023-06-08  0:19 ` Andi Shyti
@ 2023-06-08 21:17   ` Dong, Zhanjun
  0 siblings, 0 replies; 12+ messages in thread
From: Dong, Zhanjun @ 2023-06-08 21:17 UTC (permalink / raw)
  To: Andi Shyti; +Cc: intel-gfx, dri-devel

Hi Andi,

Thanks for comments. Info would be updated on next revision, which is on 
the way.

Regards,

Zhanjun Dong

On 2023-06-07 8:19 p.m., Andi Shyti wrote:
> Hi Dong,
>
> On Wed, Jun 07, 2023 at 12:03:50PM -0700, Zhanjun Dong wrote:
>> This attempts to avoid circular locking dependency between flush delayed work and intel_gt_reset.
>> Switched from cancel_delayed_work_sync to cancel_delayed_work, the non-sync version for reset path, it is safe as the worker has the trylock code to handle the lock; Meanwhile keep the sync version for park/fini to ensure the worker is not still running during suspend or shutdown.
>>
>> WARNING: possible circular locking dependency detected
>> 6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
>> ------------------------------------------------------
>> kms_pipe_crc_ba/6415 is trying to acquire lock:
>> ffff88813e6cc640 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: __flush_work+0x42/0x530
>>
>> but task is already holding lock:
>> ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
>>
>> which lock already depends on the new lock.
>>
>> the existing dependency chain (in reverse order) is:
>>
>> -> #3 (&gt->reset.mutex){+.+.}-{3:3}:
>>          lock_acquire+0xd8/0x2d0
>>          i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
>>          intel_gt_init_reset+0x65/0x80 [i915]
>>          intel_gt_common_init_early+0xe1/0x170 [i915]
>>          intel_root_gt_init_early+0x48/0x60 [i915]
>>          i915_driver_probe+0x671/0xcb0 [i915]
>>          i915_pci_probe+0xdc/0x210 [i915]
>>          pci_device_probe+0x95/0x120
>>          really_probe+0x164/0x3c0
>>          __driver_probe_device+0x73/0x160
>>          driver_probe_device+0x19/0xa0
>>          __driver_attach+0xb6/0x180
>>          bus_for_each_dev+0x77/0xd0
>>          bus_add_driver+0x114/0x210
>>          driver_register+0x5b/0x110
>>          __pfx_vgem_open+0x3/0x10 [vgem]
>>          do_one_initcall+0x57/0x270
>>          do_init_module+0x5f/0x220
>>          load_module+0x1ca4/0x1f00
>>          __do_sys_finit_module+0xb4/0x130
>>          do_syscall_64+0x3c/0x90
>>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
>>
>> -> #2 (fs_reclaim){+.+.}-{0:0}:
>>          lock_acquire+0xd8/0x2d0
>>          fs_reclaim_acquire+0xac/0xe0
>>          kmem_cache_alloc+0x32/0x260
>>          i915_vma_instance+0xb2/0xc60 [i915]
>>          i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
>>          vm_fault_gtt+0x22d/0xf60 [i915]
>>          __do_fault+0x2f/0x1d0
>>          do_pte_missing+0x4a/0xd20
>>          __handle_mm_fault+0x5b0/0x790
>>          handle_mm_fault+0xa2/0x230
>>          do_user_addr_fault+0x3ea/0xa10
>>          exc_page_fault+0x68/0x1a0
>>          asm_exc_page_fault+0x26/0x30
>>
>> -> #1 (&gt->reset.backoff_srcu){++++}-{0:0}:
>>          lock_acquire+0xd8/0x2d0
>>          _intel_gt_reset_lock+0x57/0x330 [i915]
>>          guc_timestamp_ping+0x35/0x130 [i915]
>>          process_one_work+0x250/0x510
>>          worker_thread+0x4f/0x3a0
>>          kthread+0xff/0x130
>>          ret_from_fork+0x29/0x50
>>
>> -> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
>>          check_prev_add+0x90/0xc60
>>          __lock_acquire+0x1998/0x2590
>>          lock_acquire+0xd8/0x2d0
>>          __flush_work+0x74/0x530
>>          __cancel_work_timer+0x14f/0x1f0
>>          intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
>>          intel_uc_reset_prepare+0x9c/0x120 [i915]
>>          reset_prepare+0x21/0x60 [i915]
>>          intel_gt_reset+0x1dd/0x470 [i915]
>>          intel_gt_reset_global+0xfb/0x170 [i915]
>>          intel_gt_handle_error+0x368/0x420 [i915]
>>          intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
>>          i915_wedged_set+0x29/0x40 [i915]
>>          simple_attr_write_xsigned.constprop.0+0xb4/0x110
>>          full_proxy_write+0x52/0x80
>>          vfs_write+0xc5/0x4f0
>>          ksys_write+0x64/0xe0
>>          do_syscall_64+0x3c/0x90
>>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
>>
>> other info that might help us debug this:
>>   Chain exists of:
>>    (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> &gt->reset.mutex
>>    Possible unsafe locking scenario:
>>          CPU0                    CPU1
>>          ----                    ----
>>     lock(&gt->reset.mutex);
>>                                  lock(fs_reclaim);
>>                                  lock(&gt->reset.mutex);
>>     lock((work_completion)(&(&guc->timestamp.work)->work));
>>
>>   *** DEADLOCK ***
>>   3 locks held by kms_pipe_crc_ba/6415:
>>    #0: ffff888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
>>    #1: ffff888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: simple_attr_write_xsigned.constprop.0+0x47/0x110
>>    #2: ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
>>
>> Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
> Andrzej's r-b is missing here.
>
>> ---
> Please add a version to your patch and a changelog.
>
> Thanks,
> Andi

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Avoid circular locking dependency when flush delayed work on gt reset (rev2)
  2023-06-07 19:03 [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset Zhanjun Dong
  2023-06-07 20:17 ` John Harrison
  2023-06-08  0:19 ` Andi Shyti
@ 2023-06-08  2:34 ` Patchwork
  2023-06-08  2:57 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
  2023-06-08 22:01 ` [Intel-gfx] ✗ Fi.CI.IGT: failure " Patchwork
  4 siblings, 0 replies; 12+ messages in thread
From: Patchwork @ 2023-06-08  2:34 UTC (permalink / raw)
  To: Zhanjun Dong; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Avoid circular locking dependency when flush delayed work on gt reset (rev2)
URL   : https://patchwork.freedesktop.org/series/118898/
State : warning

== Summary ==

Error: dim checkpatch failed
e1bc5c0eba1a drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
-:7: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#7: 
This attempts to avoid circular locking dependency between flush delayed work and intel_gt_reset.

total: 0 errors, 1 warnings, 0 checks, 46 lines checked



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915: Avoid circular locking dependency when flush delayed work on gt reset (rev2)
  2023-06-07 19:03 [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset Zhanjun Dong
                   ` (2 preceding siblings ...)
  2023-06-08  2:34 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Avoid circular locking dependency when flush delayed work on gt reset (rev2) Patchwork
@ 2023-06-08  2:57 ` Patchwork
  2023-06-08 22:01 ` [Intel-gfx] ✗ Fi.CI.IGT: failure " Patchwork
  4 siblings, 0 replies; 12+ messages in thread
From: Patchwork @ 2023-06-08  2:57 UTC (permalink / raw)
  To: Zhanjun Dong; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 7339 bytes --]

== Series Details ==

Series: drm/i915: Avoid circular locking dependency when flush delayed work on gt reset (rev2)
URL   : https://patchwork.freedesktop.org/series/118898/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_13246 -> Patchwork_118898v2
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/index.html

Participating hosts (37 -> 35)
------------------------------

  Missing    (2): fi-tgl-1115g4 fi-snb-2520m 

Known issues
------------

  Here are the changes found in Patchwork_118898v2 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_lmem_swapping@basic:
    - bat-adln-1:         NOTRUN -> [SKIP][1] ([i915#4613]) +3 similar issues
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-adln-1/igt@gem_lmem_swapping@basic.html

  * igt@i915_pm_rps@basic-api:
    - bat-adln-1:         NOTRUN -> [SKIP][2] ([i915#6621])
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-adln-1/igt@i915_pm_rps@basic-api.html

  * igt@i915_selftest@live@hangcheck:
    - bat-dg2-11:         [PASS][3] -> [ABORT][4] ([i915#7913] / [i915#7979])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/bat-dg2-11/igt@i915_selftest@live@hangcheck.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-dg2-11/igt@i915_selftest@live@hangcheck.html

  * igt@i915_selftest@live@slpc:
    - bat-rpls-2:         NOTRUN -> [DMESG-WARN][5] ([i915#6367])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-rpls-2/igt@i915_selftest@live@slpc.html

  * igt@i915_suspend@basic-s2idle-without-i915:
    - bat-rpls-2:         NOTRUN -> [ABORT][6] ([i915#6687])
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-rpls-2/igt@i915_suspend@basic-s2idle-without-i915.html

  * igt@i915_suspend@basic-s3-without-i915:
    - bat-rpls-1:         NOTRUN -> [ABORT][7] ([i915#6687] / [i915#7978])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-rpls-1/igt@i915_suspend@basic-s3-without-i915.html

  * igt@kms_chamelium_hpd@common-hpd-after-suspend:
    - bat-adln-1:         NOTRUN -> [SKIP][8] ([i915#7828])
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-adln-1/igt@kms_chamelium_hpd@common-hpd-after-suspend.html

  * igt@kms_pipe_crc_basic@nonblocking-crc-frame-sequence:
    - bat-dg2-11:         NOTRUN -> [SKIP][9] ([i915#1845] / [i915#5354]) +2 similar issues
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-dg2-11/igt@kms_pipe_crc_basic@nonblocking-crc-frame-sequence.html

  * igt@kms_setmode@basic-clone-single-crtc:
    - bat-adln-1:         NOTRUN -> [SKIP][10] ([i915#3555] / [i915#4579])
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-adln-1/igt@kms_setmode@basic-clone-single-crtc.html

  * igt@prime_vgem@basic-write:
    - bat-adln-1:         NOTRUN -> [SKIP][11] ([fdo#109295] / [i915#3291]) +2 similar issues
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-adln-1/igt@prime_vgem@basic-write.html

  
#### Possible fixes ####

  * igt@i915_selftest@live@migrate:
    - bat-dg2-11:         [DMESG-WARN][12] ([i915#7699]) -> [PASS][13]
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/bat-dg2-11/igt@i915_selftest@live@migrate.html
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-dg2-11/igt@i915_selftest@live@migrate.html

  * igt@i915_selftest@live@requests:
    - bat-rpls-2:         [ABORT][14] ([i915#4983] / [i915#7913]) -> [PASS][15]
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/bat-rpls-2/igt@i915_selftest@live@requests.html
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-rpls-2/igt@i915_selftest@live@requests.html
    - bat-rpls-1:         [ABORT][16] ([i915#7911] / [i915#7920] / [i915#7982]) -> [PASS][17]
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/bat-rpls-1/igt@i915_selftest@live@requests.html
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-rpls-1/igt@i915_selftest@live@requests.html

  * {igt@kms_pipe_crc_basic@compare-crc-sanitycheck-xr24@pipe-b-dp-1}:
    - bat-dg2-8:          [FAIL][18] -> [PASS][19]
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/bat-dg2-8/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-xr24@pipe-b-dp-1.html
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-dg2-8/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-xr24@pipe-b-dp-1.html

  * igt@kms_psr@sprite_plane_onoff:
    - bat-adln-1:         [ABORT][20] ([i915#8442]) -> [PASS][21]
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/bat-adln-1/igt@kms_psr@sprite_plane_onoff.html
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/bat-adln-1/igt@kms_psr@sprite_plane_onoff.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#109295]: https://bugs.freedesktop.org/show_bug.cgi?id=109295
  [i915#1845]: https://gitlab.freedesktop.org/drm/intel/issues/1845
  [i915#3291]: https://gitlab.freedesktop.org/drm/intel/issues/3291
  [i915#3555]: https://gitlab.freedesktop.org/drm/intel/issues/3555
  [i915#4423]: https://gitlab.freedesktop.org/drm/intel/issues/4423
  [i915#4579]: https://gitlab.freedesktop.org/drm/intel/issues/4579
  [i915#4613]: https://gitlab.freedesktop.org/drm/intel/issues/4613
  [i915#4983]: https://gitlab.freedesktop.org/drm/intel/issues/4983
  [i915#5354]: https://gitlab.freedesktop.org/drm/intel/issues/5354
  [i915#6367]: https://gitlab.freedesktop.org/drm/intel/issues/6367
  [i915#6621]: https://gitlab.freedesktop.org/drm/intel/issues/6621
  [i915#6687]: https://gitlab.freedesktop.org/drm/intel/issues/6687
  [i915#6763]: https://gitlab.freedesktop.org/drm/intel/issues/6763
  [i915#7059]: https://gitlab.freedesktop.org/drm/intel/issues/7059
  [i915#7699]: https://gitlab.freedesktop.org/drm/intel/issues/7699
  [i915#7828]: https://gitlab.freedesktop.org/drm/intel/issues/7828
  [i915#7911]: https://gitlab.freedesktop.org/drm/intel/issues/7911
  [i915#7913]: https://gitlab.freedesktop.org/drm/intel/issues/7913
  [i915#7920]: https://gitlab.freedesktop.org/drm/intel/issues/7920
  [i915#7978]: https://gitlab.freedesktop.org/drm/intel/issues/7978
  [i915#7979]: https://gitlab.freedesktop.org/drm/intel/issues/7979
  [i915#7982]: https://gitlab.freedesktop.org/drm/intel/issues/7982
  [i915#8442]: https://gitlab.freedesktop.org/drm/intel/issues/8442


Build changes
-------------

  * Linux: CI_DRM_13246 -> Patchwork_118898v2

  CI-20190529: 20190529
  CI_DRM_13246: 140b8a06031370fc0cf0ef5a5ca8cca0a4083951 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_7321: f52cfd53f353fdaca537c810fbc35e09ffd07345 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_118898v2: 140b8a06031370fc0cf0ef5a5ca8cca0a4083951 @ git://anongit.freedesktop.org/gfx-ci/linux


### Linux commits

e1a065814e32 drm/i915: Avoid circular locking dependency when flush delayed work on gt reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/index.html

[-- Attachment #2: Type: text/html, Size: 8236 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Intel-gfx] ✗ Fi.CI.IGT: failure for drm/i915: Avoid circular locking dependency when flush delayed work on gt reset (rev2)
  2023-06-07 19:03 [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset Zhanjun Dong
                   ` (3 preceding siblings ...)
  2023-06-08  2:57 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
@ 2023-06-08 22:01 ` Patchwork
  4 siblings, 0 replies; 12+ messages in thread
From: Patchwork @ 2023-06-08 22:01 UTC (permalink / raw)
  To: Dong, Zhanjun; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 16248 bytes --]

== Series Details ==

Series: drm/i915: Avoid circular locking dependency when flush delayed work on gt reset (rev2)
URL   : https://patchwork.freedesktop.org/series/118898/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_13246_full -> Patchwork_118898v2_full
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_118898v2_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_118898v2_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

Participating hosts (7 -> 7)
------------------------------

  No changes in participating hosts

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_118898v2_full:

### IGT changes ###

#### Possible regressions ####

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic:
    - shard-snb:          [PASS][1] -> [FAIL][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-snb5/igt@kms_cursor_legacy@flip-vs-cursor-atomic.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-snb2/igt@kms_cursor_legacy@flip-vs-cursor-atomic.html

  
Known issues
------------

  Here are the changes found in Patchwork_118898v2_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_eio@in-flight-suspend:
    - shard-apl:          [PASS][3] -> [ABORT][4] ([i915#180])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-apl1/igt@gem_eio@in-flight-suspend.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-apl1/igt@gem_eio@in-flight-suspend.html

  * igt@gem_lmem_swapping@random:
    - shard-glk:          NOTRUN -> [SKIP][5] ([fdo#109271] / [i915#4613])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk9/igt@gem_lmem_swapping@random.html

  * igt@gem_userptr_blits@dmabuf-sync:
    - shard-glk:          NOTRUN -> [SKIP][6] ([fdo#109271] / [i915#3323])
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk9/igt@gem_userptr_blits@dmabuf-sync.html

  * igt@i915_selftest@mock@sanitycheck:
    - shard-snb:          [PASS][7] -> [ABORT][8] ([i915#4528] / [i915#4579])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-snb6/igt@i915_selftest@mock@sanitycheck.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-snb5/igt@i915_selftest@mock@sanitycheck.html

  * igt@kms_ccs@pipe-c-crc-sprite-planes-basic-y_tiled_gen12_mc_ccs:
    - shard-glk:          NOTRUN -> [SKIP][9] ([fdo#109271] / [i915#3886]) +2 similar issues
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk3/igt@kms_ccs@pipe-c-crc-sprite-planes-basic-y_tiled_gen12_mc_ccs.html

  * igt@kms_chamelium_color@ctm-blue-to-red:
    - shard-glk:          NOTRUN -> [SKIP][10] ([fdo#109271]) +31 similar issues
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk9/igt@kms_chamelium_color@ctm-blue-to-red.html

  * igt@kms_cursor_crc@cursor-random-32x32:
    - shard-glk:          NOTRUN -> [SKIP][11] ([fdo#109271] / [i915#4579]) +2 similar issues
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk9/igt@kms_cursor_crc@cursor-random-32x32.html

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size:
    - shard-apl:          [PASS][12] -> [FAIL][13] ([i915#2346])
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-apl1/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-apl1/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html
    - shard-glk:          [PASS][14] -> [FAIL][15] ([i915#2346])
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-glk7/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk1/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html

  * igt@kms_hdr@bpc-switch-dpms@pipe-a-dp-1:
    - shard-apl:          [PASS][16] -> [FAIL][17] ([i915#1188])
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-apl3/igt@kms_hdr@bpc-switch-dpms@pipe-a-dp-1.html
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-apl1/igt@kms_hdr@bpc-switch-dpms@pipe-a-dp-1.html

  * igt@kms_plane_scaling@plane-upscale-with-modifiers-20x20@pipe-b-vga-1:
    - shard-snb:          NOTRUN -> [SKIP][18] ([fdo#109271] / [i915#4579]) +9 similar issues
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-snb5/igt@kms_plane_scaling@plane-upscale-with-modifiers-20x20@pipe-b-vga-1.html

  * igt@kms_plane_scaling@plane-upscale-with-modifiers-factor-0-25@pipe-a-vga-1:
    - shard-snb:          NOTRUN -> [SKIP][19] ([fdo#109271]) +15 similar issues
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-snb4/igt@kms_plane_scaling@plane-upscale-with-modifiers-factor-0-25@pipe-a-vga-1.html

  
#### Possible fixes ####

  * igt@gem_eio@reset-stress:
    - {shard-dg1}:        [FAIL][20] ([i915#5784]) -> [PASS][21]
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-dg1-18/igt@gem_eio@reset-stress.html
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-dg1-18/igt@gem_eio@reset-stress.html

  * igt@gem_exec_fair@basic-none-solo@rcs0:
    - shard-apl:          [FAIL][22] ([i915#2842]) -> [PASS][23]
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-apl2/igt@gem_exec_fair@basic-none-solo@rcs0.html
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-apl2/igt@gem_exec_fair@basic-none-solo@rcs0.html

  * igt@gem_exec_fair@basic-pace@rcs0:
    - {shard-rkl}:        [FAIL][24] ([i915#2842]) -> [PASS][25]
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-rkl-2/igt@gem_exec_fair@basic-pace@rcs0.html
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-rkl-3/igt@gem_exec_fair@basic-pace@rcs0.html

  * igt@gem_lmem_swapping@smem-oom@lmem0:
    - {shard-dg1}:        [TIMEOUT][26] ([i915#5493]) -> [PASS][27]
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-dg1-18/igt@gem_lmem_swapping@smem-oom@lmem0.html
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-dg1-18/igt@gem_lmem_swapping@smem-oom@lmem0.html

  * igt@gen9_exec_parse@allowed-single:
    - shard-glk:          [ABORT][28] ([i915#5566]) -> [PASS][29]
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-glk9/igt@gen9_exec_parse@allowed-single.html
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk3/igt@gen9_exec_parse@allowed-single.html

  * igt@i915_hangman@gt-engine-hang@rcs0:
    - shard-glk:          [INCOMPLETE][30] -> [PASS][31]
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-glk3/igt@i915_hangman@gt-engine-hang@rcs0.html
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk9/igt@i915_hangman@gt-engine-hang@rcs0.html

  * igt@i915_pm_rc6_residency@rc6-idle@rcs0:
    - {shard-dg1}:        [FAIL][32] ([i915#3591]) -> [PASS][33]
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-dg1-16/igt@i915_pm_rc6_residency@rc6-idle@rcs0.html
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-dg1-19/igt@i915_pm_rc6_residency@rc6-idle@rcs0.html

  * igt@i915_pm_rpm@modeset-lpsp-stress-no-wait:
    - {shard-rkl}:        [SKIP][34] ([i915#1397]) -> [PASS][35]
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-rkl-1/igt@i915_pm_rpm@modeset-lpsp-stress-no-wait.html
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-rkl-7/igt@i915_pm_rpm@modeset-lpsp-stress-no-wait.html

  * igt@i915_pm_rpm@modeset-non-lpsp-stress:
    - {shard-dg1}:        [SKIP][36] ([i915#1397]) -> [PASS][37] +1 similar issue
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-dg1-19/igt@i915_pm_rpm@modeset-non-lpsp-stress.html
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-dg1-13/igt@i915_pm_rpm@modeset-non-lpsp-stress.html

  * igt@i915_selftest@live@gt_heartbeat:
    - shard-apl:          [DMESG-FAIL][38] ([i915#5334]) -> [PASS][39]
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-apl1/igt@i915_selftest@live@gt_heartbeat.html
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-apl4/igt@i915_selftest@live@gt_heartbeat.html

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions:
    - shard-glk:          [FAIL][40] ([i915#2346]) -> [PASS][41]
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-glk2/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk9/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html
    - shard-apl:          [FAIL][42] ([i915#2346]) -> [PASS][43]
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-apl2/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-apl3/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html

  * igt@kms_cursor_legacy@single-move@pipe-b:
    - {shard-dg1}:        [INCOMPLETE][44] ([i915#8011] / [i915#8347]) -> [PASS][45]
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-dg1-19/igt@kms_cursor_legacy@single-move@pipe-b.html
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-dg1-16/igt@kms_cursor_legacy@single-move@pipe-b.html

  * igt@kms_flip@2x-flip-vs-expired-vblank-interruptible@bc-hdmi-a1-hdmi-a2:
    - shard-glk:          [FAIL][46] ([i915#79]) -> [PASS][47]
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-glk3/igt@kms_flip@2x-flip-vs-expired-vblank-interruptible@bc-hdmi-a1-hdmi-a2.html
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk6/igt@kms_flip@2x-flip-vs-expired-vblank-interruptible@bc-hdmi-a1-hdmi-a2.html

  * igt@kms_rotation_crc@multiplane-rotation-cropping-top:
    - shard-glk:          [DMESG-FAIL][48] ([i915#118]) -> [PASS][49]
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-glk3/igt@kms_rotation_crc@multiplane-rotation-cropping-top.html
   [49]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-glk9/igt@kms_rotation_crc@multiplane-rotation-cropping-top.html

  * igt@kms_rotation_crc@sprite-rotation-90:
    - {shard-rkl}:        [ABORT][50] ([i915#8178] / [i915#8311]) -> [PASS][51]
   [50]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_13246/shard-rkl-6/igt@kms_rotation_crc@sprite-rotation-90.html
   [51]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/shard-rkl-3/igt@kms_rotation_crc@sprite-rotation-90.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109289]: https://bugs.freedesktop.org/show_bug.cgi?id=109289
  [fdo#109313]: https://bugs.freedesktop.org/show_bug.cgi?id=109313
  [fdo#109315]: https://bugs.freedesktop.org/show_bug.cgi?id=109315
  [fdo#110189]: https://bugs.freedesktop.org/show_bug.cgi?id=110189
  [fdo#110723]: https://bugs.freedesktop.org/show_bug.cgi?id=110723
  [fdo#111068]: https://bugs.freedesktop.org/show_bug.cgi?id=111068
  [fdo#111614]: https://bugs.freedesktop.org/show_bug.cgi?id=111614
  [fdo#111825]: https://bugs.freedesktop.org/show_bug.cgi?id=111825
  [i915#1072]: https://gitlab.freedesktop.org/drm/intel/issues/1072
  [i915#118]: https://gitlab.freedesktop.org/drm/intel/issues/118
  [i915#1188]: https://gitlab.freedesktop.org/drm/intel/issues/1188
  [i915#1397]: https://gitlab.freedesktop.org/drm/intel/issues/1397
  [i915#180]: https://gitlab.freedesktop.org/drm/intel/issues/180
  [i915#1825]: https://gitlab.freedesktop.org/drm/intel/issues/1825
  [i915#1937]: https://gitlab.freedesktop.org/drm/intel/issues/1937
  [i915#2346]: https://gitlab.freedesktop.org/drm/intel/issues/2346
  [i915#2527]: https://gitlab.freedesktop.org/drm/intel/issues/2527
  [i915#2672]: https://gitlab.freedesktop.org/drm/intel/issues/2672
  [i915#2681]: https://gitlab.freedesktop.org/drm/intel/issues/2681
  [i915#2842]: https://gitlab.freedesktop.org/drm/intel/issues/2842
  [i915#3023]: https://gitlab.freedesktop.org/drm/intel/issues/3023
  [i915#3281]: https://gitlab.freedesktop.org/drm/intel/issues/3281
  [i915#3282]: https://gitlab.freedesktop.org/drm/intel/issues/3282
  [i915#3297]: https://gitlab.freedesktop.org/drm/intel/issues/3297
  [i915#3323]: https://gitlab.freedesktop.org/drm/intel/issues/3323
  [i915#3555]: https://gitlab.freedesktop.org/drm/intel/issues/3555
  [i915#3591]: https://gitlab.freedesktop.org/drm/intel/issues/3591
  [i915#3638]: https://gitlab.freedesktop.org/drm/intel/issues/3638
  [i915#3689]: https://gitlab.freedesktop.org/drm/intel/issues/3689
  [i915#3734]: https://gitlab.freedesktop.org/drm/intel/issues/3734
  [i915#3886]: https://gitlab.freedesktop.org/drm/intel/issues/3886
  [i915#3955]: https://gitlab.freedesktop.org/drm/intel/issues/3955
  [i915#4070]: https://gitlab.freedesktop.org/drm/intel/issues/4070
  [i915#4078]: https://gitlab.freedesktop.org/drm/intel/issues/4078
  [i915#4528]: https://gitlab.freedesktop.org/drm/intel/issues/4528
  [i915#4579]: https://gitlab.freedesktop.org/drm/intel/issues/4579
  [i915#4613]: https://gitlab.freedesktop.org/drm/intel/issues/4613
  [i915#4816]: https://gitlab.freedesktop.org/drm/intel/issues/4816
  [i915#4833]: https://gitlab.freedesktop.org/drm/intel/issues/4833
  [i915#5176]: https://gitlab.freedesktop.org/drm/intel/issues/5176
  [i915#5235]: https://gitlab.freedesktop.org/drm/intel/issues/5235
  [i915#5286]: https://gitlab.freedesktop.org/drm/intel/issues/5286
  [i915#533]: https://gitlab.freedesktop.org/drm/intel/issues/533
  [i915#5334]: https://gitlab.freedesktop.org/drm/intel/issues/5334
  [i915#5354]: https://gitlab.freedesktop.org/drm/intel/issues/5354
  [i915#5493]: https://gitlab.freedesktop.org/drm/intel/issues/5493
  [i915#5566]: https://gitlab.freedesktop.org/drm/intel/issues/5566
  [i915#5784]: https://gitlab.freedesktop.org/drm/intel/issues/5784
  [i915#6095]: https://gitlab.freedesktop.org/drm/intel/issues/6095
  [i915#6268]: https://gitlab.freedesktop.org/drm/intel/issues/6268
  [i915#6524]: https://gitlab.freedesktop.org/drm/intel/issues/6524
  [i915#658]: https://gitlab.freedesktop.org/drm/intel/issues/658
  [i915#6590]: https://gitlab.freedesktop.org/drm/intel/issues/6590
  [i915#6768]: https://gitlab.freedesktop.org/drm/intel/issues/6768
  [i915#7711]: https://gitlab.freedesktop.org/drm/intel/issues/7711
  [i915#7742]: https://gitlab.freedesktop.org/drm/intel/issues/7742
  [i915#7828]: https://gitlab.freedesktop.org/drm/intel/issues/7828
  [i915#79]: https://gitlab.freedesktop.org/drm/intel/issues/79
  [i915#8011]: https://gitlab.freedesktop.org/drm/intel/issues/8011
  [i915#8178]: https://gitlab.freedesktop.org/drm/intel/issues/8178
  [i915#8292]: https://gitlab.freedesktop.org/drm/intel/issues/8292
  [i915#8311]: https://gitlab.freedesktop.org/drm/intel/issues/8311
  [i915#8347]: https://gitlab.freedesktop.org/drm/intel/issues/8347


Build changes
-------------

  * Linux: CI_DRM_13246 -> Patchwork_118898v2

  CI-20190529: 20190529
  CI_DRM_13246: 140b8a06031370fc0cf0ef5a5ca8cca0a4083951 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_7321: f52cfd53f353fdaca537c810fbc35e09ffd07345 @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_118898v2: 140b8a06031370fc0cf0ef5a5ca8cca0a4083951 @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_118898v2/index.html

[-- Attachment #2: Type: text/html, Size: 15586 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
@ 2023-06-06  3:00 Zhanjun Dong
  2023-06-06 17:53 ` John Harrison
  2023-06-07 13:57 ` Andrzej Hajda
  0 siblings, 2 replies; 12+ messages in thread
From: Zhanjun Dong @ 2023-06-06  3:00 UTC (permalink / raw)
  To: intel-gfx, dri-devel

This attemps to avoid circular locing dependency between flush delayed work and intel_gt_reset.

WARNING: possible circular locking dependency detected
6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
------------------------------------------------------
kms_pipe_crc_ba/6415 is trying to acquire lock:
ffff88813e6cc640 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: __flush_work+0x42/0x530

but task is already holding lock:
ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (&gt->reset.mutex){+.+.}-{3:3}:
        lock_acquire+0xd8/0x2d0
        i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
        intel_gt_init_reset+0x65/0x80 [i915]
        intel_gt_common_init_early+0xe1/0x170 [i915]
        intel_root_gt_init_early+0x48/0x60 [i915]
        i915_driver_probe+0x671/0xcb0 [i915]
        i915_pci_probe+0xdc/0x210 [i915]
        pci_device_probe+0x95/0x120
        really_probe+0x164/0x3c0
        __driver_probe_device+0x73/0x160
        driver_probe_device+0x19/0xa0
        __driver_attach+0xb6/0x180
        bus_for_each_dev+0x77/0xd0
        bus_add_driver+0x114/0x210
        driver_register+0x5b/0x110
        __pfx_vgem_open+0x3/0x10 [vgem]
        do_one_initcall+0x57/0x270
        do_init_module+0x5f/0x220
        load_module+0x1ca4/0x1f00
        __do_sys_finit_module+0xb4/0x130
        do_syscall_64+0x3c/0x90
        entry_SYSCALL_64_after_hwframe+0x72/0xdc

-> #2 (fs_reclaim){+.+.}-{0:0}:
        lock_acquire+0xd8/0x2d0
        fs_reclaim_acquire+0xac/0xe0
        kmem_cache_alloc+0x32/0x260
        i915_vma_instance+0xb2/0xc60 [i915]
        i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
        vm_fault_gtt+0x22d/0xf60 [i915]
        __do_fault+0x2f/0x1d0
        do_pte_missing+0x4a/0xd20
        __handle_mm_fault+0x5b0/0x790
        handle_mm_fault+0xa2/0x230
        do_user_addr_fault+0x3ea/0xa10
        exc_page_fault+0x68/0x1a0
        asm_exc_page_fault+0x26/0x30

-> #1 (&gt->reset.backoff_srcu){++++}-{0:0}:
        lock_acquire+0xd8/0x2d0
        _intel_gt_reset_lock+0x57/0x330 [i915]
        guc_timestamp_ping+0x35/0x130 [i915]
        process_one_work+0x250/0x510
        worker_thread+0x4f/0x3a0
        kthread+0xff/0x130
        ret_from_fork+0x29/0x50

-> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
        check_prev_add+0x90/0xc60
        __lock_acquire+0x1998/0x2590
        lock_acquire+0xd8/0x2d0
        __flush_work+0x74/0x530
        __cancel_work_timer+0x14f/0x1f0
        intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
        intel_uc_reset_prepare+0x9c/0x120 [i915]
        reset_prepare+0x21/0x60 [i915]
        intel_gt_reset+0x1dd/0x470 [i915]
        intel_gt_reset_global+0xfb/0x170 [i915]
        intel_gt_handle_error+0x368/0x420 [i915]
        intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
        i915_wedged_set+0x29/0x40 [i915]
        simple_attr_write_xsigned.constprop.0+0xb4/0x110
        full_proxy_write+0x52/0x80
        vfs_write+0xc5/0x4f0
        ksys_write+0x64/0xe0
        do_syscall_64+0x3c/0x90
        entry_SYSCALL_64_after_hwframe+0x72/0xdc

other info that might help us debug this:
 Chain exists of:
  (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> &gt->reset.mutex
  Possible unsafe locking scenario:
        CPU0                    CPU1
        ----                    ----
   lock(&gt->reset.mutex);
                                lock(fs_reclaim);
                                lock(&gt->reset.mutex);
   lock((work_completion)(&(&guc->timestamp.work)->work));

 *** DEADLOCK ***
 3 locks held by kms_pipe_crc_ba/6415:
  #0: ffff888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
  #1: ffff888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: simple_attr_write_xsigned.constprop.0+0x47/0x110
  #2: ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]

Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index a0e3ef1c65d2..22390704542e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1359,7 +1359,7 @@ static void guc_enable_busyness_worker(struct intel_guc *guc)
 
 static void guc_cancel_busyness_worker(struct intel_guc *guc)
 {
-	cancel_delayed_work_sync(&guc->timestamp.work);
+	cancel_delayed_work(&guc->timestamp.work);
 }
 
 static void __reset_guc_busyness_stats(struct intel_guc *guc)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
  2023-06-06  3:00 [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset Zhanjun Dong
@ 2023-06-06 17:53 ` John Harrison
  2023-06-07 17:30   ` John Harrison
  2023-06-07 13:57 ` Andrzej Hajda
  1 sibling, 1 reply; 12+ messages in thread
From: John Harrison @ 2023-06-06 17:53 UTC (permalink / raw)
  To: Zhanjun Dong, intel-gfx, dri-devel

On 6/5/2023 20:00, Zhanjun Dong wrote:
> This attemps to avoid circular locing dependency between flush delayed work and intel_gt_reset.
locing -> locking


>
> WARNING: possible circular locking dependency detected
> 6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
> ------------------------------------------------------
> kms_pipe_crc_ba/6415 is trying to acquire lock:
> ffff88813e6cc640 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: __flush_work+0x42/0x530
>
> but task is already holding lock:
> ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
>
> which lock already depends on the new lock.
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (&gt->reset.mutex){+.+.}-{3:3}:
>          lock_acquire+0xd8/0x2d0
>          i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
>          intel_gt_init_reset+0x65/0x80 [i915]
>          intel_gt_common_init_early+0xe1/0x170 [i915]
>          intel_root_gt_init_early+0x48/0x60 [i915]
>          i915_driver_probe+0x671/0xcb0 [i915]
>          i915_pci_probe+0xdc/0x210 [i915]
>          pci_device_probe+0x95/0x120
>          really_probe+0x164/0x3c0
>          __driver_probe_device+0x73/0x160
>          driver_probe_device+0x19/0xa0
>          __driver_attach+0xb6/0x180
>          bus_for_each_dev+0x77/0xd0
>          bus_add_driver+0x114/0x210
>          driver_register+0x5b/0x110
>          __pfx_vgem_open+0x3/0x10 [vgem]
>          do_one_initcall+0x57/0x270
>          do_init_module+0x5f/0x220
>          load_module+0x1ca4/0x1f00
>          __do_sys_finit_module+0xb4/0x130
>          do_syscall_64+0x3c/0x90
>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
> -> #2 (fs_reclaim){+.+.}-{0:0}:
>          lock_acquire+0xd8/0x2d0
>          fs_reclaim_acquire+0xac/0xe0
>          kmem_cache_alloc+0x32/0x260
>          i915_vma_instance+0xb2/0xc60 [i915]
>          i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
>          vm_fault_gtt+0x22d/0xf60 [i915]
>          __do_fault+0x2f/0x1d0
>          do_pte_missing+0x4a/0xd20
>          __handle_mm_fault+0x5b0/0x790
>          handle_mm_fault+0xa2/0x230
>          do_user_addr_fault+0x3ea/0xa10
>          exc_page_fault+0x68/0x1a0
>          asm_exc_page_fault+0x26/0x30
>
> -> #1 (&gt->reset.backoff_srcu){++++}-{0:0}:
>          lock_acquire+0xd8/0x2d0
>          _intel_gt_reset_lock+0x57/0x330 [i915]
>          guc_timestamp_ping+0x35/0x130 [i915]
>          process_one_work+0x250/0x510
>          worker_thread+0x4f/0x3a0
>          kthread+0xff/0x130
>          ret_from_fork+0x29/0x50
>
> -> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
>          check_prev_add+0x90/0xc60
>          __lock_acquire+0x1998/0x2590
>          lock_acquire+0xd8/0x2d0
>          __flush_work+0x74/0x530
>          __cancel_work_timer+0x14f/0x1f0
>          intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
>          intel_uc_reset_prepare+0x9c/0x120 [i915]
>          reset_prepare+0x21/0x60 [i915]
>          intel_gt_reset+0x1dd/0x470 [i915]
>          intel_gt_reset_global+0xfb/0x170 [i915]
>          intel_gt_handle_error+0x368/0x420 [i915]
>          intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
>          i915_wedged_set+0x29/0x40 [i915]
>          simple_attr_write_xsigned.constprop.0+0xb4/0x110
>          full_proxy_write+0x52/0x80
>          vfs_write+0xc5/0x4f0
>          ksys_write+0x64/0xe0
>          do_syscall_64+0x3c/0x90
>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
> other info that might help us debug this:
>   Chain exists of:
>    (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> &gt->reset.mutex
>    Possible unsafe locking scenario:
>          CPU0                    CPU1
>          ----                    ----
>     lock(&gt->reset.mutex);
>                                  lock(fs_reclaim);
>                                  lock(&gt->reset.mutex);
>     lock((work_completion)(&(&guc->timestamp.work)->work));
>
>   *** DEADLOCK ***
>   3 locks held by kms_pipe_crc_ba/6415:
>    #0: ffff888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
>    #1: ffff888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: simple_attr_write_xsigned.constprop.0+0x47/0x110
>    #2: ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
>
> Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index a0e3ef1c65d2..22390704542e 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1359,7 +1359,7 @@ static void guc_enable_busyness_worker(struct intel_guc *guc)
>   
>   static void guc_cancel_busyness_worker(struct intel_guc *guc)
>   {
> -	cancel_delayed_work_sync(&guc->timestamp.work);
> +	cancel_delayed_work(&guc->timestamp.work);
I think it is worth adding a comment here to explain that it is safe to 
call the non _sync variant (because of the trylock code in the worker 
itself) and that the _sync variant hits circular mutex lock issues.

John.


>   }
>   
>   static void __reset_guc_busyness_stats(struct intel_guc *guc)


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
  2023-06-06 17:53 ` John Harrison
@ 2023-06-07 17:30   ` John Harrison
  0 siblings, 0 replies; 12+ messages in thread
From: John Harrison @ 2023-06-07 17:30 UTC (permalink / raw)
  To: Zhanjun Dong, intel-gfx, dri-devel

On 6/6/2023 10:53, John Harrison wrote:
> On 6/5/2023 20:00, Zhanjun Dong wrote:
>> This attemps to avoid circular locing dependency between flush 
>> delayed work and intel_gt_reset.
> locing -> locking
>
>
>>
>> WARNING: possible circular locking dependency detected
>> 6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
>> ------------------------------------------------------
>> kms_pipe_crc_ba/6415 is trying to acquire lock:
>> ffff88813e6cc640 
>> ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: 
>> __flush_work+0x42/0x530
>>
>> but task is already holding lock:
>> ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: 
>> intel_gt_reset+0x19e/0x470 [i915]
>>
>> which lock already depends on the new lock.
>>
>> the existing dependency chain (in reverse order) is:
>>
>> -> #3 (&gt->reset.mutex){+.+.}-{3:3}:
>>          lock_acquire+0xd8/0x2d0
>>          i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
>>          intel_gt_init_reset+0x65/0x80 [i915]
>>          intel_gt_common_init_early+0xe1/0x170 [i915]
>>          intel_root_gt_init_early+0x48/0x60 [i915]
>>          i915_driver_probe+0x671/0xcb0 [i915]
>>          i915_pci_probe+0xdc/0x210 [i915]
>>          pci_device_probe+0x95/0x120
>>          really_probe+0x164/0x3c0
>>          __driver_probe_device+0x73/0x160
>>          driver_probe_device+0x19/0xa0
>>          __driver_attach+0xb6/0x180
>>          bus_for_each_dev+0x77/0xd0
>>          bus_add_driver+0x114/0x210
>>          driver_register+0x5b/0x110
>>          __pfx_vgem_open+0x3/0x10 [vgem]
>>          do_one_initcall+0x57/0x270
>>          do_init_module+0x5f/0x220
>>          load_module+0x1ca4/0x1f00
>>          __do_sys_finit_module+0xb4/0x130
>>          do_syscall_64+0x3c/0x90
>>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
>>
>> -> #2 (fs_reclaim){+.+.}-{0:0}:
>>          lock_acquire+0xd8/0x2d0
>>          fs_reclaim_acquire+0xac/0xe0
>>          kmem_cache_alloc+0x32/0x260
>>          i915_vma_instance+0xb2/0xc60 [i915]
>>          i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
>>          vm_fault_gtt+0x22d/0xf60 [i915]
>>          __do_fault+0x2f/0x1d0
>>          do_pte_missing+0x4a/0xd20
>>          __handle_mm_fault+0x5b0/0x790
>>          handle_mm_fault+0xa2/0x230
>>          do_user_addr_fault+0x3ea/0xa10
>>          exc_page_fault+0x68/0x1a0
>>          asm_exc_page_fault+0x26/0x30
>>
>> -> #1 (&gt->reset.backoff_srcu){++++}-{0:0}:
>>          lock_acquire+0xd8/0x2d0
>>          _intel_gt_reset_lock+0x57/0x330 [i915]
>>          guc_timestamp_ping+0x35/0x130 [i915]
>>          process_one_work+0x250/0x510
>>          worker_thread+0x4f/0x3a0
>>          kthread+0xff/0x130
>>          ret_from_fork+0x29/0x50
>>
>> -> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
>>          check_prev_add+0x90/0xc60
>>          __lock_acquire+0x1998/0x2590
>>          lock_acquire+0xd8/0x2d0
>>          __flush_work+0x74/0x530
>>          __cancel_work_timer+0x14f/0x1f0
>>          intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
>>          intel_uc_reset_prepare+0x9c/0x120 [i915]
>>          reset_prepare+0x21/0x60 [i915]
>>          intel_gt_reset+0x1dd/0x470 [i915]
>>          intel_gt_reset_global+0xfb/0x170 [i915]
>>          intel_gt_handle_error+0x368/0x420 [i915]
>>          intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
>>          i915_wedged_set+0x29/0x40 [i915]
>>          simple_attr_write_xsigned.constprop.0+0xb4/0x110
>>          full_proxy_write+0x52/0x80
>>          vfs_write+0xc5/0x4f0
>>          ksys_write+0x64/0xe0
>>          do_syscall_64+0x3c/0x90
>>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
>>
>> other info that might help us debug this:
>>   Chain exists of:
>> (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> 
>> &gt->reset.mutex
>>    Possible unsafe locking scenario:
>>          CPU0                    CPU1
>>          ----                    ----
>>     lock(&gt->reset.mutex);
>>                                  lock(fs_reclaim);
>>                                  lock(&gt->reset.mutex);
>> lock((work_completion)(&(&guc->timestamp.work)->work));
>>
>>   *** DEADLOCK ***
>>   3 locks held by kms_pipe_crc_ba/6415:
>>    #0: ffff888101541430 (sb_writers#15){.+.+}-{0:0}, at: 
>> ksys_write+0x64/0xe0
>>    #1: ffff888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: 
>> simple_attr_write_xsigned.constprop.0+0x47/0x110
>>    #2: ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: 
>> intel_gt_reset+0x19e/0x470 [i915]
>>
>> Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>
>> ---
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> index a0e3ef1c65d2..22390704542e 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> @@ -1359,7 +1359,7 @@ static void guc_enable_busyness_worker(struct 
>> intel_guc *guc)
>>     static void guc_cancel_busyness_worker(struct intel_guc *guc)
>>   {
>> -    cancel_delayed_work_sync(&guc->timestamp.work);
>> +    cancel_delayed_work(&guc->timestamp.work);
> I think it is worth adding a comment here to explain that it is safe 
> to call the non _sync variant (because of the trylock code in the 
> worker itself) and that the _sync variant hits circular mutex lock 
> issues.
>
To record the notes from direct discussion... This function needs to 
take a 'bool sync' flag. The park/fini code paths still need to do a 
synchronous flush to ensure that the worker is not still running during 
suspend or shutdown. Only the reset path should do the asynchronous cancel.

John.

> John.
>
>
>>   }
>>     static void __reset_guc_busyness_stats(struct intel_guc *guc)
>


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset
  2023-06-06  3:00 [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset Zhanjun Dong
  2023-06-06 17:53 ` John Harrison
@ 2023-06-07 13:57 ` Andrzej Hajda
  1 sibling, 0 replies; 12+ messages in thread
From: Andrzej Hajda @ 2023-06-07 13:57 UTC (permalink / raw)
  To: Zhanjun Dong, intel-gfx, dri-devel

On 06.06.2023 05:00, Zhanjun Dong wrote:
> This attemps to avoid circular locing dependency between flush delayed work and intel_gt_reset.
> 
> WARNING: possible circular locking dependency detected
> 6.4.0-rc1-drmtip_1340-g31e3463b0edb+ #1 Not tainted
> ------------------------------------------------------
> kms_pipe_crc_ba/6415 is trying to acquire lock:
> ffff88813e6cc640 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}, at: __flush_work+0x42/0x530
> 
> but task is already holding lock:
> ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
> 
> which lock already depends on the new lock.
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #3 (&gt->reset.mutex){+.+.}-{3:3}:
>          lock_acquire+0xd8/0x2d0
>          i915_gem_shrinker_taints_mutex+0x31/0x50 [i915]
>          intel_gt_init_reset+0x65/0x80 [i915]
>          intel_gt_common_init_early+0xe1/0x170 [i915]
>          intel_root_gt_init_early+0x48/0x60 [i915]
>          i915_driver_probe+0x671/0xcb0 [i915]
>          i915_pci_probe+0xdc/0x210 [i915]
>          pci_device_probe+0x95/0x120
>          really_probe+0x164/0x3c0
>          __driver_probe_device+0x73/0x160
>          driver_probe_device+0x19/0xa0
>          __driver_attach+0xb6/0x180
>          bus_for_each_dev+0x77/0xd0
>          bus_add_driver+0x114/0x210
>          driver_register+0x5b/0x110
>          __pfx_vgem_open+0x3/0x10 [vgem]
>          do_one_initcall+0x57/0x270
>          do_init_module+0x5f/0x220
>          load_module+0x1ca4/0x1f00
>          __do_sys_finit_module+0xb4/0x130
>          do_syscall_64+0x3c/0x90
>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
> 
> -> #2 (fs_reclaim){+.+.}-{0:0}:
>          lock_acquire+0xd8/0x2d0
>          fs_reclaim_acquire+0xac/0xe0
>          kmem_cache_alloc+0x32/0x260
>          i915_vma_instance+0xb2/0xc60 [i915]
>          i915_gem_object_ggtt_pin_ww+0x175/0x370 [i915]
>          vm_fault_gtt+0x22d/0xf60 [i915]
>          __do_fault+0x2f/0x1d0
>          do_pte_missing+0x4a/0xd20
>          __handle_mm_fault+0x5b0/0x790
>          handle_mm_fault+0xa2/0x230
>          do_user_addr_fault+0x3ea/0xa10
>          exc_page_fault+0x68/0x1a0
>          asm_exc_page_fault+0x26/0x30
> 
> -> #1 (&gt->reset.backoff_srcu){++++}-{0:0}:
>          lock_acquire+0xd8/0x2d0
>          _intel_gt_reset_lock+0x57/0x330 [i915]
>          guc_timestamp_ping+0x35/0x130 [i915]
>          process_one_work+0x250/0x510
>          worker_thread+0x4f/0x3a0
>          kthread+0xff/0x130
>          ret_from_fork+0x29/0x50
> 
> -> #0 ((work_completion)(&(&guc->timestamp.work)->work)){+.+.}-{0:0}:
>          check_prev_add+0x90/0xc60
>          __lock_acquire+0x1998/0x2590
>          lock_acquire+0xd8/0x2d0
>          __flush_work+0x74/0x530
>          __cancel_work_timer+0x14f/0x1f0
>          intel_guc_submission_reset_prepare+0x81/0x4b0 [i915]
>          intel_uc_reset_prepare+0x9c/0x120 [i915]
>          reset_prepare+0x21/0x60 [i915]
>          intel_gt_reset+0x1dd/0x470 [i915]
>          intel_gt_reset_global+0xfb/0x170 [i915]
>          intel_gt_handle_error+0x368/0x420 [i915]
>          intel_gt_debugfs_reset_store+0x5c/0xc0 [i915]
>          i915_wedged_set+0x29/0x40 [i915]
>          simple_attr_write_xsigned.constprop.0+0xb4/0x110
>          full_proxy_write+0x52/0x80
>          vfs_write+0xc5/0x4f0
>          ksys_write+0x64/0xe0
>          do_syscall_64+0x3c/0x90
>          entry_SYSCALL_64_after_hwframe+0x72/0xdc
> 
> other info that might help us debug this:
>   Chain exists of:
>    (work_completion)(&(&guc->timestamp.work)->work) --> fs_reclaim --> &gt->reset.mutex
>    Possible unsafe locking scenario:
>          CPU0                    CPU1
>          ----                    ----
>     lock(&gt->reset.mutex);
>                                  lock(fs_reclaim);
>                                  lock(&gt->reset.mutex);
>     lock((work_completion)(&(&guc->timestamp.work)->work));
> 
>   *** DEADLOCK ***
>   3 locks held by kms_pipe_crc_ba/6415:
>    #0: ffff888101541430 (sb_writers#15){.+.+}-{0:0}, at: ksys_write+0x64/0xe0
>    #1: ffff888136c7eab8 (&attr->mutex){+.+.}-{3:3}, at: simple_attr_write_xsigned.constprop.0+0x47/0x110
>    #2: ffff88813e6cce90 (&gt->reset.mutex){+.+.}-{3:3}, at: intel_gt_reset+0x19e/0x470 [i915]
> 
> Signed-off-by: Zhanjun Dong <zhanjun.dong@intel.com>

This unlocks multiple machines on CI, thx.

Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com>

Regards
Andrzej

> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index a0e3ef1c65d2..22390704542e 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1359,7 +1359,7 @@ static void guc_enable_busyness_worker(struct intel_guc *guc)
>   
>   static void guc_cancel_busyness_worker(struct intel_guc *guc)
>   {
> -	cancel_delayed_work_sync(&guc->timestamp.work);
> +	cancel_delayed_work(&guc->timestamp.work);
>   }
>   
>   static void __reset_guc_busyness_stats(struct intel_guc *guc)


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2023-06-08 22:01 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-06-07 19:03 [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset Zhanjun Dong
2023-06-07 20:17 ` John Harrison
2023-06-08 21:31   ` Dong, Zhanjun
2023-06-08  0:19 ` Andi Shyti
2023-06-08 21:17   ` Dong, Zhanjun
2023-06-08  2:34 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Avoid circular locking dependency when flush delayed work on gt reset (rev2) Patchwork
2023-06-08  2:57 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2023-06-08 22:01 ` [Intel-gfx] ✗ Fi.CI.IGT: failure " Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2023-06-06  3:00 [Intel-gfx] [PATCH] drm/i915: Avoid circular locking dependency when flush delayed work on gt reset Zhanjun Dong
2023-06-06 17:53 ` John Harrison
2023-06-07 17:30   ` John Harrison
2023-06-07 13:57 ` Andrzej Hajda

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox