public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] xfs: make xfs allocation workqueue per-mount, and high priority
@ 2015-01-09 18:08 Eric Sandeen
  2015-01-09 18:10 ` [PATCH 1/2] xfs: make global xfsalloc workqueue per-mount Eric Sandeen
  2015-01-09 18:12 ` [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority Eric Sandeen
  0 siblings, 2 replies; 25+ messages in thread
From: Eric Sandeen @ 2015-01-09 18:08 UTC (permalink / raw)
  To: xfs-oss

See subsequent patch commit logs for explanation...

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 1/2] xfs: make global xfsalloc workqueue per-mount
  2015-01-09 18:08 [PATCH 0/2] xfs: make xfs allocation workqueue per-mount, and high priority Eric Sandeen
@ 2015-01-09 18:10 ` Eric Sandeen
  2015-01-12 15:35   ` Brian Foster
  2015-01-09 18:12 ` [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority Eric Sandeen
  1 sibling, 1 reply; 25+ messages in thread
From: Eric Sandeen @ 2015-01-09 18:10 UTC (permalink / raw)
  To: Eric Sandeen, xfs-oss

The xfsalloc workqueue is the last remaining global work
queue, something we've been moving away from in general.
Make this one per-mount like every other workqueue.

This also renames it from "xfsalloc" to "xfs-alloc" to
match the naming convention.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a6fbf44..72d28d1 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -36,8 +36,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
 
-struct workqueue_struct *xfs_alloc_wq;
-
 #define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
 
 #define	XFSA_FIXUP_BNO_OK	1
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d1b4b6a..39933a1 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -24,8 +24,6 @@ struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
 
-extern struct workqueue_struct *xfs_alloc_wq;
-
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
  */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 81cad43..a8a1369 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2574,7 +2574,7 @@ xfs_btree_split(
 	args.done = &done;
 	args.kswapd = current_is_kswapd();
 	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
-	queue_work(xfs_alloc_wq, &args.work);
+	queue_work(cur->bc_mp->m_alloc_workqueue, &args.work);
 	wait_for_completion(&done);
 	destroy_work_on_stack(&args.work);
 	return args.result;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 22ccf69..7d57a5f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -175,6 +175,7 @@ typedef struct xfs_mount {
 	struct workqueue_struct	*m_reclaim_workqueue;
 	struct workqueue_struct	*m_log_workqueue;
 	struct workqueue_struct *m_eofblocks_workqueue;
+	struct workqueue_struct *m_alloc_workqueue;
 } xfs_mount_t;
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 19cbda1..e5bdca9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -873,8 +873,15 @@ xfs_init_mount_workqueues(
 	if (!mp->m_eofblocks_workqueue)
 		goto out_destroy_log;
 
+	mp->m_alloc_workqueue = alloc_workqueue("xfs-alloc/%s",
+			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+	if (!mp->m_alloc_workqueue)
+		goto out_destroy_eofblocks;
+
 	return 0;
 
+out_destroy_eofblocks:
+	destroy_workqueue(mp->m_eofblocks_workqueue);
 out_destroy_log:
 	destroy_workqueue(mp->m_log_workqueue);
 out_destroy_reclaim:
@@ -895,6 +902,7 @@ STATIC void
 xfs_destroy_mount_workqueues(
 	struct xfs_mount	*mp)
 {
+	destroy_workqueue(mp->m_alloc_workqueue);
 	destroy_workqueue(mp->m_eofblocks_workqueue);
 	destroy_workqueue(mp->m_log_workqueue);
 	destroy_workqueue(mp->m_reclaim_workqueue);
@@ -1717,29 +1725,6 @@ xfs_destroy_zones(void)
 }
 
 STATIC int __init
-xfs_init_workqueues(void)
-{
-	/*
-	 * The allocation workqueue can be used in memory reclaim situations
-	 * (writepage path), and parallelism is only limited by the number of
-	 * AGs in all the filesystems mounted. Hence use the default large
-	 * max_active value for this workqueue.
-	 */
-	xfs_alloc_wq = alloc_workqueue("xfsalloc",
-			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
-	if (!xfs_alloc_wq)
-		return -ENOMEM;
-
-	return 0;
-}
-
-STATIC void
-xfs_destroy_workqueues(void)
-{
-	destroy_workqueue(xfs_alloc_wq);
-}
-
-STATIC int __init
 init_xfs_fs(void)
 {
 	int			error;
@@ -1753,13 +1738,9 @@ init_xfs_fs(void)
 	if (error)
 		goto out;
 
-	error = xfs_init_workqueues();
-	if (error)
-		goto out_destroy_zones;
-
 	error = xfs_mru_cache_init();
 	if (error)
-		goto out_destroy_wq;
+		goto out_destroy_zones;
 
 	error = xfs_buf_init();
 	if (error)
@@ -1776,7 +1757,7 @@ init_xfs_fs(void)
 	xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
 	if (!xfs_kset) {
 		error = -ENOMEM;
-		goto out_sysctl_unregister;;
+		goto out_sysctl_unregister;
 	}
 
 #ifdef DEBUG
@@ -1795,27 +1776,25 @@ init_xfs_fs(void)
 		goto out_qm_exit;
 	return 0;
 
- out_qm_exit:
+out_qm_exit:
 	xfs_qm_exit();
- out_remove_kobj:
+out_remove_kobj:
 #ifdef DEBUG
 	xfs_sysfs_del(&xfs_dbg_kobj);
- out_kset_unregister:
+out_kset_unregister:
 #endif
 	kset_unregister(xfs_kset);
- out_sysctl_unregister:
+out_sysctl_unregister:
 	xfs_sysctl_unregister();
- out_cleanup_procfs:
+out_cleanup_procfs:
 	xfs_cleanup_procfs();
- out_buf_terminate:
+out_buf_terminate:
 	xfs_buf_terminate();
- out_mru_cache_uninit:
+out_mru_cache_uninit:
 	xfs_mru_cache_uninit();
- out_destroy_wq:
-	xfs_destroy_workqueues();
- out_destroy_zones:
+out_destroy_zones:
 	xfs_destroy_zones();
- out:
+out:
 	return error;
 }
 
@@ -1832,7 +1811,6 @@ exit_xfs_fs(void)
 	xfs_cleanup_procfs();
 	xfs_buf_terminate();
 	xfs_mru_cache_uninit();
-	xfs_destroy_workqueues();
 	xfs_destroy_zones();
 }
 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-09 18:08 [PATCH 0/2] xfs: make xfs allocation workqueue per-mount, and high priority Eric Sandeen
  2015-01-09 18:10 ` [PATCH 1/2] xfs: make global xfsalloc workqueue per-mount Eric Sandeen
@ 2015-01-09 18:12 ` Eric Sandeen
  2015-01-09 18:23   ` Tejun Heo
  1 sibling, 1 reply; 25+ messages in thread
From: Eric Sandeen @ 2015-01-09 18:12 UTC (permalink / raw)
  To: Eric Sandeen, xfs-oss; +Cc: Tejun Heo

I had a case reported where a system under high stress
got deadlocked.  A btree split was handed off to the xfs
allocation workqueue, and it is holding the xfs_ilock
exclusively.  However, other xfs_end_io workers are
not running, because they are waiting for that lock.
As a result, the xfs allocation workqueue never gets
run, and everything grinds to a halt.

To be honest, it's not clear to me how the workqueue
subsystem manages this sort of thing.  But in testing,
making the allocation workqueue high priority so that
it gets added to the front of the pending work list,
resolves the problem.  We did similar things for
the xfs-log workqueues, for similar reasons.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---

(slight whitespace shift is to fit in 80 cols, sorry!)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e5bdca9..9c549e1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -874,7 +874,7 @@ xfs_init_mount_workqueues(
 		goto out_destroy_log;
 
 	mp->m_alloc_workqueue = alloc_workqueue("xfs-alloc/%s",
-			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+		       WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
 	if (!mp->m_alloc_workqueue)
 		goto out_destroy_eofblocks;
 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-09 18:12 ` [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority Eric Sandeen
@ 2015-01-09 18:23   ` Tejun Heo
  2015-01-09 20:36     ` Eric Sandeen
  2015-01-09 23:28     ` [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority Dave Chinner
  0 siblings, 2 replies; 25+ messages in thread
From: Tejun Heo @ 2015-01-09 18:23 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Eric Sandeen, xfs-oss

Hello, Eric.

On Fri, Jan 09, 2015 at 12:12:04PM -0600, Eric Sandeen wrote:
> I had a case reported where a system under high stress
> got deadlocked.  A btree split was handed off to the xfs
> allocation workqueue, and it is holding the xfs_ilock
> exclusively.  However, other xfs_end_io workers are
> not running, because they are waiting for that lock.
> As a result, the xfs allocation workqueue never gets
> run, and everything grinds to a halt.

I'm having a difficult time following the exact deadlock.  Can you
please elaborate in more detail?

> To be honest, it's not clear to me how the workqueue
> subsystem manages this sort of thing.  But in testing,
> making the allocation workqueue high priority so that
> it gets added to the front of the pending work list,
> resolves the problem.  We did similar things for
> the xfs-log workqueues, for similar reasons.

Ummm, this feels pretty voodoo.  In practice, it'd change the order of
things being executed and may make certain deadlocks unlikely enough,
but I don't think this can be a proper fix.

> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index e5bdca9..9c549e1 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -874,7 +874,7 @@ xfs_init_mount_workqueues(
>  		goto out_destroy_log;
>  
>  	mp->m_alloc_workqueue = alloc_workqueue("xfs-alloc/%s",
> -			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
> +		       WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);

And this at least deserves way more explanation.

>  	if (!mp->m_alloc_workqueue)
>  		goto out_destroy_eofblocks;
>  
> 

Thanks.

-- 
tejun

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-09 18:23   ` Tejun Heo
@ 2015-01-09 20:36     ` Eric Sandeen
  2015-01-10 19:28       ` Tejun Heo
  2015-01-09 23:28     ` [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority Dave Chinner
  1 sibling, 1 reply; 25+ messages in thread
From: Eric Sandeen @ 2015-01-09 20:36 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Eric Sandeen, xfs-oss

On 1/9/15 12:23 PM, Tejun Heo wrote:
> Hello, Eric.
> 
> On Fri, Jan 09, 2015 at 12:12:04PM -0600, Eric Sandeen wrote:
>> I had a case reported where a system under high stress
>> got deadlocked.  A btree split was handed off to the xfs
>> allocation workqueue, and it is holding the xfs_ilock
>> exclusively.  However, other xfs_end_io workers are
>> not running, because they are waiting for that lock.
>> As a result, the xfs allocation workqueue never gets
>> run, and everything grinds to a halt.
> 
> I'm having a difficult time following the exact deadlock.  Can you
> please elaborate in more detail?

Sure, sorry.  This is cut & paste from a bug, so hopefully not too disjoint.

And, FWIW, this is a RHEL7 kernel, so not most recent upstream.  I could
possibly ask the reporter to retest upstream; sadly they are using
proprietary tests.

There are many stuck threads; the oldest is this:

crash> bt 39292
PID: 39292  TASK: c000000038240000  CPU: 27  COMMAND: "aio-stress"
 #0 [c00000003ad02a40] __switch_to at c0000000000164d8
 #1 [c00000003ad02c10] __switch_to at c0000000000164d8
 #2 [c00000003ad02c70] __schedule at c000000000900200
 #3 [c00000003ad02ec0] schedule_timeout at c0000000008fd018
 #4 [c00000003ad02fc0] wait_for_completion at c000000000900e08
 #5 [c00000003ad03040] xfs_btree_split at d000000005c58ec0 [xfs]
 #6 [c00000003ad03100] xfs_btree_make_block_unfull at d000000005c5bd5c [xfs]
 #7 [c00000003ad03190] xfs_btree_insrec at d000000005c5c214 [xfs]
 #8 [c00000003ad03290] xfs_btree_insert at d000000005c5c51c [xfs]
 #9 [c00000003ad03320] xfs_bmap_add_extent_hole_real at d000000005c4a630 [xfs]
#10 [c00000003ad033d0] xfs_bmapi_write at d000000005c51f08 [xfs]
#11 [c00000003ad03570] xfs_iomap_write_direct at d000000005c268fc [xfs]
#12 [c00000003ad03660] __xfs_get_blocks at d000000005c0b2b8 [xfs]
#13 [c00000003ad03710] do_direct_IO at c00000000033f6c4
#14 [c00000003ad03830] __blockdev_direct_IO at c000000000341428
#15 [c00000003ad03a20] xfs_vm_direct_IO at d000000005c09d18 [xfs]
#16 [c00000003ad03af0] generic_file_direct_write at c00000000021d1c0
#17 [c00000003ad03ba0] xfs_file_dio_aio_write at d000000005cb3154 [xfs]
#18 [c00000003ad03c40] xfs_file_aio_write at d000000005c1beb0 [xfs]
#19 [c00000003ad03cc0] do_io_submit at c000000000359c00

And the xfsalloc / btree split worker is not on any active task.
Digging further, it's on a queue, but not started.

The work queue which contains it is this:

crash> list -H 0xc000000001cd7b18 -o work_struct.entry -s work_struct
c0000006182ceed8
struct work_struct {
  data = {
    counter = -4611686018397073147
  }, 
  entry = {
    next = 0xc0000006182c31f8, 
    prev = 0xc000000001cd7b18
  }, 
  func = 0xd000000005c0b570 <xfs_end_io>
}
c0000006182c31f0
struct work_struct {
  data = {
    counter = -4611686018397073147
  }, 
  entry = {
    next = 0xc0000006182c0750, 
    prev = 0xc0000006182ceee0
  }, 
  func = 0xd000000005c0b570 <xfs_end_io>
}
c0000006182c0748
struct work_struct {
  data = {
    counter = -4611686018397073147
  }, 
  entry = {
    next = 0xc00000003ad030a8, 
    prev = 0xc0000006182c31f8
  }, 
  func = 0xd000000005c0b570 <xfs_end_io>
}
c00000003ad030a0
struct work_struct {
  data = {
    counter = -4611686018397087739
  }, 
  entry = {
    next = 0xc0000006182c5e08, 
    prev = 0xc0000006182c0750
  }, 
  func = 0xd000000005c58ef0 <xfs_btree_split_worker>
}

...

so that's our stuck split worker, behind xfs_end_io's.

if we look at the oldest stuck aio-stress task waiting for the work item to complete, there is an inode in its stack frames:

crash> xfs_inode c0000000b1dca800

and we find that it has an i_lock held, with waiters:

  i_lock = {
    mr_lock = {
      count = -8589934591, 
      wait_lock = {
        raw_lock = {
          slock = 0
        }
      }, 
      wait_list = {
        next = 0xc00000005a003a20, 
        prev = 0xc00000003f483a20
      }
    }
  }, 

If we ask crash about those waiters, we see they are other kworker threads:

crash> kmem 0xc00000005a003a20
...
    PID: 50959
COMMAND: "kworker/27:2"
...
crash> kmem 0xc00000003f483a20
...
    PID: 29528
COMMAND: "kworker/27:10"
...

and pid 50959 is indeed the 2nd-to-last listed uninterruptable sleep pid.

crash> bt 50959
...
 #3 [c00000005a003a00] rwsem_down_write_failed at c000000000903224
 #4 [c00000005a003a90] down_write at c0000000008ff68c
 #5 [c00000005a003ac0] xfs_ilock at d000000005c78ea8 [xfs]
 #6 [c00000005a003b00] xfs_iomap_write_unwritten at d000000005c272dc [xfs]
 #7 [c00000005a003c20] xfs_end_io at d000000005c0b680 [xfs]
 #8 [c00000005a003c50] process_one_work at c0000000000ecadc

doing bt -FF on that yields:

 #5 [c00000005a003ac0] xfs_ilock at d000000005c78ea8 [xfs]
    c00000005a003ac0: [c00000005a003b00:kmalloc-16384] [c0000005e6fb2000:kmalloc-2048] 
    c00000005a003ad0: xfs_ilock+232    xfs_table_header+19848 
    c00000005a003ae0: 0000000000000000 0000000000006970 
    c00000005a003af0: 0000000000000008 [c0000000b1dca800:xfs_inode]

which is the same inode, c0000000b1dca800

The oldest stuck aio-stress process is (again) down this path:

crash> bt 39292
PID: 39292  TASK: c000000038240000  CPU: 27  COMMAND: "aio-stress"
...
 #4 [c00000003ad02fc0] wait_for_completion at c000000000900e08
 #5 [c00000003ad03040] xfs_btree_split at d000000005c58ec0 [xfs]
 #6 [c00000003ad03100] xfs_btree_make_block_unfull at d000000005c5bd5c [xfs]
 #7 [c00000003ad03190] xfs_btree_insrec at d000000005c5c214 [xfs]
 #8 [c00000003ad03290] xfs_btree_insert at d000000005c5c51c [xfs]
 #9 [c00000003ad03320] xfs_bmap_add_extent_hole_real at d000000005c4a630 [xfs]
#10 [c00000003ad033d0] xfs_bmapi_write at d000000005c51f08 [xfs]
#11 [c00000003ad03570] xfs_iomap_write_direct at d000000005c268fc [xfs]
#12 [c00000003ad03660] __xfs_get_blocks at d000000005c0b2b8 [xfs]
#13 [c00000003ad03710] do_direct_IO at c00000000033f6c4
#14 [c00000003ad03830] __blockdev_direct_IO at c000000000341428

and xfs_iomap_write_direct() takes the ilock exclusively.

        xfs_ilock(ip, XFS_ILOCK_EXCL);

before calling xfs_bmapi_write(), so it must be the holder.  Until this work item runs, everything else working on this inode is stuck, but it's not getting run, behind other items waiting for the lock it holds.

-Eric

>> To be honest, it's not clear to me how the workqueue
>> subsystem manages this sort of thing.  But in testing,
>> making the allocation workqueue high priority so that
>> it gets added to the front of the pending work list,
>> resolves the problem.  We did similar things for
>> the xfs-log workqueues, for similar reasons.
> 
> Ummm, this feel pretty voodoo.  In practice, it'd change the order of
> things being executed and may make certain deadlocks unlikely enough,
> but I don't think this can be a proper fix.
> 
>> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
>> index e5bdca9..9c549e1 100644
>> --- a/fs/xfs/xfs_super.c
>> +++ b/fs/xfs/xfs_super.c
>> @@ -874,7 +874,7 @@ xfs_init_mount_workqueues(
>>  		goto out_destroy_log;
>>  
>>  	mp->m_alloc_workqueue = alloc_workqueue("xfs-alloc/%s",
>> -			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
>> +		       WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
> 
> And this at least deserves way more explanation.
> 
>>  	if (!mp->m_alloc_workqueue)
>>  		goto out_destroy_eofblocks;
>>  
>>
> 
> Thanks.
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-09 18:23   ` Tejun Heo
  2015-01-09 20:36     ` Eric Sandeen
@ 2015-01-09 23:28     ` Dave Chinner
  2015-01-10 17:41       ` Tejun Heo
  1 sibling, 1 reply; 25+ messages in thread
From: Dave Chinner @ 2015-01-09 23:28 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Eric Sandeen, Eric Sandeen, xfs-oss

On Fri, Jan 09, 2015 at 01:23:10PM -0500, Tejun Heo wrote:
> Hello, Eric.
> 
> On Fri, Jan 09, 2015 at 12:12:04PM -0600, Eric Sandeen wrote:
> > I had a case reported where a system under high stress
> > got deadlocked.  A btree split was handed off to the xfs
> > allocation workqueue, and it is holding the xfs_ilock
> > exclusively.  However, other xfs_end_io workers are
> > not running, because they are waiting for that lock.
> > As a result, the xfs allocation workqueue never gets
> > run, and everything grinds to a halt.
> 
> I'm having a difficult time following the exact deadlock.  Can you
> please elaborate in more detail?

process A			kworker (1..N)
ilock(excl)
alloc
  queue work(allocwq)
    (work queued as no kworker
     threads available)
     				execute work from xfsbuf-wq
     				xfs_end_io
				  ilock(excl)
				    (blocks waiting on queued work)

No new kworkers are started, so the queue never makes progress,
we deadlock.

AFAICT, the only way we can get here is that we have N idle
kworkers, and N+M works get queued where the allocwq work is at the
tail end of the queue. This results in newly queued work not
kicking a new kworker thread as there are idle threads, and as
works are executed they are all for the xfsbuf-wq and blocking on the
ilock(excl).

We eventually get to the point where there are no more idle
kworkers, but we still have works queued, and progress is still
dependent on the queued works completing....

This is actually not an uncommon queuing occurrence, because we
can get storms of end-io works queued from batched IO completion
processing.

> > To be honest, it's not clear to me how the workqueue
> > subsystem manages this sort of thing.  But in testing,
> > making the allocation workqueue high priority so that
> > it gets added to the front of the pending work list,
> > resolves the problem.  We did similar things for
> > the xfs-log workqueues, for similar reasons.
> 
> Ummm, this feel pretty voodoo.  In practice, it'd change the order of
> things being executed and may make certain deadlocks unlikely enough,
> but I don't think this can be a proper fix.

Right, that's why Eric approached about this a few weeks ago asking
whether it could be fixed in the workqueue code.

As I've said before (in years gone by), we've got multiple levels of
priority needed for executing XFS works because of lock ordering
requirements. We *always* want the allocation workqueue work to run
before the end-io processing of the xfsbuf-wq and unwritten-wq
because of this lock inversion, just like we we always want the
xfsbufd to run before the unwritten-wq because unwritten extent
conversion may block waiting for metadata buffer IO to complete, and
we always want the the xfslog-wq works to run before all of them
because metadata buffer IO may get blocked waiting for buffers
pinned by the log to be unpinned for log Io completion...

We can't solve these dependencies in a sane manner with a single high
priority workqueue level, so we're stuck with hacking around the
worst of the problems for the moment.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-09 23:28     ` [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority Dave Chinner
@ 2015-01-10 17:41       ` Tejun Heo
  2015-01-12  3:30         ` Dave Chinner
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2015-01-10 17:41 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Eric Sandeen, Eric Sandeen, xfs-oss

Hello, Dave.

On Sat, Jan 10, 2015 at 10:28:15AM +1100, Dave Chinner wrote:
> process A			kworker (1..N)
> ilock(excl)
> alloc
>   queue work(allocwq)
>     (work queued as no kworker
>      threads available)
>      				execute work from xfsbuf-wq
>      				xfs_end_io
> 				  ilock(excl)
> 				    (blocks waiting on queued work)
> 
> No new kworkers are started, so the queue never makes progress,
> we deadlock.

But allocwq is a separate workqueue from xfsbuf-wq and should have its
own rescuer.  The work item queued by process on A is guaranteed to
make forward progress no matter what work items on xfsbuf-wq are
doing.  The deadlock as depicted above cannot happen.  A workqueue
with WQ_MEM_RECLAIM can deadlock iff an executing work item on the
workqueue deadlocks.

> AFAICT, the only way we can get here is that we have N idle
> kworkers, and N+M works get queued where the allocwq work is at the
> tail end of the queue. This results in newly queued work is not
> kicking a new kworker threadi as there are idle threads, and as
> works are executed the are all for the xfsbuf-wq and blocking on the
> ilock(excl).

The workqueue doesn't work that way.  Forward progress guarantee is
separate for each workqueue with WQ_MEM_RECLAIM set.  Each is
guaranteed to make forward progress independently of what others are
doing.

> We eventually get to the point where there are no more idle
> kworkers, but we still have works queued, and progress is still
> dependent the queued works completing....
> 
> This is actually not an uncommon queuing occurrence, because we
> can get storms of end-io works queued from batched IO completion
> processing.

And all these have nothing to with why the deadlock is happening.

> > Ummm, this feel pretty voodoo.  In practice, it'd change the order of
> > things being executed and may make certain deadlocks unlikely enough,
> > but I don't think this can be a proper fix.
> 
> Right, that's why Eric approached about this a few weeks ago asking
> whether it could be fixed in the workqueue code.
> 
> As I've said before (in years gone by), we've got multiple levels of
> priority needed for executing XFS works because of lock ordering
> requirements. We *always* want the allocation workqueue work to run

There's no problem there - create a separate WQ_MEM_RECLAIM workqueue
at each priority level.  Each is guaranteed to make forward progress
and as long as the dependency graph isn't looped, the whole thing is
guaranteed to make forward progress.

> before the end-io processing of the xfsbuf-wq and unwritten-wq
> because of this lock inversion, just like we we always want the
> xfsbufd to run before the unwritten-wq because unwritten extent
> conversion may block waiting for metadata buffer IO to complete, and
> we always want the the xfslog-wq works to run before all of them
> because metadata buffer IO may get blocked waiting for buffers
> pinned by the log to be unpinned for log Io completion...

I'm not really following your logic here.  Are you saying that xfs is
trying to work around cyclic dependency by manipulating execution
order of specific work items?

> We solve these dependencies in a sane manner with a single high
> priority workqueue level, so we're stuck with hacking around the
> worst of the problems for the moment.

I'm afraid I'm lost.  Sans bugs in the rescuer logic, each workqueue
with WQ_MEM_RECLAIM guarantees forward progress of a single work item
at any given time.  If there are dependencies among the work items,
each node of the dependency graph should correspond to a separate
workqueue.  As long as the dependency graph isn't cyclic, the whole is
guaranteed to make forward progress.

There no reason to play with priorities to avoid deadlock.  That
doesn't make any sense to me.  Priority or chained queueing, which is
a lot better way to do it, can be used to optimize queueing and
execution behavior if one set of work items are likely to wait on
another set, but that should have *NOTHING* to do with deadlock
avoidance.

If the dependency graph is cyclic, it will deadlock.  If not, it
won't.  It's as simple as that.

Thanks.

-- 
tejun

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-09 20:36     ` Eric Sandeen
@ 2015-01-10 19:28       ` Tejun Heo
  2015-01-11  0:04         ` Eric Sandeen
  2015-01-12 20:09         ` Eric Sandeen
  0 siblings, 2 replies; 25+ messages in thread
From: Tejun Heo @ 2015-01-10 19:28 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Eric Sandeen, xfs-oss

Hello, Eric.

On Fri, Jan 09, 2015 at 02:36:28PM -0600, Eric Sandeen wrote:
> And the xfsalloc / btree split worker is not on any active task.
> Digging further, it's on a queue, but not started.
> 
> The work queue which contains it is this:
> 
> crash> list -H 0xc000000001cd7b18 -o work_struct.entry -s work_struct
> c0000006182ceed8
> struct work_struct {
...
>   func = 0xd000000005c0b570 <xfs_end_io>
> }
> c0000006182c31f0
> struct work_struct {
...
>   func = 0xd000000005c0b570 <xfs_end_io>
> }
> c0000006182c0748
> struct work_struct {
...
>   func = 0xd000000005c0b570 <xfs_end_io>
> }
> c00000003ad030a0
> struct work_struct {
...
>   func = 0xd000000005c58ef0 <xfs_btree_split_worker>
> }
> 
> ...
> 
> so that's our stuck split worker, behind xfs_end_io's.

As long as the split worker is queued on a separate workqueue, it's
not really stuck behind xfs_end_io's.  If the global pool that the
work item is queued on can't make forward progress due to memory
pressure, the rescuer will be summoned and it will pick out that work
item and execute it.

The only reasons that work item would stay there are

* The rescuer is already executing something else from that workqueue
  and that one is stuck.

* The worker pool is still considered to be making forward progress -
  there's a worker which isn't blocked and can burn CPU cycles.
  ie. if you have a busy spinning work item on the per-cpu workqueue,
  it can stall progress.

...
> and xfs_iomap_write_direct() takes the ilock exclusively.
> 
>         xfs_ilock(ip, XFS_ILOCK_EXCL);
> 
> before calling xfs_bmapi_write(), so it must be the holder.  Until
> this work item runs, everything else working on this inode is stuck,
> but it's not getting run, behind other items waiting for the lock it
> holds.

Again, if xfs is using workqueue correctly, that work item shouldn't
get stuck at all.  What other workqueues are doing is irrelevant.

Thanks.

-- 
tejun

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-10 19:28       ` Tejun Heo
@ 2015-01-11  0:04         ` Eric Sandeen
  2015-01-11  6:33           ` Tejun Heo
  2015-01-12 20:09         ` Eric Sandeen
  1 sibling, 1 reply; 25+ messages in thread
From: Eric Sandeen @ 2015-01-11  0:04 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Eric Sandeen, xfs-oss

On 1/10/15 1:28 PM, Tejun Heo wrote:
> Hello, Eric.


> As long as the split worker is queued on a separate workqueue, it's
> not really stuck behind xfs_end_io's.  If the global pool that the
> work item is queued on can't make forward progress due to memory
> pressure, the rescuer will be summoned and it will pick out that work
> item and execute it.

Ok, but that's not happening...

> The only reasons that work item would stay there are
> 
> * The rescuer is already executing something else from that workqueue
>   and that one is stuck.

I'll have to look at that.  I hope I still have access to the core...

> * The worker pool is still considered to be making forward progress -
>   there's a worker which isn't blocked and can burn CPU cycles.

AFAICT, the first thing in the pool is the xfs_end_io blocked waiting for the ilock.

I assume it's only the first one that matters?

>   ie. if you have a busy spinning work item on the per-cpu workqueue,
>   it can stall progress.

...

> Again, if xfs is using workqueue correctly, that work item shouldn't
> get stuck at all.  What other workqueues are doing is irrelevant.

and yet here we are; one of us must be missing something.  It's quite
possibly me :) but we definitely have this thing wedged, and moving
the xfsalloc item to the front via high priority did solve it.  Not saying
it's the right solution, just a data point.

Thanks,
-Eric
 
> Thanks.
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-11  0:04         ` Eric Sandeen
@ 2015-01-11  6:33           ` Tejun Heo
  0 siblings, 0 replies; 25+ messages in thread
From: Tejun Heo @ 2015-01-11  6:33 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Eric Sandeen, xfs-oss

Hello,

On Sat, Jan 10, 2015 at 06:04:30PM -0600, Eric Sandeen wrote:
> > The only reasons that work item would stay there are
> > 
> > * The rescuer is already executing something else from that workqueue
> >   and that one is stuck.
> 
> I'll have to look at that.  I hope I still have access to the core...

Yes, if this is happening, the rescuer worker which has the name of
the workqueue would be stuck somewhere.

> > * The worker pool is still considered to be making forward progress -
> >   there's a worker which isn't blocked and can burn CPU cycles.
> 
> AFAICT, the first thing in the pool is the xfs_end_io blocked waiting for the ilock.
> 
> I assume it's only the first one that matters?

Whatever work item which is executing on that pool on that CPU.
Checking the tasks which are runnable on that CPU should show it.

> > Again, if xfs is using workqueue correctly, that work item shouldn't
> > get stuck at all.  What other workqueues are doing is irrelevant.
> 
> and yet here we are; one of us must be missing something.  It's quite
> possibly me :) but we definitely have this thing wedged, and moving
> the xfsalloc item to the front via high priority did solve it.  Not saying
> it's the right solution, just a data point.

It sure is possible that workqueue is misbehaving but I'm pretty
doubtful that it'd be, especially given that xfs issue has been around
for quite a while, which excludes recent regressions in the rescuer
logic, and that there hasn't been any other case of failed forward
progress guarantee.

Thanks.

-- 
tejun

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-10 17:41       ` Tejun Heo
@ 2015-01-12  3:30         ` Dave Chinner
  2015-01-13 20:50           ` Tejun Heo
  0 siblings, 1 reply; 25+ messages in thread
From: Dave Chinner @ 2015-01-12  3:30 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Eric Sandeen, Eric Sandeen, xfs-oss

On Sat, Jan 10, 2015 at 12:41:33PM -0500, Tejun Heo wrote:
> Hello, Dave.
> 
> On Sat, Jan 10, 2015 at 10:28:15AM +1100, Dave Chinner wrote:
> > process A			kworker (1..N)
> > ilock(excl)
> > alloc
> >   queue work(allocwq)
> >     (work queued as no kworker
> >      threads available)
> >      				execute work from xfsbuf-wq
> >      				xfs_end_io
> > 				  ilock(excl)
> > 				    (blocks waiting on queued work)
> > 
> > No new kworkers are started, so the queue never makes progress,
> > we deadlock.
> 
> But allocwq is a separate workqueue from xfsbuf-wq and should have its
> own rescuer.  The work item queued by process on A is guaranteed to
> make forward progress no matter what work items on xfsbuf-wq are
> doing.  The deadlock as depicted above cannot happen.  A workqueue
> with WQ_MEM_RECLAIM can deadlock iff an executing work item on the
> workqueue deadlocks.

Eric will have to confirm, but I recall asking Eric to check the
rescuer threads and that they were idle...

....

> > before the end-io processing of the xfsbuf-wq and unwritten-wq
> > because of this lock inversion, just like we we always want the
> > xfsbufd to run before the unwritten-wq because unwritten extent
> > conversion may block waiting for metadata buffer IO to complete, and
> > we always want the the xfslog-wq works to run before all of them
> > because metadata buffer IO may get blocked waiting for buffers
> > pinned by the log to be unpinned for log IO completion...
> 
> I'm not really following your logic here.  Are you saying that xfs is
> trying to work around cyclic dependency by manipulating execution
> order of specific work items?

No, it's not cyclic. They are different dependencies.

Data IO completion can take the XFS inode i_lock. i.e. in the
mp->m_data_workqueue and the mp->m_unwritten_workqueue.

mp->m_data_workqueue has no other dependencies.

mp->m_unwritten_workqueue reads buffers, so is dependent on
mp->m_buf_workqueue for buffer IO completion.

mp->m_unwritten_workqueue can cause btree splits, which can defer
work to the xfs_alloc_wq.

xfs_alloc_wq reads buffers, so it dependent on the
mp->m_buf_workqueue for buffer IO completion.

So lock/wq ordering dependencies are:

m_data_workqueue -> i_lock
m_unwritten_workqueue -> i_lock -> xfs_alloc_wq -> m_buf_workqueue
syscall -> i_lock -> xfs_alloc_wq -> m_buf_workqueue

The issue we see is:

process A:	write(2) -> i_lock -> xfs_alloc_wq
kworkers:	m_data_workqueue -> i_lock
		(blocked on process A work completion)

Queued work:  m_data_workqueue work, xfs_alloc_wq work

Queued work does not appear to be dispatched for some reason, wq
concurrency depth does not appear to be exhausted and rescuer
threads do not appear to be active. Something has gone wrong for
the queued work to be stalled like this.

> There no reason to play with priorities to avoid deadlock.  That
> doesn't make any sense to me.  Priority or chained queueing, which is

Prioritised work queuing is what I suggested, not modifying kworker
scheduler priorities...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] xfs: make global xfsalloc workqueue per-mount
  2015-01-09 18:10 ` [PATCH 1/2] xfs: make global xfsalloc workqueue per-mount Eric Sandeen
@ 2015-01-12 15:35   ` Brian Foster
  0 siblings, 0 replies; 25+ messages in thread
From: Brian Foster @ 2015-01-12 15:35 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Eric Sandeen, xfs-oss

On Fri, Jan 09, 2015 at 12:10:45PM -0600, Eric Sandeen wrote:
> The xfsalloc workqueue is the last remaining global work
> queue, something we've been moving away from in general.
> Make this one per-mount like every other workqueue.
> 
> This also renames it from "xfsalloc" to "xfs-alloc" to
> match the naming convention.
> 
> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
> ---

Looks good to me regardless of the ongoing discussion on the queue
priority:

Reviewed-by: Brian Foster <bfoster@redhat.com>

> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index a6fbf44..72d28d1 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -36,8 +36,6 @@
>  #include "xfs_buf_item.h"
>  #include "xfs_log.h"
>  
> -struct workqueue_struct *xfs_alloc_wq;
> -
>  #define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
>  
>  #define	XFSA_FIXUP_BNO_OK	1
> diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
> index d1b4b6a..39933a1 100644
> --- a/fs/xfs/libxfs/xfs_alloc.h
> +++ b/fs/xfs/libxfs/xfs_alloc.h
> @@ -24,8 +24,6 @@ struct xfs_mount;
>  struct xfs_perag;
>  struct xfs_trans;
>  
> -extern struct workqueue_struct *xfs_alloc_wq;
> -
>  /*
>   * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
>   */
> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> index 81cad43..a8a1369 100644
> --- a/fs/xfs/libxfs/xfs_btree.c
> +++ b/fs/xfs/libxfs/xfs_btree.c
> @@ -2574,7 +2574,7 @@ xfs_btree_split(
>  	args.done = &done;
>  	args.kswapd = current_is_kswapd();
>  	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
> -	queue_work(xfs_alloc_wq, &args.work);
> +	queue_work(cur->bc_mp->m_alloc_workqueue, &args.work);
>  	wait_for_completion(&done);
>  	destroy_work_on_stack(&args.work);
>  	return args.result;
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 22ccf69..7d57a5f 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -175,6 +175,7 @@ typedef struct xfs_mount {
>  	struct workqueue_struct	*m_reclaim_workqueue;
>  	struct workqueue_struct	*m_log_workqueue;
>  	struct workqueue_struct *m_eofblocks_workqueue;
> +	struct workqueue_struct *m_alloc_workqueue;
>  } xfs_mount_t;
>  
>  /*
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 19cbda1..e5bdca9 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -873,8 +873,15 @@ xfs_init_mount_workqueues(
>  	if (!mp->m_eofblocks_workqueue)
>  		goto out_destroy_log;
>  
> +	mp->m_alloc_workqueue = alloc_workqueue("xfs-alloc/%s",
> +			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
> +	if (!mp->m_alloc_workqueue)
> +		goto out_destroy_eofblocks;
> +
>  	return 0;
>  
> +out_destroy_eofblocks:
> +	destroy_workqueue(mp->m_eofblocks_workqueue);
>  out_destroy_log:
>  	destroy_workqueue(mp->m_log_workqueue);
>  out_destroy_reclaim:
> @@ -895,6 +902,7 @@ STATIC void
>  xfs_destroy_mount_workqueues(
>  	struct xfs_mount	*mp)
>  {
> +	destroy_workqueue(mp->m_alloc_workqueue);
>  	destroy_workqueue(mp->m_eofblocks_workqueue);
>  	destroy_workqueue(mp->m_log_workqueue);
>  	destroy_workqueue(mp->m_reclaim_workqueue);
> @@ -1717,29 +1725,6 @@ xfs_destroy_zones(void)
>  }
>  
>  STATIC int __init
> -xfs_init_workqueues(void)
> -{
> -	/*
> -	 * The allocation workqueue can be used in memory reclaim situations
> -	 * (writepage path), and parallelism is only limited by the number of
> -	 * AGs in all the filesystems mounted. Hence use the default large
> -	 * max_active value for this workqueue.
> -	 */
> -	xfs_alloc_wq = alloc_workqueue("xfsalloc",
> -			WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
> -	if (!xfs_alloc_wq)
> -		return -ENOMEM;
> -
> -	return 0;
> -}
> -
> -STATIC void
> -xfs_destroy_workqueues(void)
> -{
> -	destroy_workqueue(xfs_alloc_wq);
> -}
> -
> -STATIC int __init
>  init_xfs_fs(void)
>  {
>  	int			error;
> @@ -1753,13 +1738,9 @@ init_xfs_fs(void)
>  	if (error)
>  		goto out;
>  
> -	error = xfs_init_workqueues();
> -	if (error)
> -		goto out_destroy_zones;
> -
>  	error = xfs_mru_cache_init();
>  	if (error)
> -		goto out_destroy_wq;
> +		goto out_destroy_zones;
>  
>  	error = xfs_buf_init();
>  	if (error)
> @@ -1776,7 +1757,7 @@ init_xfs_fs(void)
>  	xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
>  	if (!xfs_kset) {
>  		error = -ENOMEM;
> -		goto out_sysctl_unregister;;
> +		goto out_sysctl_unregister;
>  	}
>  
>  #ifdef DEBUG
> @@ -1795,27 +1776,25 @@ init_xfs_fs(void)
>  		goto out_qm_exit;
>  	return 0;
>  
> - out_qm_exit:
> +out_qm_exit:
>  	xfs_qm_exit();
> - out_remove_kobj:
> +out_remove_kobj:
>  #ifdef DEBUG
>  	xfs_sysfs_del(&xfs_dbg_kobj);
> - out_kset_unregister:
> +out_kset_unregister:
>  #endif
>  	kset_unregister(xfs_kset);
> - out_sysctl_unregister:
> +out_sysctl_unregister:
>  	xfs_sysctl_unregister();
> - out_cleanup_procfs:
> +out_cleanup_procfs:
>  	xfs_cleanup_procfs();
> - out_buf_terminate:
> +out_buf_terminate:
>  	xfs_buf_terminate();
> - out_mru_cache_uninit:
> +out_mru_cache_uninit:
>  	xfs_mru_cache_uninit();
> - out_destroy_wq:
> -	xfs_destroy_workqueues();
> - out_destroy_zones:
> +out_destroy_zones:
>  	xfs_destroy_zones();
> - out:
> +out:
>  	return error;
>  }
>  
> @@ -1832,7 +1811,6 @@ exit_xfs_fs(void)
>  	xfs_cleanup_procfs();
>  	xfs_buf_terminate();
>  	xfs_mru_cache_uninit();
> -	xfs_destroy_workqueues();
>  	xfs_destroy_zones();
>  }
>  
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-10 19:28       ` Tejun Heo
  2015-01-11  0:04         ` Eric Sandeen
@ 2015-01-12 20:09         ` Eric Sandeen
  2015-01-12 22:53           ` Tejun Heo
  1 sibling, 1 reply; 25+ messages in thread
From: Eric Sandeen @ 2015-01-12 20:09 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Eric Sandeen, xfs-oss

On 1/10/15 1:28 PM, Tejun Heo wrote:
> Hello, Eric.
> 
> On Fri, Jan 09, 2015 at 02:36:28PM -0600, Eric Sandeen wrote:

...

> As long as the split worker is queued on a separate workqueue, it's
> not really stuck behind xfs_end_io's.  If the global pool that the
> work item is queued on can't make forward progress due to memory
> pressure, the rescuer will be summoned and it will pick out that work
> item and execute it.
> 
> The only reasons that work item would stay there are
> 
> * The rescuer is already executing something else from that workqueue
>   and that one is stuck.

That does not seem to be the case:

PID: 2563   TASK: c00000060f101370  CPU: 33  COMMAND: "xfsalloc"
 #0 [c000000602787850] __switch_to at c0000000000164d8
 #1 [c000000602787a20] __switch_to at c0000000000164d8
 #2 [c000000602787a80] __schedule at c000000000900200
 #3 [c000000602787cd0] rescuer_thread at c0000000000ed770
 #4 [c000000602787d80] kthread at c0000000000f8e0c
 #5 [c000000602787e30] ret_from_kernel_thread at c00000000000a3e8

> * The worker pool is still considered to be making forward progress -
>   there's a worker which isn't blocked and can burn CPU cycles.
>   ie. if you have a busy spinning work item on the per-cpu workqueue,
>   it can stall progress.

So, the only interesting runnable task I see is this:

crash> bt 17056
PID: 17056  TASK: c000000111cc0000  CPU: 8   COMMAND: "kworker/u112:1"
 #0 [c000000060b83190] hardware_interrupt_common at c000000000002294
 Hardware Interrupt  [501] exception frame:
 R0:  c00000000090392c    R1:  c000000060b83480    R2:  c0000000010adb68   
 R3:  0000000000000500    R4:  0000000000000001    R5:  0000000000000001   
 R6:  00032e4d45dc10ff    R7:  0000000000ba0000    R8:  0000000000000004   
 R9:  000000000000002b    R10: c0000002cacc0d88    R11: 0000000000000001   
 R12: d000000005c0bef0    R13: c000000007df1c00   
 NIP: c000000000010880    MSR: 8000000100009033    OR3: c00000000047e1cc
 CTR: 0000000000000001    LR:  c000000000010880    XER: 0000000020000000
 CCR: 00000000220c2044    MQ:  0000000000000001    DAR: 8000000100009033
 DSISR: c0000000009544d0     Syscall Result: 0000000000000000

 #1 [c000000060b83480] arch_local_irq_restore at c000000000010880  (unreliable)
 #2 [c000000060b834a0] _raw_spin_unlock_irqrestore at c00000000090392c
 #3 [c000000060b834c0] redirty_page_for_writepage at c000000000230b7c
 #4 [c000000060b83510] xfs_vm_writepage at d000000005c0bfc0 [xfs]
 #5 [c000000060b835f0] write_cache_pages.constprop.10 at c000000000230688
 #6 [c000000060b83730] generic_writepages at c000000000230a00
 #7 [c000000060b837b0] xfs_vm_writepages at d000000005c0a658 [xfs]
 #8 [c000000060b837f0] do_writepages at c0000000002324f0
 #9 [c000000060b83860] __writeback_single_inode at c00000000031eff0
#10 [c000000060b838b0] writeback_sb_inodes at c000000000320e68
#11 [c000000060b839c0] __writeback_inodes_wb at c0000000003212a4
#12 [c000000060b83a30] wb_writeback at c00000000032168c
#13 [c000000060b83b10] bdi_writeback_workfn at c000000000321ea4
#14 [c000000060b83c50] process_one_work at c0000000000ecadc
#15 [c000000060b83cf0] worker_thread at c0000000000ed100
#16 [c000000060b83d80] kthread at c0000000000f8e0c
#17 [c000000060b83e30] ret_from_kernel_thread at c00000000000a3e8

all I have is a snapshot of the system, of course, so I don't know if this
is progressing or not.  But the report is that the system is hung for 
hours (the aio-stress task hasn't run for 1 day, 11:14:39).

Hmmm:

PID: 17056  TASK: c000000111cc0000  CPU: 8   COMMAND: "kworker/u112:1"
    RUN TIME: 1 days, 11:48:06
  START TIME: 285818
       UTIME: 0
       STIME: 126895310000000

(ok, that's some significant system time ...)

vs

PID: 39292  TASK: c000000038240000  CPU: 27  COMMAND: "aio-stress"
    RUN TIME: 1 days, 11:14:40
  START TIME: 287824
       UTIME: 0
       STIME: 130000000

maybe that is spinning... I'm not quite clear on how to definitively
say whether it's blocking the xfsalloc work from completing...

I'll look more at that writeback thread, but what do you think?

Thanks,
-Eric

> ...
>> and xfs_iomap_write_direct() takes the ilock exclusively.
>>
>>         xfs_ilock(ip, XFS_ILOCK_EXCL);
>>
>> before calling xfs_bmapi_write(), so it must be the holder.  Until
>> this work item runs, everything else working on this inode is stuck,
>> but it's not getting run, behind other items waiting for the lock it
>> holds.
> 
> Again, if xfs is using workqueue correctly, that work item shouldn't
> get stuck at all.  What other workqueues are doing is irrelevant.
> 
> Thanks.
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-12 20:09         ` Eric Sandeen
@ 2015-01-12 22:53           ` Tejun Heo
  2015-01-12 23:12             ` Eric Sandeen
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2015-01-12 22:53 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Eric Sandeen, xfs-oss

Hello, Eric.

On Mon, Jan 12, 2015 at 02:09:15PM -0600, Eric Sandeen wrote:
> crash> bt 17056
> PID: 17056  TASK: c000000111cc0000  CPU: 8   COMMAND: "kworker/u112:1"
                                                                 ^
This is an unbound worker which doesn't participate in the concurrency
management, so this can't be the direct source although it can
definitely be causing something else.

>  #0 [c000000060b83190] hardware_interrupt_common at c000000000002294
>  Hardware Interrupt  [501] exception frame:
...
>  #1 [c000000060b83480] arch_local_irq_restore at c000000000010880  (unreliable)
>  #2 [c000000060b834a0] _raw_spin_unlock_irqrestore at c00000000090392c
>  #3 [c000000060b834c0] redirty_page_for_writepage at c000000000230b7c
>  #4 [c000000060b83510] xfs_vm_writepage at d000000005c0bfc0 [xfs]
>  #5 [c000000060b835f0] write_cache_pages.constprop.10 at c000000000230688
>  #6 [c000000060b83730] generic_writepages at c000000000230a00
>  #7 [c000000060b837b0] xfs_vm_writepages at d000000005c0a658 [xfs]
>  #8 [c000000060b837f0] do_writepages at c0000000002324f0
>  #9 [c000000060b83860] __writeback_single_inode at c00000000031eff0
> #10 [c000000060b838b0] writeback_sb_inodes at c000000000320e68
> #11 [c000000060b839c0] __writeback_inodes_wb at c0000000003212a4
> #12 [c000000060b83a30] wb_writeback at c00000000032168c
> #13 [c000000060b83b10] bdi_writeback_workfn at c000000000321ea4
> #14 [c000000060b83c50] process_one_work at c0000000000ecadc
> #15 [c000000060b83cf0] worker_thread at c0000000000ed100
> #16 [c000000060b83d80] kthread at c0000000000f8e0c
> #17 [c000000060b83e30] ret_from_kernel_thread at c00000000000a3e8
> 
> all I have is a snapshot of the system, of course, so I don't know if this
> is progressing or not.  But the report is that the system is hung for 
> hours (the aio-stress task hasn't run for 1 day, 11:14:39).

I see.

> Hmmm:
> 
> PID: 17056  TASK: c000000111cc0000  CPU: 8   COMMAND: "kworker/u112:1"
>     RUN TIME: 1 days, 11:48:06

lol, that's some serious cpu burning.

>   START TIME: 285818
>        UTIME: 0
>        STIME: 126895310000000
> 
> (ok, that's some significant system time ...)
> 
> vs
> 
> PID: 39292  TASK: c000000038240000  CPU: 27  COMMAND: "aio-stress"
>     RUN TIME: 1 days, 11:14:40
>   START TIME: 287824
>        UTIME: 0
>        STIME: 130000000
> 
> maybe that is spinning... I'm not quite clear on how to definitively
> say whether it's blocking the xfsalloc work from completing...
> 
> I'll look more at that writeback thread, but what do you think?

This doesn't look like the direct cause.  It could just be reclaim
path going berserk as the filesystem can't writeout pages.  Can you
dump all runnable tasks?  Was this the only runnable kworker?

Thanks.

-- 
tejun

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-12 22:53           ` Tejun Heo
@ 2015-01-12 23:12             ` Eric Sandeen
  2015-01-12 23:37               ` Tejun Heo
  0 siblings, 1 reply; 25+ messages in thread
From: Eric Sandeen @ 2015-01-12 23:12 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Eric Sandeen, xfs-oss

On 1/12/15 4:53 PM, Tejun Heo wrote:
> Hello, Eric.

Hi Tejun :)

> This doesn't look like the direct cause.  It could just be reclaim
> path going berserk as the filesystem can't writeout pages.  Can you
> dump all runnable tasks?  Was this the only runnable kworker?
> 
> Thanks.

The only tasks in RU state are the one I mentioned, and swapper/X.

crash> ps -m | grep RU
[0 00:00:00.159] [RU]  PID: 17056  TASK: c000000111cc0000  CPU: 8   COMMAND: "kworker/u112:1"
[4 19:11:44.360] [RU]  PID: 0      TASK: c0000000010416f0  CPU: 0   COMMAND: "swapper/0"
[4 19:11:44.192] [RU]  PID: 0      TASK: c000000633280000  CPU: 1   COMMAND: "swapper/1"
[4 19:11:44.192] [RU]  PID: 0      TASK: c0000006332c0000  CPU: 2   COMMAND: "swapper/2"
...

and I don't see anything interesting in those. :)

-Eric


_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-12 23:12             ` Eric Sandeen
@ 2015-01-12 23:37               ` Tejun Heo
  2015-01-13 19:08                 ` Eric Sandeen
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2015-01-12 23:37 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Eric Sandeen, xfs-oss

Hello, Eric.

On Mon, Jan 12, 2015 at 05:12:34PM -0600, Eric Sandeen wrote:
> The only tasks in RU state are the one I mentioned, and swapper/X.
> 
> crash> ps -m | grep RU
> [0 00:00:00.159] [RU]  PID: 17056  TASK: c000000111cc0000  CPU: 8   COMMAND: "kworker/u112:1"
> [4 19:11:44.360] [RU]  PID: 0      TASK: c0000000010416f0  CPU: 0   COMMAND: "swapper/0"
> [4 19:11:44.192] [RU]  PID: 0      TASK: c000000633280000  CPU: 1   COMMAND: "swapper/1"
> [4 19:11:44.192] [RU]  PID: 0      TASK: c0000006332c0000  CPU: 2   COMMAND: "swapper/2"
> ...
> 
> and I don't see anything interesting in those. :)

Heh, I don't get it.  Can you please print out worker_pool->nr_running
of the pool that the hung work item is queued on?

Thanks.

-- 
tejun

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-12 23:37               ` Tejun Heo
@ 2015-01-13 19:08                 ` Eric Sandeen
  2015-01-13 20:19                   ` Tejun Heo
  0 siblings, 1 reply; 25+ messages in thread
From: Eric Sandeen @ 2015-01-13 19:08 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Eric Sandeen, xfs-oss

On 1/12/15 5:37 PM, Tejun Heo wrote:
> Hello, Eric.
> 
> On Mon, Jan 12, 2015 at 05:12:34PM -0600, Eric Sandeen wrote:
>> The only tasks in RU state are the one I mentioned, and swapper/X.
>>
>> crash> ps -m | grep RU
>> [0 00:00:00.159] [RU]  PID: 17056  TASK: c000000111cc0000  CPU: 8   COMMAND: "kworker/u112:1"
>> [4 19:11:44.360] [RU]  PID: 0      TASK: c0000000010416f0  CPU: 0   COMMAND: "swapper/0"
>> [4 19:11:44.192] [RU]  PID: 0      TASK: c000000633280000  CPU: 1   COMMAND: "swapper/1"
>> [4 19:11:44.192] [RU]  PID: 0      TASK: c0000006332c0000  CPU: 2   COMMAND: "swapper/2"
>> ...
>>
>> and I don't see anything interesting in those. :)
> 
> Heh, I don't get it.  Can you please print out worker_pool->nr_running
> of the pool that the hung work item is queued on?

crash> struct worker_pool c000000001cd7b00
struct worker_pool {
  lock = {
    {
      rlock = {
        raw_lock = {
          slock = 0
        }
      }
    }
  }, 
  cpu = 27, 
  node = 0, 
  id = 54, 
  flags = 0, 
  worklist = {
    next = 0xc0000006182ceee0, 
    prev = 0xc000000000fe0470 <release_agent_work+8>
  }, 
  nr_workers = 15, 
  nr_idle = 0, 
...

And that worklist is the one w/ my stuck aio-stress btree split
worker:

crash> list -H 0xc000000001cd7b18 -o work_struct.entry -s work_struct
...
c00000003ad030a0
struct work_struct {
  data = {
    counter = -4611686018397087739
  }, 
  entry = {
    next = 0xc0000006182c5e08, 
    prev = 0xc0000006182c0750
  }, 
  func = 0xd000000005c58ef0 <xfs_btree_split_worker>
}
...

-Eric

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-13 19:08                 ` Eric Sandeen
@ 2015-01-13 20:19                   ` Tejun Heo
  2015-01-13 20:29                     ` Eric Sandeen
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2015-01-13 20:19 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Eric Sandeen, xfs-oss

Hello, Eric.

On Tue, Jan 13, 2015 at 01:08:27PM -0600, Eric Sandeen wrote:
> crash> struct worker_pool c000000001cd7b00
> struct worker_pool {
>   lock = {
>     {
>       rlock = {
>         raw_lock = {
>           slock = 0
>         }
>       }
>     }
>   }, 
>   cpu = 27, 
>   node = 0, 
>   id = 54, 
>   flags = 0, 
>   worklist = {
>     next = 0xc0000006182ceee0, 
>     prev = 0xc000000000fe0470 <release_agent_work+8>
>   }, 
>   nr_workers = 15, 
>   nr_idle = 0, 

Can you please also report the value of nr_running?  That's what
regulates the kick off of new workers and rescuers.

Thanks.

-- 
tejun

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-13 20:19                   ` Tejun Heo
@ 2015-01-13 20:29                     ` Eric Sandeen
  2015-01-13 20:46                       ` Tejun Heo
  0 siblings, 1 reply; 25+ messages in thread
From: Eric Sandeen @ 2015-01-13 20:29 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Eric Sandeen, xfs-oss

On 1/13/15 2:19 PM, Tejun Heo wrote:
> Hello, Eric.
> 
> On Tue, Jan 13, 2015 at 01:08:27PM -0600, Eric Sandeen wrote:
>> crash> struct worker_pool c000000001cd7b00
>> struct worker_pool {
>>   lock = {
>>     {
>>       rlock = {
>>         raw_lock = {
>>           slock = 0
>>         }
>>       }
>>     }
>>   }, 
>>   cpu = 27, 
>>   node = 0, 
>>   id = 54, 
>>   flags = 0, 
>>   worklist = {
>>     next = 0xc0000006182ceee0, 
>>     prev = 0xc000000000fe0470 <release_agent_work+8>
>>   }, 
>>   nr_workers = 15, 
>>   nr_idle = 0, 
> 
> Can you please also report the value of nr_running?  That's what
> regulates the kick off of new workers and rescuers.

sorry about that, swapped nr_workers w/ nr_running in my brain:

  nr_running = {
    counter = 0
  }, 

-=Eric

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-13 20:29                     ` Eric Sandeen
@ 2015-01-13 20:46                       ` Tejun Heo
  2015-01-13 22:58                         ` Eric Sandeen
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2015-01-13 20:46 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Eric Sandeen, xfs-oss

Hello, Eric.

On Tue, Jan 13, 2015 at 02:29:53PM -0600, Eric Sandeen wrote:
> > Can you please also report the value of nr_running?  That's what
> > regulates the kick off of new workers and rescuers.
> 
> sorry about that, swapped nr_workers w/ nr_running in my brain:
> 
>   nr_running = {
>     counter = 0
>   }, 

So,

	nr_workers == 15,
	nr_idle == 0,
	nr_running == 0,

That means one worker must be playing the role of manager by executing
manage_workers() which is also responsible for kicking off the
rescuers if it fails to create new workers in a short period of time.
The manager is identified as the holder of pool->manager_arb and while
a manager is trying to create a worker, pool->mayday_timer must be
armed continuously firing off every MAYDAY_INTERVAL summoning rescuers
to the pool, which should be visible through the pool_pwq->mayday_node
corresponding to the stalled pool being queued on wq->maydays.

Can you post the full dump of the pool, wq and all kworkers?

Thanks.

-- 
tejun

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-12  3:30         ` Dave Chinner
@ 2015-01-13 20:50           ` Tejun Heo
  0 siblings, 0 replies; 25+ messages in thread
From: Tejun Heo @ 2015-01-13 20:50 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Eric Sandeen, Eric Sandeen, xfs-oss

Hello, Dave.

On Mon, Jan 12, 2015 at 02:30:15PM +1100, Dave Chinner wrote:
> So lock/wq ordering dependencies are:
> 
> m_data_workqueue -> i_lock
> m_unwritten_workqueue -> i_lock -> xfs_alloc_wq -> m_buf_workqueue
> syscall -> i_lock -> xfs_alloc_wq -> m_buf_workqueue
> 
> The issue we see is:
> 
> process A:	write(2) -> i_lock -> xfs_alloc_wq
> kworkers:	m_data_workqueue -> i_lock
> 		(blocked on process A work completion)
> 
> Queued work:  m_data_workqueue work, xfs_alloc_wq work
> 
> Queued work does not appear to be dispatched for some reason, wq
> concurrency depth does not appear to be exhausted and rescuer
> threads do not appear to be active. Something has gone wrong for
> the queued work to be stalled like this.

Yeah, this actually looks like a bug in the rescuer or manager
arbitration logic.  I'm gonna see what's going on once Eric posts more
dumps.

Sorry about the trouble.

-- 
tejun

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority
  2015-01-13 20:46                       ` Tejun Heo
@ 2015-01-13 22:58                         ` Eric Sandeen
  2015-01-13 23:35                           ` [PATCH wq/for-3.19] workqueue: fix subtle pool management issue which can stall whole worker_pool Tejun Heo
  0 siblings, 1 reply; 25+ messages in thread
From: Eric Sandeen @ 2015-01-13 22:58 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Eric Sandeen, xfs-oss

On 1/13/15 2:46 PM, Tejun Heo wrote:

> So,
> 
> 	nr_workers == 15,
> 	nr_idle == 0,
> 	nr_running == 0,
> 
> That means one worker must be playing the role of manager by executing
> manage_workers() which is also responsible for kicking off the
> rescuers if it fails to create new workers in a short period of time.
> The manager is identified as the holder of pool->manager_arb and while
> a manager is trying to create a worker, pool->mayday_timer must be
> armed continuously firing off every MAYDAY_INTERVAL summoning rescuers
> to the pool, which should be visible through the pool_pwq->mayday_node
> corresponding to the stalled pool being queued on wq->maydays.
> 
> Can you post the full dump of the pool, wq and all kworkers?
> 
> Thanks.
> 

Just for mailing list archive posterity, Tejun thinks he's found the culprit
in the workqueue code, I or he can follow up again when he has a patch ready
to go.

Thanks Tejun!

-Eric

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH wq/for-3.19] workqueue: fix subtle pool management issue which can stall whole worker_pool
  2015-01-13 22:58                         ` Eric Sandeen
@ 2015-01-13 23:35                           ` Tejun Heo
  2015-01-16 19:32                             ` [PATCH workqueue wq/for-3.19-fixes] " Tejun Heo
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2015-01-13 23:35 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Eric Sandeen, xfs-oss

Hello,

This is the preliminary patch that I have.  I haven't tested it yet.
Once testing is complete, I'll re-post with test results and apply the
patch.

Thanks and sorry about the trouble.

------ 8< ------
A worker_pool's forward progress is guaranteed by the fact that the
last idle worker assumes the manager role to create more workers and
summon the rescuers if creating workers doesn't succeed in timely
manner before proceeding to execute work items.

This manager role is implemented in manage_workers(), which indicates
whether the worker may proceed to work item execution with its return
value.  This is necessary because multiple workers may contend for the
manager role, and, if there already is a manager, others should
proceed to work item execution.

Unfortunately, the function also indicates that the worker may proceed
to work item execution if need_to_create_worker() is false before
releasing pool->lock.  need_to_create_worker() tests the following
conditions.

	pending work items && !nr_running && !nr_idle

The first and third conditions are protected by pool->lock and thus
won't change while holding pool->lock; however, nr_running can change
asynchronously as other workers block and resume and while it's likely
to be zero, as someone woke this worker up, some other workers could
have become runnable inbetween making it non-zero.

If this happens, manage_worker() could return false even with zero
nr_idle making the worker, the last idle one, proceed to execute work
items.  If then all workers of the pool end up blocking on a resource
which can only be released by a work item which is pending on that
pool, the whole pool can deadlock as there's no one to create more
workers or summon the rescuers.

This patch fixes the problem by making manage_workers() return false
iff there's already another manager, which ensures that the last
worker doesn't start executing work items.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Eric Sandeen <sandeen@sandeen.net>
Link: http://lkml.kernel.org/g/54B019F4.8030009@sandeen.net
Cc: Dave Chinner <david@fromorbit.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: stable@vger.kernel.org
---
 kernel/workqueue.c |   25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1841,17 +1841,13 @@ static void pool_mayday_timeout(unsigned
  * spin_lock_irq(pool->lock) which may be released and regrabbed
  * multiple times.  Does GFP_KERNEL allocations.  Called only from
  * manager.
- *
- * Return:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
  */
-static bool maybe_create_worker(struct worker_pool *pool)
+static void maybe_create_worker(struct worker_pool *pool)
 __releases(&pool->lock)
 __acquires(&pool->lock)
 {
 	if (!need_to_create_worker(pool))
-		return false;
+		return;
 restart:
 	spin_unlock_irq(&pool->lock);
 
@@ -1877,7 +1873,6 @@ restart:
 	 */
 	if (need_to_create_worker(pool))
 		goto restart;
-	return true;
 }
 
 /**
@@ -1897,16 +1892,14 @@ restart:
  * multiple times.  Does GFP_KERNEL allocations.
  *
  * Return:
- * %false if the pool don't need management and the caller can safely start
- * processing works, %true indicates that the function released pool->lock
- * and reacquired it to perform some management function and that the
- * conditions that the caller verified while holding the lock before
- * calling the function might no longer be true.
+ * %false if the pool doesn't need management and the caller can safely
+ * start processing works, %true if management function was performed and
+ * the conditions that the caller verified before calling the function may
+ * no longer be true.
  */
 static bool manage_workers(struct worker *worker)
 {
 	struct worker_pool *pool = worker->pool;
-	bool ret = false;
 
 	/*
 	 * Anyone who successfully grabs manager_arb wins the arbitration
@@ -1919,12 +1912,12 @@ static bool manage_workers(struct worker
 	 * actual management, the pool may stall indefinitely.
 	 */
 	if (!mutex_trylock(&pool->manager_arb))
-		return ret;
+		return false;
 
-	ret |= maybe_create_worker(pool);
+	maybe_create_worker(pool);
 
 	mutex_unlock(&pool->manager_arb);
-	return ret;
+	return true;
 }
 
 /**

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH workqueue wq/for-3.19-fixes] workqueue: fix subtle pool management issue which can stall whole worker_pool
  2015-01-13 23:35                           ` [PATCH wq/for-3.19] workqueue: fix subtle pool management issue which can stall whole worker_pool Tejun Heo
@ 2015-01-16 19:32                             ` Tejun Heo
  2015-01-19  2:15                               ` Lai Jiangshan
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2015-01-16 19:32 UTC (permalink / raw)
  To: Eric Sandeen, Lai Jiangshan; +Cc: Eric Sandeen, linux-kernel, xfs-oss

>From 29187a9eeaf362d8422e62e17a22a6e115277a49 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 16 Jan 2015 14:21:16 -0500

A worker_pool's forward progress is guaranteed by the fact that the
last idle worker assumes the manager role to create more workers and
summon the rescuers if creating workers doesn't succeed in timely
manner before proceeding to execute work items.

This manager role is implemented in manage_workers(), which indicates
whether the worker may proceed to work item execution with its return
value.  This is necessary because multiple workers may contend for the
manager role, and, if there already is a manager, others should
proceed to work item execution.

Unfortunately, the function also indicates that the worker may proceed
to work item execution if need_to_create_worker() is false at the head
of the function.  need_to_create_worker() tests the following
conditions.

	pending work items && !nr_running && !nr_idle

The first and third conditions are protected by pool->lock and thus
won't change while holding pool->lock; however, nr_running can change
asynchronously as other workers block and resume and while it's likely
to be zero, as someone woke this worker up in the first place, some
other workers could have become runnable inbetween making it non-zero.

If this happens, manage_worker() could return false even with zero
nr_idle making the worker, the last idle one, proceed to execute work
items.  If then all workers of the pool end up blocking on a resource
which can only be released by a work item which is pending on that
pool, the whole pool can deadlock as there's no one to create more
workers or summon the rescuers.

This patch fixes the problem by removing the early exit condition from
maybe_create_worker() and making manage_workers() return false iff
there's already another manager, which ensures that the last worker
doesn't start executing work items.

We can leave the early exit condition alone and just ignore the return
value but the only reason it was put there is because the
manage_workers() used to perform both creations and destructions of
workers and thus the function may be invoked while the pool is trying
to reduce the number of workers.  Now that manage_workers() is called
only when more workers are needed, the only case this early exit
condition is triggered is rare race conditions rendering it pointless.

Tested with simulated workload and modified workqueue code which
trigger the pool deadlock reliably without this patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Eric Sandeen <sandeen@sandeen.net>
Link: http://lkml.kernel.org/g/54B019F4.8030009@sandeen.net
Cc: Dave Chinner <david@fromorbit.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: stable@vger.kernel.org
---
Hello,

It took quite some effort to reproduce the issue and verify the fix,
but this works.  Applying to libata/for-3.19-fixes.

Thanks.

 kernel/workqueue.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6202b08..beeeac9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool)
  * spin_lock_irq(pool->lock) which may be released and regrabbed
  * multiple times.  Does GFP_KERNEL allocations.  Called only from
  * manager.
- *
- * Return:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
  */
-static bool maybe_create_worker(struct worker_pool *pool)
+static void maybe_create_worker(struct worker_pool *pool)
 __releases(&pool->lock)
 __acquires(&pool->lock)
 {
-	if (!need_to_create_worker(pool))
-		return false;
 restart:
 	spin_unlock_irq(&pool->lock);
 
@@ -1877,7 +1871,6 @@ restart:
 	 */
 	if (need_to_create_worker(pool))
 		goto restart;
-	return true;
 }
 
 /**
@@ -1897,16 +1890,14 @@ restart:
  * multiple times.  Does GFP_KERNEL allocations.
  *
  * Return:
- * %false if the pool don't need management and the caller can safely start
- * processing works, %true indicates that the function released pool->lock
- * and reacquired it to perform some management function and that the
- * conditions that the caller verified while holding the lock before
- * calling the function might no longer be true.
+ * %false if the pool doesn't need management and the caller can safely
+ * start processing works, %true if management function was performed and
+ * the conditions that the caller verified before calling the function may
+ * no longer be true.
  */
 static bool manage_workers(struct worker *worker)
 {
 	struct worker_pool *pool = worker->pool;
-	bool ret = false;
 
 	/*
 	 * Anyone who successfully grabs manager_arb wins the arbitration
@@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker)
 	 * actual management, the pool may stall indefinitely.
 	 */
 	if (!mutex_trylock(&pool->manager_arb))
-		return ret;
+		return false;
 
-	ret |= maybe_create_worker(pool);
+	maybe_create_worker(pool);
 
 	mutex_unlock(&pool->manager_arb);
-	return ret;
+	return true;
 }
 
 /**
-- 
2.1.0

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH workqueue wq/for-3.19-fixes] workqueue: fix subtle pool management issue which can stall whole worker_pool
  2015-01-16 19:32                             ` [PATCH workqueue wq/for-3.19-fixes] " Tejun Heo
@ 2015-01-19  2:15                               ` Lai Jiangshan
  0 siblings, 0 replies; 25+ messages in thread
From: Lai Jiangshan @ 2015-01-19  2:15 UTC (permalink / raw)
  To: Tejun Heo, Eric Sandeen; +Cc: Eric Sandeen, linux-kernel, xfs-oss

On 01/17/2015 03:32 AM, Tejun Heo wrote:
>>From 29187a9eeaf362d8422e62e17a22a6e115277a49 Mon Sep 17 00:00:00 2001
> From: Tejun Heo <tj@kernel.org>
> Date: Fri, 16 Jan 2015 14:21:16 -0500
> 
> A worker_pool's forward progress is guaranteed by the fact that the
> last idle worker assumes the manager role to create more workers and
> summon the rescuers if creating workers doesn't succeed in timely
> manner before proceeding to execute work items.
> 
> This manager role is implemented in manage_workers(), which indicates
> whether the worker may proceed to work item execution with its return
> value.  This is necessary because multiple workers may contend for the
> manager role, and, if there already is a manager, others should
> proceed to work item execution.
> 
> Unfortunately, the function also indicates that the worker may proceed
> to work item execution if need_to_create_worker() is false at the head
> of the function.  need_to_create_worker() tests the following
> conditions.
> 
> 	pending work items && !nr_running && !nr_idle

> 
> The first and third conditions are protected by pool->lock and thus
> won't change while holding pool->lock; however, nr_running can change
> asynchronously as other workers block and resume and while it's likely
> to be zero, as someone woke this worker up in the first place, some
> other workers could have become runnable inbetween making it non-zero.

I had sent a similar patch earlier:
https://lkml.org/lkml/2014/7/10/446

It was a shame for me that I did not think deeply enough at the time.

> 
> If this happens, manage_worker() could return false even with zero
> nr_idle making the worker, the last idle one, proceed to execute work
> items.  If then all workers of the pool end up blocking on a resource
> which can only be released by a work item which is pending on that
> pool, the whole pool can deadlock as there's no one to create more
> workers or summon the rescuers.

How nr_running is decreased to zero in this case?

( The decreasing of nr_running is also protected by "X" )

( I just checked the cpu-hotplug code again ... find no suspect)

> -static bool maybe_create_worker(struct worker_pool *pool)
> +static void maybe_create_worker(struct worker_pool *pool)
>  __releases(&pool->lock)
>  __acquires(&pool->lock)
>  {
> -	if (!need_to_create_worker(pool))
> -		return false;


It only returns false here; if there were a bug, the bug would be here.
(But it still holds the pool->lock, with no releasing from the beginning to here.)

My doubt might be wrong, but at least it is a good cleanup.
Acked-by: Lai Jiangshan <laijs@cn.fujitsu.com>

Thanks
Lai

>  restart:
>  	spin_unlock_irq(&pool->lock);
>  
> @@ -1877,7 +1871,6 @@ restart:
>  	 */
>  	if (need_to_create_worker(pool))
>  		goto restart;
> -	return true;
>  }
>  
>  /**
> @@ -1897,16 +1890,14 @@ restart:
>   * multiple times.  Does GFP_KERNEL allocations.
>   *
>   * Return:
> - * %false if the pool don't need management and the caller can safely start
> - * processing works, %true indicates that the function released pool->lock
> - * and reacquired it to perform some management function and that the
> - * conditions that the caller verified while holding the lock before
> - * calling the function might no longer be true.
> + * %false if the pool doesn't need management and the caller can safely
> + * start processing works, %true if management function was performed and
> + * the conditions that the caller verified before calling the function may
> + * no longer be true.
>   */
>  static bool manage_workers(struct worker *worker)
>  {
>  	struct worker_pool *pool = worker->pool;
> -	bool ret = false;
>  
>  	/*
>  	 * Anyone who successfully grabs manager_arb wins the arbitration
> @@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker)
>  	 * actual management, the pool may stall indefinitely.
>  	 */
>  	if (!mutex_trylock(&pool->manager_arb))
> -		return ret;
> +		return false;
>  
> -	ret |= maybe_create_worker(pool);
> +	maybe_create_worker(pool);
>  
>  	mutex_unlock(&pool->manager_arb);
> -	return ret;
> +	return true;
>  }
>  
>  /**
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2015-01-19  2:30 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-01-09 18:08 [PATCH 0/2] xfs: make xfs allocation workqueue per-mount, and high priority Eric Sandeen
2015-01-09 18:10 ` [PATCH 1/2] xfs: make global xfsalloc workqueue per-mount Eric Sandeen
2015-01-12 15:35   ` Brian Foster
2015-01-09 18:12 ` [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority Eric Sandeen
2015-01-09 18:23   ` Tejun Heo
2015-01-09 20:36     ` Eric Sandeen
2015-01-10 19:28       ` Tejun Heo
2015-01-11  0:04         ` Eric Sandeen
2015-01-11  6:33           ` Tejun Heo
2015-01-12 20:09         ` Eric Sandeen
2015-01-12 22:53           ` Tejun Heo
2015-01-12 23:12             ` Eric Sandeen
2015-01-12 23:37               ` Tejun Heo
2015-01-13 19:08                 ` Eric Sandeen
2015-01-13 20:19                   ` Tejun Heo
2015-01-13 20:29                     ` Eric Sandeen
2015-01-13 20:46                       ` Tejun Heo
2015-01-13 22:58                         ` Eric Sandeen
2015-01-13 23:35                           ` [PATCH wq/for-3.19] workqueue: fix subtle pool management issue which can stall whole worker_pool Tejun Heo
2015-01-16 19:32                             ` [PATCH workqueue wq/for-3.19-fixes] " Tejun Heo
2015-01-19  2:15                               ` Lai Jiangshan
2015-01-09 23:28     ` [PATCH 2/2] xfs: mark the xfs-alloc workqueue as high priority Dave Chinner
2015-01-10 17:41       ` Tejun Heo
2015-01-12  3:30         ` Dave Chinner
2015-01-13 20:50           ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox