* [PATCH 1/2] btrfs: reada: limit max works count @ 2016-01-12 7:46 Zhao Lei 2016-01-12 7:46 ` [PATCH 2/2] btrfs: reada: simplify dev->reada_in_flight processing Zhao Lei 2016-01-20 15:16 ` [PATCH 1/2] btrfs: reada: limit max works count Chris Mason 0 siblings, 2 replies; 12+ messages in thread From: Zhao Lei @ 2016-01-12 7:46 UTC (permalink / raw) To: linux-btrfs; +Cc: Zhao Lei reada create 2 works for each level of tree in recursion. In case of a tree having many levels, the number of created works is 2^level_of_tree. Actually we don't need so many works in parallel, this patch limit max works to BTRFS_MAX_MIRRORS * 2. Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> --- fs/btrfs/reada.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 53ee7b1..7b150b2 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -103,6 +103,9 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info); static int reada_add_block(struct reada_control *rc, u64 logical, struct btrfs_key *top, u64 generation); +/* To limit max reada works */ +static atomic_t works_cnt = ATOMIC_INIT(0); + /* recurses */ /* in case of err, eb might be NULL */ static void __readahead_hook(struct btrfs_fs_info *fs_info, @@ -759,6 +762,8 @@ static void reada_start_machine_worker(struct btrfs_work *work) set_task_ioprio(current, BTRFS_IOPRIO_READA); __reada_start_machine(fs_info); set_task_ioprio(current, old_ioprio); + + atomic_dec(&works_cnt); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) @@ -790,8 +795,11 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) * enqueue to workers to finish it. This will distribute the load to * the cores. */ - for (i = 0; i < 2; ++i) + for (i = 0; i < 2; ++i) { reada_start_machine(fs_info); + if (atomic_read(&works_cnt) > BTRFS_MAX_MIRRORS * 2) + break; + } } static void reada_start_machine(struct btrfs_fs_info *fs_info) @@ -808,6 +816,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); + atomic_inc(&works_cnt); } #ifdef DEBUG -- 1.8.5.1 ^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 2/2] btrfs: reada: simplify dev->reada_in_flight processing 2016-01-12 7:46 [PATCH 1/2] btrfs: reada: limit max works count Zhao Lei @ 2016-01-12 7:46 ` Zhao Lei 2016-01-20 15:16 ` [PATCH 1/2] btrfs: reada: limit max works count Chris Mason 1 sibling, 0 replies; 12+ messages in thread From: Zhao Lei @ 2016-01-12 7:46 UTC (permalink / raw) To: linux-btrfs; +Cc: Zhao Lei No need to decrease dev->reada_in_flight in __readahead_hook()'s internal and reada_extent_put(). reada_extent_put() have no chance to decrease dev->reada_in_flight in free operation, because reada_extent have additional refcnt when scheduled to a dev. We can put inc and dec operation for dev->reada_in_flight to one place instead to make logic simple and safe, and move useless reada_extent->scheduled_for to a bool flag instead. Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> --- fs/btrfs/reada.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 7b150b2..87c211a 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -72,7 +72,7 @@ struct reada_extent { spinlock_t lock; struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; - struct btrfs_device *scheduled_for; + int scheduled; }; struct reada_zone { @@ -118,7 +118,6 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, u64 bytenr; u64 generation; struct list_head list; - struct btrfs_device *for_dev; if (eb) level = btrfs_header_level(eb); @@ -129,8 +128,7 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, * don't need the lock anymore */ list_replace_init(&re->extctl, &list); - for_dev = re->scheduled_for; - re->scheduled_for = NULL; + re->scheduled = 0; spin_unlock(&re->lock); /* @@ -215,9 +213,6 @@ cleanup: reada_extent_put(fs_info, re); /* one ref for each entry */ } - if (for_dev) - atomic_dec(&for_dev->reada_in_flight); - return; } @@ -543,8 +538,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - if (re->scheduled_for) - atomic_dec(&re->scheduled_for->reada_in_flight); kfree(re); } @@ -710,12 +703,12 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); spin_lock(&re->lock); - if (re->scheduled_for || list_empty(&re->extctl)) { + if (re->scheduled || list_empty(&re->extctl)) { spin_unlock(&re->lock); reada_extent_put(fs_info, re); return 0; } - re->scheduled_for = dev; + re->scheduled = 1; spin_unlock(&re->lock); /* @@ -740,6 +733,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, if (eb) free_extent_buffer(eb); + atomic_dec(&dev->reada_in_flight); reada_extent_put(fs_info, re); return 1; @@ -864,10 +858,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) if (ret == 0) break; printk(KERN_DEBUG - " re: logical %llu size %u empty %d for %lld", + " re: logical %llu size %u empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), re->scheduled_for ? 
- re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", @@ -894,15 +887,14 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) index, 1); if (ret == 0) break; - if (!re->scheduled_for) { + if (!re->scheduled) { index = (re->logical >> PAGE_CACHE_SHIFT) + 1; continue; } printk(KERN_DEBUG - "re: logical %llu size %u list empty %d for %lld", + "re: logical %llu size %u list empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), - re->scheduled_for ? re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", re->zones[i]->start, -- 1.8.5.1 ^ permalink raw reply related [flat|nested] 12+ messages in thread
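One note that may help readers follow the accounting change: after this patch, the increment and the matching decrement of dev->reada_in_flight are meant to live in a single function, reada_start_machine_dev(), around the read itself; __readahead_hook() and reada_extent_put() no longer touch the counter. The fragment below is a simplified sketch reconstructed from the hunks above, not a verbatim copy of the function.

	/* sketch: tail of reada_start_machine_dev() after this patch */
	atomic_inc(&dev->reada_in_flight);	/* pre-existing increment */
	ret = reada_tree_block_flagged(...);	/* issue the readahead */
	/*
	 * __readahead_hook() (called from here on error, or from the
	 * end_io path on completion) clears re->scheduled but no longer
	 * touches reada_in_flight.
	 */
	if (eb)
		free_extent_buffer(eb);
	atomic_dec(&dev->reada_in_flight);	/* added by this patch */
	reada_extent_put(fs_info, re);
	return 1;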
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-12 7:46 [PATCH 1/2] btrfs: reada: limit max works count Zhao Lei 2016-01-12 7:46 ` [PATCH 2/2] btrfs: reada: simplify dev->reada_in_flight processing Zhao Lei @ 2016-01-20 15:16 ` Chris Mason 2016-01-20 17:48 ` Chris Mason 1 sibling, 1 reply; 12+ messages in thread From: Chris Mason @ 2016-01-20 15:16 UTC (permalink / raw) To: Zhao Lei; +Cc: linux-btrfs On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > reada create 2 works for each level of tree in recursion. > > In case of a tree having many levels, the number of created works > is 2^level_of_tree. > Actually we don't need so many works in parallel, this patch limit > max works to BTRFS_MAX_MIRRORS * 2. Hi, I don't think you end up calling atomic_dec() for every time that reada_start_machine() is called. Also, I'd rather not have a global static variable to limit the parallel workers, when we have more than one FS mounted it'll end up limiting things too much. With this patch applied, I'm seeing deadlocks during btrfs/066. You have to run the scrub tests as well, basically we're just getting fsstress run alongside scrub. I'll run a few more times with it reverted to make sure, but I think it's the root cause. ----- stack summary 6 hits: [<ffffffff813ec92a>] wait_current_trans+0xca/0x140 [<ffffffff813ee248>] start_transaction+0x278/0x5b0 [<ffffffff813eea97>] btrfs_attach_transaction_barrier+0x27/0x60 [<ffffffff813b4835>] btrfs_sync_fs+0x85/0x1d0 [<ffffffff8122bcf0>] sync_fs_one_sb+0x20/0x30 [<ffffffff811f579f>] iterate_supers+0xaf/0xe0 [<ffffffff8122c1e5>] sys_sync+0x55/0x90 [<ffffffff819c00c7>] tracesys_phase2+0x84/0x89 [<ffffffffffffffff>] 0xffffffffffffffff ----- 1 hit: [<ffffffff813ec92a>] wait_current_trans+0xca/0x140 [<ffffffff813ee248>] start_transaction+0x278/0x5b0 [<ffffffff813ee597>] btrfs_attach_transaction+0x17/0x20 [<ffffffff813e6b27>] transaction_kthread+0x1b7/0x290 [<ffffffff81082e09>] kthread+0xe9/0x110 [<ffffffff819c02ff>] ret_from_fork+0x3f/0x70 [<ffffffffffffffff>] 0xffffffffffffffff ----- [<ffffffff814506cf>] btrfs_scrub_pause+0xdf/0x150 [<ffffffff813ed2f4>] btrfs_commit_transaction+0x3b4/0xc70 [<ffffffff81424724>] create_subvol+0x504/0x8d0 [<ffffffff81424c63>] btrfs_mksubvol+0x173/0x510 [<ffffffff8142511e>] btrfs_ioctl_snap_create_transid+0x11e/0x1a0 [<ffffffff814251fe>] btrfs_ioctl_snap_create+0x5e/0x80 [<ffffffff8142dbbb>] btrfs_ioctl+0xc6b/0x1190 [<ffffffff8120624a>] do_vfs_ioctl+0x8a/0x560 [<ffffffff812067b2>] SyS_ioctl+0x92/0xa0 [<ffffffff819c00c7>] tracesys_phase2+0x84/0x89 [<ffffffffffffffff>] 0xffffffffffffffff ----- [<ffffffff81458d36>] btrfs_reada_wait+0x86/0xf0 [<ffffffff81456dc4>] scrub_stripe+0x274/0x1180 [<ffffffff81457de9>] scrub_chunk+0x119/0x160 [<ffffffff814581b7>] scrub_enumerate_chunks+0x387/0x730 [<ffffffff81458740>] btrfs_scrub_dev+0x1e0/0x620 [<ffffffff8142b7d1>] btrfs_ioctl_scrub+0xb1/0x120 [<ffffffff8142d970>] btrfs_ioctl+0xa20/0x1190 [<ffffffff8120624a>] do_vfs_ioctl+0x8a/0x560 [<ffffffff812067b2>] SyS_ioctl+0x92/0xa0 [<ffffffff819c00c7>] tracesys_phase2+0x84/0x89 [<ffffffffffffffff>] 0xffffffffffffffff ^ permalink raw reply [flat|nested] 12+ messages in thread
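On the point above about a file-scope static being too coarse when several filesystems are mounted: below is a minimal sketch of a per-filesystem variant, with the counter kept in btrfs_fs_info so each mount is throttled independently. The field name and exact placement are illustrative, not taken from the posted patch.

	/* sketch: per-fs counter instead of the global works_cnt */
	struct btrfs_fs_info {
		/* ... existing fields ... */
		atomic_t reada_works_cnt;	/* in-flight readahead workers */
	};

	/* where a worker is queued, in reada_start_machine(): */
	btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
	atomic_inc(&fs_info->reada_works_cnt);

	/* at the end of reada_start_machine_worker(): */
	atomic_dec(&fs_info->reada_works_cnt);

	/* and the limit check in __reada_start_machine(): */
	if (atomic_read(&fs_info->reada_works_cnt) > BTRFS_MAX_MIRRORS * 2)
		break;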
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-20 15:16 ` [PATCH 1/2] btrfs: reada: limit max works count Chris Mason @ 2016-01-20 17:48 ` Chris Mason 2016-01-21 3:36 ` Zhao Lei 2016-01-21 10:06 ` Zhao Lei 0 siblings, 2 replies; 12+ messages in thread From: Chris Mason @ 2016-01-20 17:48 UTC (permalink / raw) To: Zhao Lei, linux-btrfs On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > reada create 2 works for each level of tree in recursion. > > > > In case of a tree having many levels, the number of created works > > is 2^level_of_tree. > > Actually we don't need so many works in parallel, this patch limit > > max works to BTRFS_MAX_MIRRORS * 2. > > Hi, > > I don't think you end up calling atomic_dec() for every time that > reada_start_machine() is called. Also, I'd rather not have a global > static variable to limit the parallel workers, when we have more than > one FS mounted it'll end up limiting things too much. > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > have to run the scrub tests as well, basically we're just getting > fsstress run alongside scrub. > > I'll run a few more times with it reverted to make sure, but I think > it's the root cause. I spoke too soon, it ended up deadlocking a few tests later. Sorry for now I'm pulling all the reada patches. We'll sort out bug fixes vs cleanups in later rcs. With all of the reada patches removed, the deadlocks are gone. -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-20 17:48 ` Chris Mason @ 2016-01-21 3:36 ` Zhao Lei 2016-01-21 10:06 ` Zhao Lei 1 sibling, 0 replies; 12+ messages in thread From: Zhao Lei @ 2016-01-21 3:36 UTC (permalink / raw) To: 'Chris Mason', linux-btrfs > -----Original Message----- > From: Chris Mason [mailto:clm@fb.com] > Sent: Thursday, January 21, 2016 1:48 AM > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > reada create 2 works for each level of tree in recursion. > > > > > > In case of a tree having many levels, the number of created works is > > > 2^level_of_tree. > > > Actually we don't need so many works in parallel, this patch limit > > > max works to BTRFS_MAX_MIRRORS * 2. > > > > Hi, > > > > I don't think you end up calling atomic_dec() for every time that > > reada_start_machine() is called. Also, I'd rather not have a global > > static variable to limit the parallel workers, when we have more than > > one FS mounted it'll end up limiting things too much. > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > have to run the scrub tests as well, basically we're just getting > > fsstress run alongside scrub. > > > > I'll run a few more times with it reverted to make sure, but I think > > it's the root cause. > > I spoke too soon, it ended up deadlocking a few tests later. Sorry for now I'm > pulling all the reada patches. We'll sort out bug fixes vs cleanups in later rcs. > > With all of the reada patches removed, the deadlocks are gone. > Sorry to hear that. Actually, I ran xfstests with all patches applied and saw no regression in my environment: FSTYP -- btrfs PLATFORM -- Linux/x86_64 lenovo 4.4.0-rc6_HEAD_8e16378041f7f3531c256fd3e17a36a4fca92d29_+ MKFS_OPTIONS -- /dev/sdb6 MOUNT_OPTIONS -- /dev/sdb6 /var/ltf/tester/scratch_mnt btrfs/066 151s ... 164s Ran: btrfs/066 Passed all 1 tests I'll investigate the root cause. Thanks Zhaolei > -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-20 17:48 ` Chris Mason 2016-01-21 3:36 ` Zhao Lei @ 2016-01-21 10:06 ` Zhao Lei 2016-01-21 14:14 ` Chris Mason 1 sibling, 1 reply; 12+ messages in thread From: Zhao Lei @ 2016-01-21 10:06 UTC (permalink / raw) To: 'Chris Mason', linux-btrfs Hi, Chris Mason > -----Original Message----- > From: Chris Mason [mailto:clm@fb.com] > Sent: Thursday, January 21, 2016 1:48 AM > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > reada create 2 works for each level of tree in recursion. > > > > > > In case of a tree having many levels, the number of created works is > > > 2^level_of_tree. > > > Actually we don't need so many works in parallel, this patch limit > > > max works to BTRFS_MAX_MIRRORS * 2. > > > > Hi, > > > > I don't think you end up calling atomic_dec() for every time that > > reada_start_machine() is called. Also, I'd rather not have a global > > static variable to limit the parallel workers, when we have more than > > one FS mounted it'll end up limiting things too much. > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > have to run the scrub tests as well, basically we're just getting > > fsstress run alongside scrub. > > > > I'll run a few more times with it reverted to make sure, but I think > > it's the root cause. > > I spoke too soon, it ended up deadlocking a few tests later. > Logically, even if the atomic_dec() accounting in this patch has a bug, in the worst case reada would just fall back to single-threaded operation; it should not introduce a deadlock. And judging from the backtraces in this mail, the hang may be caused by the reada_control->elems handling somewhere in this patchset. I reran xfstests btrfs/066 many times on both a VM and a physical machine, on top of my pull-request tree as of today, with btrfs-progs 4.4, but could not trigger the bug. Could you tell me your test environment (TEST_DEV size, mount options) and the odds of btrfs/066 failing? Thanks Zhaolei > Sorry for now I'm pulling all the reada patches. We'll sort out bug fixes vs cleanups in later rcs. > > With all of the reada patches removed, the deadlocks are gone. > > -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-21 10:06 ` Zhao Lei @ 2016-01-21 14:14 ` Chris Mason 2016-01-22 12:25 ` Zhao Lei 0 siblings, 1 reply; 12+ messages in thread From: Chris Mason @ 2016-01-21 14:14 UTC (permalink / raw) To: Zhao Lei; +Cc: linux-btrfs On Thu, Jan 21, 2016 at 06:06:21PM +0800, Zhao Lei wrote: > Hi, Chris Mason > > > -----Original Message----- > > From: Chris Mason [mailto:clm@fb.com] > > Sent: Thursday, January 21, 2016 1:48 AM > > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > In case of a tree having many levels, the number of created works is > > > > 2^level_of_tree. > > > > Actually we don't need so many works in parallel, this patch limit > > > > max works to BTRFS_MAX_MIRRORS * 2. > > > > > > Hi, > > > > > > I don't think you end up calling atomic_dec() for every time that > > > reada_start_machine() is called. Also, I'd rather not have a global > > > static variable to limit the parallel workers, when we have more than > > > one FS mounted it'll end up limiting things too much. > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > > have to run the scrub tests as well, basically we're just getting > > > fsstress run alongside scrub. > > > > > > I'll run a few more times with it reverted to make sure, but I think > > > it's the root cause. > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > In logic, even if the calculation of atomic_dec() in this patch having bug, > in worst condition, reada will works in single-thread mode, and will not > introduce deadlock. > > And by looking the backtrace in this mail, maybe it is caused by > reada_control->elems in someplace of this patchset. > > I recheck xfstests/066 in both vm and physical machine, on top of my pull-request > git today, with btrfs-progs 4.4 for many times, but had not triggered the bug. Just running 066 alone doesn't trigger it for me. I have to run everything from 00->066. My setup is 5 drives. I use a script to carve them up into logical volumes, 5 for the test device and 5 for the scratch pool. I think it should reproduce with a single drive, if you still can't trigger I'll confirm that. > > Could you tell me your test environment(TEST_DEV size, mount option), > and odds of fails in btrfs/066? 100% odds of failing, one time it made it up to btrfs/072. I think more important than the drive setup is that I have all the debugging on. CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and lock dep enabled. -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-21 14:14 ` Chris Mason @ 2016-01-22 12:25 ` Zhao Lei 2016-01-22 14:19 ` Chris Mason 0 siblings, 1 reply; 12+ messages in thread From: Zhao Lei @ 2016-01-22 12:25 UTC (permalink / raw) To: 'Chris Mason'; +Cc: linux-btrfs Hi, Chris Mason > -----Original Message----- > From: Chris Mason [mailto:clm@fb.com] > Sent: Thursday, January 21, 2016 10:15 PM > To: Zhao Lei <zhaolei@cn.fujitsu.com> > Cc: linux-btrfs@vger.kernel.org > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > On Thu, Jan 21, 2016 at 06:06:21PM +0800, Zhao Lei wrote: > > Hi, Chris Mason > > > > > -----Original Message----- > > > From: Chris Mason [mailto:clm@fb.com] > > > Sent: Thursday, January 21, 2016 1:48 AM > > > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > > > In case of a tree having many levels, the number of created > > > > > works is 2^level_of_tree. > > > > > Actually we don't need so many works in parallel, this patch > > > > > limit max works to BTRFS_MAX_MIRRORS * 2. > > > > > > > > Hi, > > > > > > > > I don't think you end up calling atomic_dec() for every time that > > > > reada_start_machine() is called. Also, I'd rather not have a > > > > global static variable to limit the parallel workers, when we have > > > > more than one FS mounted it'll end up limiting things too much. > > > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > > > have to run the scrub tests as well, basically we're just getting > > > > fsstress run alongside scrub. > > > > > > > > I'll run a few more times with it reverted to make sure, but I > > > > think it's the root cause. > > > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > > > In logic, even if the calculation of atomic_dec() in this patch having > > bug, in worst condition, reada will works in single-thread mode, and > > will not introduce deadlock. > > > > And by looking the backtrace in this mail, maybe it is caused by > > reada_control->elems in someplace of this patchset. > > > > I recheck xfstests/066 in both vm and physical machine, on top of my > > pull-request git today, with btrfs-progs 4.4 for many times, but had not > triggered the bug. > > Just running 066 alone doesn't trigger it for me. I have to run everything from > 00->066. > > My setup is 5 drives. I use a script to carve them up into logical volumes, 5 for > the test device and 5 for the scratch pool. I think it should reproduce with a > single drive, if you still can't trigger I'll confirm that. > > > > > Could you tell me your test environment(TEST_DEV size, mount option), > > and odds of fails in btrfs/066? > > 100% odds of failing, one time it made it up to btrfs/072. I think more > important than the drive setup is that I have all the debugging on. > CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and lock > dep enabled. > Thanks for your answer. But unfortunately I hadn't reproduce the dead_lock in above way today... Now I queued loop of above reproduce script in more nodes, and hopes it can happen in this weekend. And by reviewing code, I found a problem which can introduce similar bad result in logic, and made a patch for it. 
[PATCH] [RFC] btrfs: reada: avoid undone reada extents in btrfs_reada_wait Because it is only a logic problem and happens rarely, I have only confirmed that nothing breaks with the patch applied. Sorry for the extra work; could you apply this patch and test whether it fixes the problem? Thanks Zhaolei > -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-22 12:25 ` Zhao Lei @ 2016-01-22 14:19 ` Chris Mason 2016-01-26 9:08 ` Zhao Lei 2016-01-28 7:49 ` Zhao Lei 0 siblings, 2 replies; 12+ messages in thread From: Chris Mason @ 2016-01-22 14:19 UTC (permalink / raw) To: Zhao Lei; +Cc: linux-btrfs On Fri, Jan 22, 2016 at 08:25:56PM +0800, Zhao Lei wrote: > Hi, Chris Mason > > > -----Original Message----- > > From: Chris Mason [mailto:clm@fb.com] > > Sent: Thursday, January 21, 2016 10:15 PM > > To: Zhao Lei <zhaolei@cn.fujitsu.com> > > Cc: linux-btrfs@vger.kernel.org > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > On Thu, Jan 21, 2016 at 06:06:21PM +0800, Zhao Lei wrote: > > > Hi, Chris Mason > > > > > > > -----Original Message----- > > > > From: Chris Mason [mailto:clm@fb.com] > > > > Sent: Thursday, January 21, 2016 1:48 AM > > > > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > > > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > > > > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > > > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > > > > > In case of a tree having many levels, the number of created > > > > > > works is 2^level_of_tree. > > > > > > Actually we don't need so many works in parallel, this patch > > > > > > limit max works to BTRFS_MAX_MIRRORS * 2. > > > > > > > > > > Hi, > > > > > > > > > > I don't think you end up calling atomic_dec() for every time that > > > > > reada_start_machine() is called. Also, I'd rather not have a > > > > > global static variable to limit the parallel workers, when we have > > > > > more than one FS mounted it'll end up limiting things too much. > > > > > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > > > > have to run the scrub tests as well, basically we're just getting > > > > > fsstress run alongside scrub. > > > > > > > > > > I'll run a few more times with it reverted to make sure, but I > > > > > think it's the root cause. > > > > > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > > > > > In logic, even if the calculation of atomic_dec() in this patch having > > > bug, in worst condition, reada will works in single-thread mode, and > > > will not introduce deadlock. > > > > > > And by looking the backtrace in this mail, maybe it is caused by > > > reada_control->elems in someplace of this patchset. > > > > > > I recheck xfstests/066 in both vm and physical machine, on top of my > > > pull-request git today, with btrfs-progs 4.4 for many times, but had not > > triggered the bug. > > > > Just running 066 alone doesn't trigger it for me. I have to run everything from > > 00->066. > > > > My setup is 5 drives. I use a script to carve them up into logical volumes, 5 for > > the test device and 5 for the scratch pool. I think it should reproduce with a > > single drive, if you still can't trigger I'll confirm that. > > > > > > > > Could you tell me your test environment(TEST_DEV size, mount option), > > > and odds of fails in btrfs/066? > > > > 100% odds of failing, one time it made it up to btrfs/072. I think more > > important than the drive setup is that I have all the debugging on. > > CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and lock > > dep enabled. > > > Thanks for your answer. > > But unfortunately I hadn't reproduce the dead_lock in above way today... 
> Now I queued loop of above reproduce script in more nodes, and hopes > it can happen in this weekend. > > And by reviewing code, I found a problem which can introduce similar bad result > in logic, and made a patch for it. > [PATCH] [RFC] btrfs: reada: avoid undone reada extents in btrfs_reada_wait > > Because it is only a problem in logic, but rarely happened, I only confirmed > no-problem after patch applied. > > Sorry for increased your works, could you apply this patch and test is it > works? No problem, I'll try the patch and see if I can get a more reliable way to reproduce if it doesn't fix things. Thanks! -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-22 14:19 ` Chris Mason @ 2016-01-26 9:08 ` Zhao Lei 2016-01-28 7:49 ` Zhao Lei 1 sibling, 0 replies; 12+ messages in thread From: Zhao Lei @ 2016-01-26 9:08 UTC (permalink / raw) To: 'Chris Mason'; +Cc: linux-btrfs Hi, Chris Mason > -----Original Message----- > From: Chris Mason [mailto:clm@fb.com] > Sent: Friday, January 22, 2016 10:19 PM > To: Zhao Lei <zhaolei@cn.fujitsu.com> > Cc: linux-btrfs@vger.kernel.org > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > On Fri, Jan 22, 2016 at 08:25:56PM +0800, Zhao Lei wrote: > > Hi, Chris Mason > > > > > -----Original Message----- > > > From: Chris Mason [mailto:clm@fb.com] > > > Sent: Thursday, January 21, 2016 10:15 PM > > > To: Zhao Lei <zhaolei@cn.fujitsu.com> > > > Cc: linux-btrfs@vger.kernel.org > > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > > > On Thu, Jan 21, 2016 at 06:06:21PM +0800, Zhao Lei wrote: > > > > Hi, Chris Mason > > > > > > > > > -----Original Message----- > > > > > From: Chris Mason [mailto:clm@fb.com] > > > > > Sent: Thursday, January 21, 2016 1:48 AM > > > > > To: Zhao Lei <zhaolei@cn.fujitsu.com>; > > > > > linux-btrfs@vger.kernel.org > > > > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > > > > > > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > > > > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > > > > > > > In case of a tree having many levels, the number of created > > > > > > > works is 2^level_of_tree. > > > > > > > Actually we don't need so many works in parallel, this patch > > > > > > > limit max works to BTRFS_MAX_MIRRORS * 2. > > > > > > > > > > > > Hi, > > > > > > > > > > > > I don't think you end up calling atomic_dec() for every time > > > > > > that > > > > > > reada_start_machine() is called. Also, I'd rather not have a > > > > > > global static variable to limit the parallel workers, when we > > > > > > have more than one FS mounted it'll end up limiting things too much. > > > > > > > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. > You > > > > > > have to run the scrub tests as well, basically we're just > > > > > > getting fsstress run alongside scrub. > > > > > > > > > > > > I'll run a few more times with it reverted to make sure, but I > > > > > > think it's the root cause. > > > > > > > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > > > > > > > In logic, even if the calculation of atomic_dec() in this patch > > > > having bug, in worst condition, reada will works in single-thread > > > > mode, and will not introduce deadlock. > > > > > > > > And by looking the backtrace in this mail, maybe it is caused by > > > > reada_control->elems in someplace of this patchset. > > > > > > > > I recheck xfstests/066 in both vm and physical machine, on top of > > > > my pull-request git today, with btrfs-progs 4.4 for many times, > > > > but had not > > > triggered the bug. > > > > > > Just running 066 alone doesn't trigger it for me. I have to run > > > everything from > > > 00->066. > > > > > > My setup is 5 drives. I use a script to carve them up into logical > > > volumes, 5 for the test device and 5 for the scratch pool. I think > > > it should reproduce with a single drive, if you still can't trigger I'll confirm > that. 
> > > > > > > > > Could you tell me your test environment(TEST_DEV size, mount > > > > option), and odds of fails in btrfs/066? > > > > > > 100% odds of failing, one time it made it up to btrfs/072. I think > > > more important than the drive setup is that I have all the debugging on. > > > CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and > lock > > > dep enabled. > > > > > Thanks for your answer. > > > > But unfortunately I hadn't reproduce the dead_lock in above way today... > > Now I queued loop of above reproduce script in more nodes, and hopes > > it can happen in this weekend. > > > > And by reviewing code, I found a problem which can introduce similar > > bad result in logic, and made a patch for it. > > [PATCH] [RFC] btrfs: reada: avoid undone reada extents in > > btrfs_reada_wait > > > > Because it is only a problem in logic, but rarely happened, I only > > confirmed no-problem after patch applied. > > > > Sorry for increased your works, could you apply this patch and test is > > it works? > > No problem, I'll try the patch and see if I can get a more reliable way to > reproduce if it doesn't fix things. Thanks! > Thanks for your effective help. I reproduced the bug on one of my nodes and found the root cause. 1: The reada background threads are not designed to complete all outstanding work, as described earlier in this mail, plus the additional case below. 2: For the DUP profile, the current code creates 2 zones, and one of the zones is a "dummy" (we only read the first stripe for DUP). When the "dummy" zone is selected, the current code skips the read and just does cleanup. It returns without re-selecting a zone in this case, to keep the logic simple, and that makes the reada thread more likely to exit. So, in the DUP case, more background threads exit before all the work is done. That is why btrfs/066 always hangs with the DUP profile. 3: This problem exists in the old code too, but rarely happened; my patchset makes it much more likely because: a. Limited background thread count PATCH: btrfs: reada: limit max works count In the old code there were more background threads, and if one of them exited, the remaining threads would continue with the remaining extents. b. Reduced thread lifetime PATCH: btrfs: reada: Avoid many times of empty loop The lifetime of each thread is reduced, which widens the window in which no thread is running. Fix: We have the following options for this problem: a. Do not add the dummy zone for DUP. It would reduce the odds of hitting the problem, but because of the "device workload limit" (MAX_IN_FLIGHT) and the "total reads limit" in the code, the problem could still occur in rare cases. b. Let the reada background threads finish all work before exiting. It conflicts with the limiting design in [a]. c. Check in btrfs_reada_wait() that we have at least one thread running. It fixes the problem completely. So I will fix the problem by way "c", based on: [RFC] btrfs: reada: avoid undone reada extents in btrfs_reada_wait with some enhancements. I'll make the fix and test it. Thanks Zhaolei > -chris > ^ permalink raw reply [flat|nested] 12+ messages in thread
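To make option "c" above concrete, here is one possible shape for the check, along the lines of the RFC patch named earlier. It assumes a per-filesystem count of in-flight readahead workers (such as the reada_works_cnt field sketched earlier in the thread); the names and the poll interval are illustrative, not the final fix.

int btrfs_reada_wait(void *handle)
{
	struct reada_control *rc = handle;
	struct btrfs_fs_info *fs_info = rc->root->fs_info;

	while (atomic_read(&rc->elems)) {
		/*
		 * If every background worker has already exited, kick a
		 * new one so the remaining extents still get processed.
		 */
		if (!atomic_read(&fs_info->reada_works_cnt))
			reada_start_machine(fs_info);
		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
				   (HZ + 9) / 10);
	}

	kref_put(&rc->refcnt, reada_control_release);

	return 0;
}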
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-22 14:19 ` Chris Mason 2016-01-26 9:08 ` Zhao Lei @ 2016-01-28 7:49 ` Zhao Lei 2016-01-28 13:30 ` Chris Mason 1 sibling, 1 reply; 12+ messages in thread From: Zhao Lei @ 2016-01-28 7:49 UTC (permalink / raw) To: 'Chris Mason'; +Cc: linux-btrfs Hi, Chris Mason > > > > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > > > > > > > In case of a tree having many levels, the number of created > > > > > > > works is 2^level_of_tree. > > > > > > > Actually we don't need so many works in parallel, this patch > > > > > > > limit max works to BTRFS_MAX_MIRRORS * 2. > > > > > > > > > > > > Hi, > > > > > > > > > > > > I don't think you end up calling atomic_dec() for every time > > > > > > that > > > > > > reada_start_machine() is called. Also, I'd rather not have a > > > > > > global static variable to limit the parallel workers, when we > > > > > > have more than one FS mounted it'll end up limiting things too much. > > > > > > > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. > You > > > > > > have to run the scrub tests as well, basically we're just > > > > > > getting fsstress run alongside scrub. > > > > > > > > > > > > I'll run a few more times with it reverted to make sure, but I > > > > > > think it's the root cause. > > > > > > > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > > > > > > > In logic, even if the calculation of atomic_dec() in this patch > > > > having bug, in worst condition, reada will works in single-thread > > > > mode, and will not introduce deadlock. > > > > > > > > And by looking the backtrace in this mail, maybe it is caused by > > > > reada_control->elems in someplace of this patchset. > > > > > > > > I recheck xfstests/066 in both vm and physical machine, on top of > > > > my pull-request git today, with btrfs-progs 4.4 for many times, > > > > but had not > > > triggered the bug. > > > > > > Just running 066 alone doesn't trigger it for me. I have to run > > > everything from > > > 00->066. > > > > > > My setup is 5 drives. I use a script to carve them up into logical > > > volumes, 5 for the test device and 5 for the scratch pool. I think > > > it should reproduce with a single drive, if you still can't trigger I'll confirm > that. > > > > > > > > > > > Could you tell me your test environment(TEST_DEV size, mount > > > > option), and odds of fails in btrfs/066? > > > > > > 100% odds of failing, one time it made it up to btrfs/072. I think > > > more important than the drive setup is that I have all the debugging on. > > > CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and > lock > > > dep enabled. > > > > > Thanks for your answer. > > > > But unfortunately I hadn't reproduce the dead_lock in above way today... > > Now I queued loop of above reproduce script in more nodes, and hopes > > it can happen in this weekend. > > > > And by reviewing code, I found a problem which can introduce similar > > bad result in logic, and made a patch for it. > > [PATCH] [RFC] btrfs: reada: avoid undone reada extents in > > btrfs_reada_wait > > > > Because it is only a problem in logic, but rarely happened, I only > > confirmed no-problem after patch applied. > > > > Sorry for increased your works, could you apply this patch and test is > > it works? > > No problem, I'll try the patch and see if I can get a more reliable way to > reproduce if it doesn't fix things. Thanks! 
> I rebased the following branch: https://github.com/zhaoleidd/btrfs.git integration-4.5 with an updated patch to fix the btrfs/066 bug. The cause of the bug is described in the changelog of: btrfs: reada: avoid undone reada extents in btrfs_reada_wait Test: 1: On the node that can reproduce the btrfs/066 bug, confirmed HAVING_BUG before the patch and NO_BUG after the patch. 2: Ran the xfstests btrfs group, confirmed no regression. Most patches in this branch are for reada, except this one for the NO_SPACE bug: btrfs: Continue write in case of can_not_nocow Could you consider merging it at a suitable time? Thanks Zhaolei > -chris > ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-28 7:49 ` Zhao Lei @ 2016-01-28 13:30 ` Chris Mason 0 siblings, 0 replies; 12+ messages in thread From: Chris Mason @ 2016-01-28 13:30 UTC (permalink / raw) To: Zhao Lei; +Cc: linux-btrfs On Thu, Jan 28, 2016 at 03:49:54PM +0800, Zhao Lei wrote: > I rebased following branch: > https://github.com/zhaoleidd/btrfs.git integration-4.5 > > With updated patch to fix btrfs/066 bug. > Bug reason is descripted in changelog of: > btrfs: reada: avoid undone reada extents in btrfs_reada_wait > > Test: > 1: In the node which can repgoduce btrfs/066 bug, > Confirmed HAVING_BUG before patch, and NO_BUG after patch. > 2: Run xfstests's btrfs group, confirmed no regression. > > Most patchs in this branch are for reada, except this one for NO_SPACE bug: > btrfs: Continue write in case of can_not_nocow > > Cound you consider merging it in suitable time? Thanks for tracking all of this down, I'll take a look. -chris ^ permalink raw reply [flat|nested] 12+ messages in thread