* [PATCH 1/2] btrfs: reada: limit max works count @ 2016-01-12 7:46 Zhao Lei 2016-01-12 7:46 ` [PATCH 2/2] btrfs: reada: simplify dev->reada_in_flight processing Zhao Lei 2016-01-20 15:16 ` [PATCH 1/2] btrfs: reada: limit max works count Chris Mason 0 siblings, 2 replies; 12+ messages in thread From: Zhao Lei @ 2016-01-12 7:46 UTC (permalink / raw) To: linux-btrfs; +Cc: Zhao Lei reada create 2 works for each level of tree in recursion. In case of a tree having many levels, the number of created works is 2^level_of_tree. Actually we don't need so many works in parallel, this patch limit max works to BTRFS_MAX_MIRRORS * 2. Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> --- fs/btrfs/reada.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 53ee7b1..7b150b2 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -103,6 +103,9 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info); static int reada_add_block(struct reada_control *rc, u64 logical, struct btrfs_key *top, u64 generation); +/* To limit max reada works */ +static atomic_t works_cnt = ATOMIC_INIT(0); + /* recurses */ /* in case of err, eb might be NULL */ static void __readahead_hook(struct btrfs_fs_info *fs_info, @@ -759,6 +762,8 @@ static void reada_start_machine_worker(struct btrfs_work *work) set_task_ioprio(current, BTRFS_IOPRIO_READA); __reada_start_machine(fs_info); set_task_ioprio(current, old_ioprio); + + atomic_dec(&works_cnt); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) @@ -790,8 +795,11 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) * enqueue to workers to finish it. This will distribute the load to * the cores. */ - for (i = 0; i < 2; ++i) + for (i = 0; i < 2; ++i) { reada_start_machine(fs_info); + if (atomic_read(&works_cnt) > BTRFS_MAX_MIRRORS * 2) + break; + } } static void reada_start_machine(struct btrfs_fs_info *fs_info) @@ -808,6 +816,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); + atomic_inc(&works_cnt); } #ifdef DEBUG -- 1.8.5.1 ^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 2/2] btrfs: reada: simplify dev->reada_in_flight processing 2016-01-12 7:46 [PATCH 1/2] btrfs: reada: limit max works count Zhao Lei @ 2016-01-12 7:46 ` Zhao Lei 2016-01-20 15:16 ` [PATCH 1/2] btrfs: reada: limit max works count Chris Mason 1 sibling, 0 replies; 12+ messages in thread From: Zhao Lei @ 2016-01-12 7:46 UTC (permalink / raw) To: linux-btrfs; +Cc: Zhao Lei No need to decrease dev->reada_in_flight in __readahead_hook()'s internal and reada_extent_put(). reada_extent_put() have no chance to decrease dev->reada_in_flight in free operation, because reada_extent have additional refcnt when scheduled to a dev. We can put inc and dec operation for dev->reada_in_flight to one place instead to make logic simple and safe, and move useless reada_extent->scheduled_for to a bool flag instead. Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> --- fs/btrfs/reada.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 7b150b2..87c211a 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -72,7 +72,7 @@ struct reada_extent { spinlock_t lock; struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; - struct btrfs_device *scheduled_for; + int scheduled; }; struct reada_zone { @@ -118,7 +118,6 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, u64 bytenr; u64 generation; struct list_head list; - struct btrfs_device *for_dev; if (eb) level = btrfs_header_level(eb); @@ -129,8 +128,7 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, * don't need the lock anymore */ list_replace_init(&re->extctl, &list); - for_dev = re->scheduled_for; - re->scheduled_for = NULL; + re->scheduled = 0; spin_unlock(&re->lock); /* @@ -215,9 +213,6 @@ cleanup: reada_extent_put(fs_info, re); /* one ref for each entry */ } - if (for_dev) - atomic_dec(&for_dev->reada_in_flight); - return; } @@ -543,8 +538,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - if (re->scheduled_for) - atomic_dec(&re->scheduled_for->reada_in_flight); kfree(re); } @@ -710,12 +703,12 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); spin_lock(&re->lock); - if (re->scheduled_for || list_empty(&re->extctl)) { + if (re->scheduled || list_empty(&re->extctl)) { spin_unlock(&re->lock); reada_extent_put(fs_info, re); return 0; } - re->scheduled_for = dev; + re->scheduled = 1; spin_unlock(&re->lock); /* @@ -740,6 +733,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, if (eb) free_extent_buffer(eb); + atomic_dec(&dev->reada_in_flight); reada_extent_put(fs_info, re); return 1; @@ -864,10 +858,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) if (ret == 0) break; printk(KERN_DEBUG - " re: logical %llu size %u empty %d for %lld", + " re: logical %llu size %u empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), re->scheduled_for ? 
- re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", @@ -894,15 +887,14 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) index, 1); if (ret == 0) break; - if (!re->scheduled_for) { + if (!re->scheduled) { index = (re->logical >> PAGE_CACHE_SHIFT) + 1; continue; } printk(KERN_DEBUG - "re: logical %llu size %u list empty %d for %lld", + "re: logical %llu size %u list empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), - re->scheduled_for ? re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", re->zones[i]->start, -- 1.8.5.1 ^ permalink raw reply related [flat|nested] 12+ messages in thread
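One note that may help readers follow the accounting change: after this patch, the increment and the matching decrement of dev->reada_in_flight are meant to live in a single function, reada_start_machine_dev(), around the read itself; __readahead_hook() and reada_extent_put() no longer touch the counter. The fragment below is a simplified sketch reconstructed from the hunks above, not a verbatim copy of the function.

	/* sketch: tail of reada_start_machine_dev() after this patch */
	atomic_inc(&dev->reada_in_flight);	/* pre-existing increment */
	ret = reada_tree_block_flagged(...);	/* issue the readahead */
	/*
	 * __readahead_hook() (called from here on error, or from the
	 * end_io path on completion) clears re->scheduled but no longer
	 * touches reada_in_flight.
	 */
	if (eb)
		free_extent_buffer(eb);
	atomic_dec(&dev->reada_in_flight);	/* added by this patch */
	reada_extent_put(fs_info, re);
	return 1;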
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-12 7:46 [PATCH 1/2] btrfs: reada: limit max works count Zhao Lei 2016-01-12 7:46 ` [PATCH 2/2] btrfs: reada: simplify dev->reada_in_flight processing Zhao Lei @ 2016-01-20 15:16 ` Chris Mason 2016-01-20 17:48 ` Chris Mason 1 sibling, 1 reply; 12+ messages in thread From: Chris Mason @ 2016-01-20 15:16 UTC (permalink / raw) To: Zhao Lei; +Cc: linux-btrfs On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > reada create 2 works for each level of tree in recursion. > > In case of a tree having many levels, the number of created works > is 2^level_of_tree. > Actually we don't need so many works in parallel, this patch limit > max works to BTRFS_MAX_MIRRORS * 2. Hi, I don't think you end up calling atomic_dec() for every time that reada_start_machine() is called. Also, I'd rather not have a global static variable to limit the parallel workers, when we have more than one FS mounted it'll end up limiting things too much. With this patch applied, I'm seeing deadlocks during btrfs/066. You have to run the scrub tests as well, basically we're just getting fsstress run alongside scrub. I'll run a few more times with it reverted to make sure, but I think it's the root cause. ----- stack summary 6 hits: [<ffffffff813ec92a>] wait_current_trans+0xca/0x140 [<ffffffff813ee248>] start_transaction+0x278/0x5b0 [<ffffffff813eea97>] btrfs_attach_transaction_barrier+0x27/0x60 [<ffffffff813b4835>] btrfs_sync_fs+0x85/0x1d0 [<ffffffff8122bcf0>] sync_fs_one_sb+0x20/0x30 [<ffffffff811f579f>] iterate_supers+0xaf/0xe0 [<ffffffff8122c1e5>] sys_sync+0x55/0x90 [<ffffffff819c00c7>] tracesys_phase2+0x84/0x89 [<ffffffffffffffff>] 0xffffffffffffffff ----- 1 hit: [<ffffffff813ec92a>] wait_current_trans+0xca/0x140 [<ffffffff813ee248>] start_transaction+0x278/0x5b0 [<ffffffff813ee597>] btrfs_attach_transaction+0x17/0x20 [<ffffffff813e6b27>] transaction_kthread+0x1b7/0x290 [<ffffffff81082e09>] kthread+0xe9/0x110 [<ffffffff819c02ff>] ret_from_fork+0x3f/0x70 [<ffffffffffffffff>] 0xffffffffffffffff ----- [<ffffffff814506cf>] btrfs_scrub_pause+0xdf/0x150 [<ffffffff813ed2f4>] btrfs_commit_transaction+0x3b4/0xc70 [<ffffffff81424724>] create_subvol+0x504/0x8d0 [<ffffffff81424c63>] btrfs_mksubvol+0x173/0x510 [<ffffffff8142511e>] btrfs_ioctl_snap_create_transid+0x11e/0x1a0 [<ffffffff814251fe>] btrfs_ioctl_snap_create+0x5e/0x80 [<ffffffff8142dbbb>] btrfs_ioctl+0xc6b/0x1190 [<ffffffff8120624a>] do_vfs_ioctl+0x8a/0x560 [<ffffffff812067b2>] SyS_ioctl+0x92/0xa0 [<ffffffff819c00c7>] tracesys_phase2+0x84/0x89 [<ffffffffffffffff>] 0xffffffffffffffff ----- [<ffffffff81458d36>] btrfs_reada_wait+0x86/0xf0 [<ffffffff81456dc4>] scrub_stripe+0x274/0x1180 [<ffffffff81457de9>] scrub_chunk+0x119/0x160 [<ffffffff814581b7>] scrub_enumerate_chunks+0x387/0x730 [<ffffffff81458740>] btrfs_scrub_dev+0x1e0/0x620 [<ffffffff8142b7d1>] btrfs_ioctl_scrub+0xb1/0x120 [<ffffffff8142d970>] btrfs_ioctl+0xa20/0x1190 [<ffffffff8120624a>] do_vfs_ioctl+0x8a/0x560 [<ffffffff812067b2>] SyS_ioctl+0x92/0xa0 [<ffffffff819c00c7>] tracesys_phase2+0x84/0x89 [<ffffffffffffffff>] 0xffffffffffffffff ^ permalink raw reply [flat|nested] 12+ messages in thread
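On the point above about a file-scope static being too coarse when several filesystems are mounted: below is a minimal sketch of a per-filesystem variant, with the counter kept in btrfs_fs_info so each mount is throttled independently. The field name and exact placement are illustrative, not taken from the posted patch.

	/* sketch: per-fs counter instead of the global works_cnt */
	struct btrfs_fs_info {
		/* ... existing fields ... */
		atomic_t reada_works_cnt;	/* in-flight readahead workers */
	};

	/* where a worker is queued, in reada_start_machine(): */
	btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
	atomic_inc(&fs_info->reada_works_cnt);

	/* at the end of reada_start_machine_worker(): */
	atomic_dec(&fs_info->reada_works_cnt);

	/* and the limit check in __reada_start_machine(): */
	if (atomic_read(&fs_info->reada_works_cnt) > BTRFS_MAX_MIRRORS * 2)
		break;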
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-20 15:16 ` [PATCH 1/2] btrfs: reada: limit max works count Chris Mason @ 2016-01-20 17:48 ` Chris Mason 2016-01-21 3:36 ` Zhao Lei 2016-01-21 10:06 ` Zhao Lei 0 siblings, 2 replies; 12+ messages in thread From: Chris Mason @ 2016-01-20 17:48 UTC (permalink / raw) To: Zhao Lei, linux-btrfs On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > reada create 2 works for each level of tree in recursion. > > > > In case of a tree having many levels, the number of created works > > is 2^level_of_tree. > > Actually we don't need so many works in parallel, this patch limit > > max works to BTRFS_MAX_MIRRORS * 2. > > Hi, > > I don't think you end up calling atomic_dec() for every time that > reada_start_machine() is called. Also, I'd rather not have a global > static variable to limit the parallel workers, when we have more than > one FS mounted it'll end up limiting things too much. > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > have to run the scrub tests as well, basically we're just getting > fsstress run alongside scrub. > > I'll run a few more times with it reverted to make sure, but I think > it's the root cause. I spoke too soon, it ended up deadlocking a few tests later. Sorry for now I'm pulling all the reada patches. We'll sort out bug fixes vs cleanups in later rcs. With all of the reada patches removed, the deadlocks are gone. -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-20 17:48 ` Chris Mason @ 2016-01-21 3:36 ` Zhao Lei 2016-01-21 10:06 ` Zhao Lei 1 sibling, 0 replies; 12+ messages in thread From: Zhao Lei @ 2016-01-21 3:36 UTC (permalink / raw) To: 'Chris Mason', linux-btrfs > -----Original Message----- > From: Chris Mason [mailto:clm@fb.com] > Sent: Thursday, January 21, 2016 1:48 AM > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > reada create 2 works for each level of tree in recursion. > > > > > > In case of a tree having many levels, the number of created works is > > > 2^level_of_tree. > > > Actually we don't need so many works in parallel, this patch limit > > > max works to BTRFS_MAX_MIRRORS * 2. > > > > Hi, > > > > I don't think you end up calling atomic_dec() for every time that > > reada_start_machine() is called. Also, I'd rather not have a global > > static variable to limit the parallel workers, when we have more than > > one FS mounted it'll end up limiting things too much. > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > have to run the scrub tests as well, basically we're just getting > > fsstress run alongside scrub. > > > > I'll run a few more times with it reverted to make sure, but I think > > it's the root cause. > > I spoke too soon, it ended up deadlocking a few tests later. Sorry for now I'm > pulling all the reada patches. We'll sort out bug fixes vs cleanups in later rcs. > > With all of the reada patches removed, the deadlocks are gone. > Sorry to hear that. Actually, I ran xfstests with all patches applied and saw no regression in my environment: FSTYP -- btrfs PLATFORM -- Linux/x86_64 lenovo 4.4.0-rc6_HEAD_8e16378041f7f3531c256fd3e17a36a4fca92d29_+ MKFS_OPTIONS -- /dev/sdb6 MOUNT_OPTIONS -- /dev/sdb6 /var/ltf/tester/scratch_mnt btrfs/066 151s ... 164s Ran: btrfs/066 Passed all 1 tests I'll investigate the root cause. Thanks Zhaolei > -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-20 17:48 ` Chris Mason 2016-01-21 3:36 ` Zhao Lei @ 2016-01-21 10:06 ` Zhao Lei 2016-01-21 14:14 ` Chris Mason 1 sibling, 1 reply; 12+ messages in thread From: Zhao Lei @ 2016-01-21 10:06 UTC (permalink / raw) To: 'Chris Mason', linux-btrfs Hi, Chris Mason > -----Original Message----- > From: Chris Mason [mailto:clm@fb.com] > Sent: Thursday, January 21, 2016 1:48 AM > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > reada create 2 works for each level of tree in recursion. > > > > > > In case of a tree having many levels, the number of created works is > > > 2^level_of_tree. > > > Actually we don't need so many works in parallel, this patch limit > > > max works to BTRFS_MAX_MIRRORS * 2. > > > > Hi, > > > > I don't think you end up calling atomic_dec() for every time that > > reada_start_machine() is called. Also, I'd rather not have a global > > static variable to limit the parallel workers, when we have more than > > one FS mounted it'll end up limiting things too much. > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > have to run the scrub tests as well, basically we're just getting > > fsstress run alongside scrub. > > > > I'll run a few more times with it reverted to make sure, but I think > > it's the root cause. > > I spoke too soon, it ended up deadlocking a few tests later. > Logically, even if the atomic_dec() accounting in this patch has a bug, in the worst case reada would just fall back to single-threaded operation; it should not introduce a deadlock. And judging from the backtraces in this mail, the hang may be caused by the reada_control->elems handling somewhere in this patchset. I reran xfstests btrfs/066 many times on both a VM and a physical machine, on top of my pull-request tree as of today, with btrfs-progs 4.4, but could not trigger the bug. Could you tell me your test environment (TEST_DEV size, mount options) and the odds of btrfs/066 failing? Thanks Zhaolei > Sorry for now I'm pulling all the reada patches. We'll sort out bug fixes vs cleanups in later rcs. > > With all of the reada patches removed, the deadlocks are gone. > > -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-21 10:06 ` Zhao Lei @ 2016-01-21 14:14 ` Chris Mason 2016-01-22 12:25 ` Zhao Lei 0 siblings, 1 reply; 12+ messages in thread From: Chris Mason @ 2016-01-21 14:14 UTC (permalink / raw) To: Zhao Lei; +Cc: linux-btrfs On Thu, Jan 21, 2016 at 06:06:21PM +0800, Zhao Lei wrote: > Hi, Chris Mason > > > -----Original Message----- > > From: Chris Mason [mailto:clm@fb.com] > > Sent: Thursday, January 21, 2016 1:48 AM > > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > In case of a tree having many levels, the number of created works is > > > > 2^level_of_tree. > > > > Actually we don't need so many works in parallel, this patch limit > > > > max works to BTRFS_MAX_MIRRORS * 2. > > > > > > Hi, > > > > > > I don't think you end up calling atomic_dec() for every time that > > > reada_start_machine() is called. Also, I'd rather not have a global > > > static variable to limit the parallel workers, when we have more than > > > one FS mounted it'll end up limiting things too much. > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > > have to run the scrub tests as well, basically we're just getting > > > fsstress run alongside scrub. > > > > > > I'll run a few more times with it reverted to make sure, but I think > > > it's the root cause. > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > In logic, even if the calculation of atomic_dec() in this patch having bug, > in worst condition, reada will works in single-thread mode, and will not > introduce deadlock. > > And by looking the backtrace in this mail, maybe it is caused by > reada_control->elems in someplace of this patchset. > > I recheck xfstests/066 in both vm and physical machine, on top of my pull-request > git today, with btrfs-progs 4.4 for many times, but had not triggered the bug. Just running 066 alone doesn't trigger it for me. I have to run everything from 00->066. My setup is 5 drives. I use a script to carve them up into logical volumes, 5 for the test device and 5 for the scratch pool. I think it should reproduce with a single drive, if you still can't trigger I'll confirm that. > > Could you tell me your test environment(TEST_DEV size, mount option), > and odds of fails in btrfs/066? 100% odds of failing, one time it made it up to btrfs/072. I think more important than the drive setup is that I have all the debugging on. CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and lock dep enabled. -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-21 14:14 ` Chris Mason @ 2016-01-22 12:25 ` Zhao Lei 2016-01-22 14:19 ` Chris Mason 0 siblings, 1 reply; 12+ messages in thread From: Zhao Lei @ 2016-01-22 12:25 UTC (permalink / raw) To: 'Chris Mason'; +Cc: linux-btrfs Hi, Chris Mason > -----Original Message----- > From: Chris Mason [mailto:clm@fb.com] > Sent: Thursday, January 21, 2016 10:15 PM > To: Zhao Lei <zhaolei@cn.fujitsu.com> > Cc: linux-btrfs@vger.kernel.org > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > On Thu, Jan 21, 2016 at 06:06:21PM +0800, Zhao Lei wrote: > > Hi, Chris Mason > > > > > -----Original Message----- > > > From: Chris Mason [mailto:clm@fb.com] > > > Sent: Thursday, January 21, 2016 1:48 AM > > > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > > > In case of a tree having many levels, the number of created > > > > > works is 2^level_of_tree. > > > > > Actually we don't need so many works in parallel, this patch > > > > > limit max works to BTRFS_MAX_MIRRORS * 2. > > > > > > > > Hi, > > > > > > > > I don't think you end up calling atomic_dec() for every time that > > > > reada_start_machine() is called. Also, I'd rather not have a > > > > global static variable to limit the parallel workers, when we have > > > > more than one FS mounted it'll end up limiting things too much. > > > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > > > have to run the scrub tests as well, basically we're just getting > > > > fsstress run alongside scrub. > > > > > > > > I'll run a few more times with it reverted to make sure, but I > > > > think it's the root cause. > > > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > > > In logic, even if the calculation of atomic_dec() in this patch having > > bug, in worst condition, reada will works in single-thread mode, and > > will not introduce deadlock. > > > > And by looking the backtrace in this mail, maybe it is caused by > > reada_control->elems in someplace of this patchset. > > > > I recheck xfstests/066 in both vm and physical machine, on top of my > > pull-request git today, with btrfs-progs 4.4 for many times, but had not > triggered the bug. > > Just running 066 alone doesn't trigger it for me. I have to run everything from > 00->066. > > My setup is 5 drives. I use a script to carve them up into logical volumes, 5 for > the test device and 5 for the scratch pool. I think it should reproduce with a > single drive, if you still can't trigger I'll confirm that. > > > > > Could you tell me your test environment(TEST_DEV size, mount option), > > and odds of fails in btrfs/066? > > 100% odds of failing, one time it made it up to btrfs/072. I think more > important than the drive setup is that I have all the debugging on. > CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and lock > dep enabled. > Thanks for your answer. But unfortunately I hadn't reproduce the dead_lock in above way today... Now I queued loop of above reproduce script in more nodes, and hopes it can happen in this weekend. And by reviewing code, I found a problem which can introduce similar bad result in logic, and made a patch for it. 
[PATCH] [RFC] btrfs: reada: avoid undone reada extents in btrfs_reada_wait Because it is only a logic problem and happens rarely, I have only confirmed that nothing breaks with the patch applied. Sorry for the extra work; could you apply this patch and test whether it fixes the problem? Thanks Zhaolei > -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-22 12:25 ` Zhao Lei @ 2016-01-22 14:19 ` Chris Mason 2016-01-26 9:08 ` Zhao Lei 2016-01-28 7:49 ` Zhao Lei 0 siblings, 2 replies; 12+ messages in thread From: Chris Mason @ 2016-01-22 14:19 UTC (permalink / raw) To: Zhao Lei; +Cc: linux-btrfs On Fri, Jan 22, 2016 at 08:25:56PM +0800, Zhao Lei wrote: > Hi, Chris Mason > > > -----Original Message----- > > From: Chris Mason [mailto:clm@fb.com] > > Sent: Thursday, January 21, 2016 10:15 PM > > To: Zhao Lei <zhaolei@cn.fujitsu.com> > > Cc: linux-btrfs@vger.kernel.org > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > On Thu, Jan 21, 2016 at 06:06:21PM +0800, Zhao Lei wrote: > > > Hi, Chris Mason > > > > > > > -----Original Message----- > > > > From: Chris Mason [mailto:clm@fb.com] > > > > Sent: Thursday, January 21, 2016 1:48 AM > > > > To: Zhao Lei <zhaolei@cn.fujitsu.com>; linux-btrfs@vger.kernel.org > > > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > > > > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > > > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > > > > > In case of a tree having many levels, the number of created > > > > > > works is 2^level_of_tree. > > > > > > Actually we don't need so many works in parallel, this patch > > > > > > limit max works to BTRFS_MAX_MIRRORS * 2. > > > > > > > > > > Hi, > > > > > > > > > > I don't think you end up calling atomic_dec() for every time that > > > > > reada_start_machine() is called. Also, I'd rather not have a > > > > > global static variable to limit the parallel workers, when we have > > > > > more than one FS mounted it'll end up limiting things too much. > > > > > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. You > > > > > have to run the scrub tests as well, basically we're just getting > > > > > fsstress run alongside scrub. > > > > > > > > > > I'll run a few more times with it reverted to make sure, but I > > > > > think it's the root cause. > > > > > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > > > > > In logic, even if the calculation of atomic_dec() in this patch having > > > bug, in worst condition, reada will works in single-thread mode, and > > > will not introduce deadlock. > > > > > > And by looking the backtrace in this mail, maybe it is caused by > > > reada_control->elems in someplace of this patchset. > > > > > > I recheck xfstests/066 in both vm and physical machine, on top of my > > > pull-request git today, with btrfs-progs 4.4 for many times, but had not > > triggered the bug. > > > > Just running 066 alone doesn't trigger it for me. I have to run everything from > > 00->066. > > > > My setup is 5 drives. I use a script to carve them up into logical volumes, 5 for > > the test device and 5 for the scratch pool. I think it should reproduce with a > > single drive, if you still can't trigger I'll confirm that. > > > > > > > > Could you tell me your test environment(TEST_DEV size, mount option), > > > and odds of fails in btrfs/066? > > > > 100% odds of failing, one time it made it up to btrfs/072. I think more > > important than the drive setup is that I have all the debugging on. > > CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and lock > > dep enabled. > > > Thanks for your answer. > > But unfortunately I hadn't reproduce the dead_lock in above way today... 
> Now I queued loop of above reproduce script in more nodes, and hopes > it can happen in this weekend. > > And by reviewing code, I found a problem which can introduce similar bad result > in logic, and made a patch for it. > [PATCH] [RFC] btrfs: reada: avoid undone reada extents in btrfs_reada_wait > > Because it is only a problem in logic, but rarely happened, I only confirmed > no-problem after patch applied. > > Sorry for increased your works, could you apply this patch and test is it > works? No problem, I'll try the patch and see if I can get a more reliable way to reproduce if it doesn't fix things. Thanks! -chris ^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-22 14:19 ` Chris Mason @ 2016-01-26 9:08 ` Zhao Lei 2016-01-28 7:49 ` Zhao Lei 1 sibling, 0 replies; 12+ messages in thread From: Zhao Lei @ 2016-01-26 9:08 UTC (permalink / raw) To: 'Chris Mason'; +Cc: linux-btrfs Hi, Chris Mason > -----Original Message----- > From: Chris Mason [mailto:clm@fb.com] > Sent: Friday, January 22, 2016 10:19 PM > To: Zhao Lei <zhaolei@cn.fujitsu.com> > Cc: linux-btrfs@vger.kernel.org > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > On Fri, Jan 22, 2016 at 08:25:56PM +0800, Zhao Lei wrote: > > Hi, Chris Mason > > > > > -----Original Message----- > > > From: Chris Mason [mailto:clm@fb.com] > > > Sent: Thursday, January 21, 2016 10:15 PM > > > To: Zhao Lei <zhaolei@cn.fujitsu.com> > > > Cc: linux-btrfs@vger.kernel.org > > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > > > On Thu, Jan 21, 2016 at 06:06:21PM +0800, Zhao Lei wrote: > > > > Hi, Chris Mason > > > > > > > > > -----Original Message----- > > > > > From: Chris Mason [mailto:clm@fb.com] > > > > > Sent: Thursday, January 21, 2016 1:48 AM > > > > > To: Zhao Lei <zhaolei@cn.fujitsu.com>; > > > > > linux-btrfs@vger.kernel.org > > > > > Subject: Re: [PATCH 1/2] btrfs: reada: limit max works count > > > > > > > > > > On Wed, Jan 20, 2016 at 10:16:27AM -0500, Chris Mason wrote: > > > > > > On Tue, Jan 12, 2016 at 03:46:26PM +0800, Zhao Lei wrote: > > > > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > > > > > > > In case of a tree having many levels, the number of created > > > > > > > works is 2^level_of_tree. > > > > > > > Actually we don't need so many works in parallel, this patch > > > > > > > limit max works to BTRFS_MAX_MIRRORS * 2. > > > > > > > > > > > > Hi, > > > > > > > > > > > > I don't think you end up calling atomic_dec() for every time > > > > > > that > > > > > > reada_start_machine() is called. Also, I'd rather not have a > > > > > > global static variable to limit the parallel workers, when we > > > > > > have more than one FS mounted it'll end up limiting things too much. > > > > > > > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. > You > > > > > > have to run the scrub tests as well, basically we're just > > > > > > getting fsstress run alongside scrub. > > > > > > > > > > > > I'll run a few more times with it reverted to make sure, but I > > > > > > think it's the root cause. > > > > > > > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > > > > > > > In logic, even if the calculation of atomic_dec() in this patch > > > > having bug, in worst condition, reada will works in single-thread > > > > mode, and will not introduce deadlock. > > > > > > > > And by looking the backtrace in this mail, maybe it is caused by > > > > reada_control->elems in someplace of this patchset. > > > > > > > > I recheck xfstests/066 in both vm and physical machine, on top of > > > > my pull-request git today, with btrfs-progs 4.4 for many times, > > > > but had not > > > triggered the bug. > > > > > > Just running 066 alone doesn't trigger it for me. I have to run > > > everything from > > > 00->066. > > > > > > My setup is 5 drives. I use a script to carve them up into logical > > > volumes, 5 for the test device and 5 for the scratch pool. I think > > > it should reproduce with a single drive, if you still can't trigger I'll confirm > that. 
> > > > > > > > > Could you tell me your test environment(TEST_DEV size, mount > > > > option), and odds of fails in btrfs/066? > > > > > > 100% odds of failing, one time it made it up to btrfs/072. I think > > > more important than the drive setup is that I have all the debugging on. > > > CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and > lock > > > dep enabled. > > > > > Thanks for your answer. > > > > But unfortunately I hadn't reproduce the dead_lock in above way today... > > Now I queued loop of above reproduce script in more nodes, and hopes > > it can happen in this weekend. > > > > And by reviewing code, I found a problem which can introduce similar > > bad result in logic, and made a patch for it. > > [PATCH] [RFC] btrfs: reada: avoid undone reada extents in > > btrfs_reada_wait > > > > Because it is only a problem in logic, but rarely happened, I only > > confirmed no-problem after patch applied. > > > > Sorry for increased your works, could you apply this patch and test is > > it works? > > No problem, I'll try the patch and see if I can get a more reliable way to > reproduce if it doesn't fix things. Thanks! > Thanks for your effective help. I reproduced the bug on one of my nodes and found the root cause. 1: The reada background threads are not designed to complete all outstanding work, as described earlier in this mail, plus the additional case below. 2: For the DUP profile, the current code creates 2 zones, and one of the zones is a "dummy" (we only read the first stripe for DUP). When the "dummy" zone is selected, the current code skips the read and just does cleanup. It returns without re-selecting a zone in this case, to keep the logic simple, and that makes the reada thread more likely to exit. So, in the DUP case, more background threads exit before all the work is done. That is why btrfs/066 always hangs with the DUP profile. 3: This problem exists in the old code too, but rarely happened; my patchset makes it much more likely because: a. Limited background thread count PATCH: btrfs: reada: limit max works count In the old code there were more background threads, and if one of them exited, the remaining threads would continue with the remaining extents. b. Reduced thread lifetime PATCH: btrfs: reada: Avoid many times of empty loop The lifetime of each thread is reduced, which widens the window in which no thread is running. Fix: We have the following options for this problem: a. Do not add the dummy zone for DUP. It would reduce the odds of hitting the problem, but because of the "device workload limit" (MAX_IN_FLIGHT) and the "total reads limit" in the code, the problem could still occur in rare cases. b. Let the reada background threads finish all work before exiting. It conflicts with the limiting design in [a]. c. Check in btrfs_reada_wait() that we have at least one thread running. It fixes the problem completely. So I will fix the problem by way "c", based on: [RFC] btrfs: reada: avoid undone reada extents in btrfs_reada_wait with some enhancements. I'll make the fix and test it. Thanks Zhaolei > -chris > ^ permalink raw reply [flat|nested] 12+ messages in thread
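To make option "c" above concrete, here is one possible shape for the check, along the lines of the RFC patch named earlier. It assumes a per-filesystem count of in-flight readahead workers (such as the reada_works_cnt field sketched earlier in the thread); the names and the poll interval are illustrative, not the final fix.

int btrfs_reada_wait(void *handle)
{
	struct reada_control *rc = handle;
	struct btrfs_fs_info *fs_info = rc->root->fs_info;

	while (atomic_read(&rc->elems)) {
		/*
		 * If every background worker has already exited, kick a
		 * new one so the remaining extents still get processed.
		 */
		if (!atomic_read(&fs_info->reada_works_cnt))
			reada_start_machine(fs_info);
		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
				   (HZ + 9) / 10);
	}

	kref_put(&rc->refcnt, reada_control_release);

	return 0;
}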
* RE: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-22 14:19 ` Chris Mason 2016-01-26 9:08 ` Zhao Lei @ 2016-01-28 7:49 ` Zhao Lei 2016-01-28 13:30 ` Chris Mason 1 sibling, 1 reply; 12+ messages in thread From: Zhao Lei @ 2016-01-28 7:49 UTC (permalink / raw) To: 'Chris Mason'; +Cc: linux-btrfs Hi, Chris Mason > > > > > > > reada create 2 works for each level of tree in recursion. > > > > > > > > > > > > > > In case of a tree having many levels, the number of created > > > > > > > works is 2^level_of_tree. > > > > > > > Actually we don't need so many works in parallel, this patch > > > > > > > limit max works to BTRFS_MAX_MIRRORS * 2. > > > > > > > > > > > > Hi, > > > > > > > > > > > > I don't think you end up calling atomic_dec() for every time > > > > > > that > > > > > > reada_start_machine() is called. Also, I'd rather not have a > > > > > > global static variable to limit the parallel workers, when we > > > > > > have more than one FS mounted it'll end up limiting things too much. > > > > > > > > > > > > With this patch applied, I'm seeing deadlocks during btrfs/066. > You > > > > > > have to run the scrub tests as well, basically we're just > > > > > > getting fsstress run alongside scrub. > > > > > > > > > > > > I'll run a few more times with it reverted to make sure, but I > > > > > > think it's the root cause. > > > > > > > > > > I spoke too soon, it ended up deadlocking a few tests later. > > > > > > > > > In logic, even if the calculation of atomic_dec() in this patch > > > > having bug, in worst condition, reada will works in single-thread > > > > mode, and will not introduce deadlock. > > > > > > > > And by looking the backtrace in this mail, maybe it is caused by > > > > reada_control->elems in someplace of this patchset. > > > > > > > > I recheck xfstests/066 in both vm and physical machine, on top of > > > > my pull-request git today, with btrfs-progs 4.4 for many times, > > > > but had not > > > triggered the bug. > > > > > > Just running 066 alone doesn't trigger it for me. I have to run > > > everything from > > > 00->066. > > > > > > My setup is 5 drives. I use a script to carve them up into logical > > > volumes, 5 for the test device and 5 for the scratch pool. I think > > > it should reproduce with a single drive, if you still can't trigger I'll confirm > that. > > > > > > > > > > > Could you tell me your test environment(TEST_DEV size, mount > > > > option), and odds of fails in btrfs/066? > > > > > > 100% odds of failing, one time it made it up to btrfs/072. I think > > > more important than the drive setup is that I have all the debugging on. > > > CONFIG_DEBUG_PAGEALLOC, spinlock debugging, mutex debugging and > lock > > > dep enabled. > > > > > Thanks for your answer. > > > > But unfortunately I hadn't reproduce the dead_lock in above way today... > > Now I queued loop of above reproduce script in more nodes, and hopes > > it can happen in this weekend. > > > > And by reviewing code, I found a problem which can introduce similar > > bad result in logic, and made a patch for it. > > [PATCH] [RFC] btrfs: reada: avoid undone reada extents in > > btrfs_reada_wait > > > > Because it is only a problem in logic, but rarely happened, I only > > confirmed no-problem after patch applied. > > > > Sorry for increased your works, could you apply this patch and test is > > it works? > > No problem, I'll try the patch and see if I can get a more reliable way to > reproduce if it doesn't fix things. Thanks! 
> I rebased the following branch: https://github.com/zhaoleidd/btrfs.git integration-4.5 with an updated patch to fix the btrfs/066 bug. The cause of the bug is described in the changelog of: btrfs: reada: avoid undone reada extents in btrfs_reada_wait Test: 1: On the node that can reproduce the btrfs/066 bug, confirmed HAVING_BUG before the patch and NO_BUG after the patch. 2: Ran the xfstests btrfs group, confirmed no regression. Most patches in this branch are for reada, except this one for the NO_SPACE bug: btrfs: Continue write in case of can_not_nocow Could you consider merging it at a suitable time? Thanks Zhaolei > -chris > ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/2] btrfs: reada: limit max works count 2016-01-28 7:49 ` Zhao Lei @ 2016-01-28 13:30 ` Chris Mason 0 siblings, 0 replies; 12+ messages in thread From: Chris Mason @ 2016-01-28 13:30 UTC (permalink / raw) To: Zhao Lei; +Cc: linux-btrfs On Thu, Jan 28, 2016 at 03:49:54PM +0800, Zhao Lei wrote: > I rebased following branch: > https://github.com/zhaoleidd/btrfs.git integration-4.5 > > With updated patch to fix btrfs/066 bug. > Bug reason is descripted in changelog of: > btrfs: reada: avoid undone reada extents in btrfs_reada_wait > > Test: > 1: In the node which can repgoduce btrfs/066 bug, > Confirmed HAVING_BUG before patch, and NO_BUG after patch. > 2: Run xfstests's btrfs group, confirmed no regression. > > Most patchs in this branch are for reada, except this one for NO_SPACE bug: > btrfs: Continue write in case of can_not_nocow > > Cound you consider merging it in suitable time? Thanks for tracking all of this down, I'll take a look. -chris ^ permalink raw reply [flat|nested] 12+ messages in thread