From mboxrd@z Thu Jan 1 00:00:00 1970 From: Josef Bacik Subject: Re: stalls with latest btrfs merge into 3.0-rc2 Date: Mon, 13 Jun 2011 20:47:20 -0400 Message-ID: <4DF6AF98.2050800@redhat.com> References: <4DF67C06.4000307@sandia.gov> Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: 7bit Return-path: Received: from mx1.redhat.com ([209.132.183.28]:1025 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755536Ab1FNArV (ORCPT ); Mon, 13 Jun 2011 20:47:21 -0400 In-Reply-To: <4DF67C06.4000307@sandia.gov> Sender: ceph-devel-owner@vger.kernel.org List-ID: To: Jim Schutt Cc: linux-btrfs@vger.kernel.org, ceph-devel@vger.kernel.org On 06/13/2011 05:07 PM, Jim Schutt wrote: > Hi, > > On a system under a heavy write load from multiple ceph OSDs, > I'm running into the following hung tasks where btrfs is implicated. > I'm running commit 3c25fa740e2 from Linus' tree merged with > commit cb9b41c92fa from git://ceph.newdream.net/git/ceph-client.git. > > Let me know what else I can do to help sort this out. > > [ 961.318047] INFO: task kworker/1:2:2346 blocked for more than 120 > seconds. > [ 961.324993] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" > disables this message. > [ 961.332891] ffff8802253dbcd0 0000000000000046 ffff8802253dbcb0 > ffff88021c138000 > [ 961.340398] ffff880222c3dac0 ffff880222c3dac0 00000000000108c0 > ffff88021c138000 > [ 961.347893] ffff880222c3de80 0000000000000001 ffff8802253dbd00 > ffffffff813b0f69 > [ 961.355384] Call Trace: > [ 961.357838] [] schedule+0x164/0x19e > [ 961.363041] [] btrfs_start_ordered_extent+0xa8/0xc4 > [btrfs] > [ 961.370268] [] ? list_del_init+0x21/0x21 > [ 961.376075] [] btrfs_wait_ordered_extents+0xd8/0x143 > [btrfs] > [ 961.383387] [] btrfs_commit_transaction+0x20b/0x5a4 > [btrfs] > [ 961.390642] [] ? list_del_init+0x21/0x21 > [ 961.396284] [] do_async_commit+0x1f/0x2c [btrfs] > [ 961.402638] [] process_one_work+0x124/0x1e0 > [ 961.408478] [] ? > btrfs_commit_transaction+0x5a4/0x5a4 [btrfs] > [ 961.415917] [] ? destroy_workqueue+0x161/0x161 > [ 961.422155] [] worker_thread+0x8f/0x124 > [ 961.427642] [] kthread+0x72/0x7a > [ 961.432636] [] kernel_thread_helper+0x4/0x10 > [ 961.438561] [] ? retint_restore_args+0xe/0xe > [ 961.444530] [] ? kthread_bind+0x53/0x53 > [ 961.450100] [] ? gs_change+0xb/0xb > [ 961.455188] INFO: task btrfs-transacti:7653 blocked for more than 120 > seconds. > [ 961.462506] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" > disables this message. > [ 961.477512] ffff88021c24fd90 0000000000000046 ffff88021c24fd70 > ffff880190a65ac0 > [ 961.491735] ffff8802246cad60 ffff8802246cad60 00000000000108c0 > ffff880190a65ac0 > [ 961.505853] ffff8802246cb120 0000000000000001 ffff88021c24fdc0 > ffffffff813b0f69 > [ 961.521106] Call Trace: > [ 961.526052] [] schedule+0x164/0x19e > [ 961.531910] [] wait_current_trans+0xb8/0xec [btrfs] > [ 961.544101] [] ? list_del_init+0x21/0x21 > [ 961.550394] [] ? spin_lock+0xe/0x10 [btrfs] > [ 961.556471] [] start_transaction+0xd1/0x206 [btrfs] > [ 961.563023] [] ? > btree_readpage_end_io_hook+0x192/0x192 [btrfs] > [ 961.570597] [] btrfs_join_transaction+0x15/0x17 [btrfs] > [ 961.577609] [] transaction_kthread+0x154/0x22d [btrfs] > [ 961.584465] [] ? need_resched+0x23/0x2d > [ 961.589967] [] kthread+0x72/0x7a > [ 961.594843] [] kernel_thread_helper+0x4/0x10 > [ 961.600777] [] ? retint_restore_args+0xe/0xe > [ 961.606702] [] ? kthread_bind+0x53/0x53 > [ 961.612206] [] ? gs_change+0xb/0xb > [ 961.617322] INFO: task cosd:16719 blocked for more than 120 seconds. > [ 961.623702] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" > disables this message. > [ 961.631542] ffff8801bf163cb8 0000000000000086 ffff8801bf163c98 > ffff880226db16b0 > [ 961.639072] ffff8801df4f5ac0 ffff8801df4f5ac0 00000000000108c0 > ffff880226db16b0 > [ 961.646516] ffff8801df4f5e80 0000000000000001 ffff8801bf163ce8 > ffffffff813b0f69 > [ 961.653955] Call Trace: > [ 961.656483] [] schedule+0x164/0x19e > [ 961.661725] [] > wait_current_trans_commit_start_and_unblock+0xa9/0xce [btrfs] > [ 961.670461] [] ? list_del_init+0x21/0x21 > [ 961.676028] [] ? queue_work+0x1f/0x21 > [ 961.681436] [] > btrfs_commit_transaction_async+0xd3/0x115 [btrfs] > [ 961.689118] [] create_snapshot+0xe5/0x177 [btrfs] > [ 961.695546] [] btrfs_mksubvol+0xfa/0x167 [btrfs] > [ 961.702416] [] > btrfs_ioctl_snap_create_transid+0xff/0x121 [btrfs] > [ 961.710454] [] btrfs_ioctl_snap_create_v2+0x88/0xea > [btrfs] > [ 961.718000] [] btrfs_ioctl+0x208/0x358 [btrfs] > [ 961.724088] [] vfs_ioctl+0x1d/0x34 > [ 961.729159] [] do_vfs_ioctl+0x171/0x17a > [ 961.734732] [] ? fget_light+0x69/0x81 > [ 961.740057] [] sys_ioctl+0x5c/0x7c > [ 961.745333] [] ? jbd_free_handle+0x1b/0x1d [jbd] > [ 961.752028] [] system_call_fastpath+0x16/0x1b > [ 961.758429] INFO: task cosd:16720 blocked for more than 120 seconds. > [ 961.765416] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" > disables this message. > [ 961.776419] ffff8801bf123bb8 0000000000000086 ffff8801bf123b98 > ffff8801664bad60 > [ 961.792604] ffff8801df4f2d60 ffff8801df4f2d60 00000000000108c0 > ffff8801664bad60 > [ 961.814250] ffff8801df4f3120 0000000000000000 ffff8801bf123be8 > ffffffff813b0f69 > [ 961.825356] Call Trace: > [ 961.830932] [] schedule+0x164/0x19e > [ 961.837403] [] wait_current_trans+0xb8/0xec [btrfs] > [ 961.844413] [] ? list_del_init+0x21/0x21 > [ 961.850203] [] ? kmem_cache_alloc+0xad/0xb9 > [ 961.856255] [] start_transaction+0xd1/0x206 [btrfs] > [ 961.863351] [] btrfs_start_transaction+0x13/0x15 > [btrfs] > [ 961.870549] [] btrfs_create+0x3b/0x197 [btrfs] > [ 961.876889] [] vfs_create+0x72/0x92 > [ 961.882115] [] do_last+0x22c/0x40b > [ 961.887195] [] path_openat+0xc0/0x2ef > [ 961.892581] [] do_filp_open+0x3d/0x87 > [ 961.897888] [] ? strncpy_from_user+0x43/0x4d > [ 961.903880] [] ? getname_flags+0x2e/0x80 > [ 961.909463] [] ? do_getname+0x14b/0x173 > [ 961.915472] [] ? audit_getname+0x16/0x26 > [ 961.921259] [] ? spin_lock+0xe/0x10 > [ 961.926480] [] do_sys_open+0x69/0xae > [ 961.931759] [] sys_open+0x20/0x22 > [ 961.936730] [] system_call_fastpath+0x16/0x1b > [ 961.942878] INFO: task cosd:16721 blocked for more than 120 seconds. > [ 961.949231] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" > disables this message. > [ 961.957398] ffff8801502dfb38 0000000000000086 ffff8801502dfb18 > ffff8801dd19c410 > [ 961.965782] ffff880150314410 ffff880150314410 00000000000108c0 > ffff8801dd19c410 > [ 961.973784] ffff8801503147d0 0000000000400000 ffff8801502dfb68 > ffffffff813b0f69 > [ 961.981702] Call Trace: > [ 961.984239] [] schedule+0x164/0x19e > [ 961.989445] [] wait_current_trans+0xb8/0xec [btrfs] > [ 961.995993] [] ? list_del_init+0x21/0x21 > [ 962.001781] [] btrfs_throttle+0x1f/0x21 [btrfs] > [ 962.008040] [] __btrfs_buffered_write+0x29e/0x2dd > [btrfs] > [ 962.015217] [] btrfs_file_aio_write+0x203/0x2df [btrfs] > [ 962.022152] [] ? __btrfs_direct_write+0x12b/0x12b > [btrfs] > [ 962.029240] [] do_sync_readv_writev+0xc1/0x109 > [ 962.035448] [] ? mnt_dec_count+0x11/0x13 > [ 962.041016] [] ? need_resched+0x23/0x2d > [ 962.046796] [] ? should_resched+0xe/0x2f > [ 962.057128] [] ? security_file_permission+0x2c/0x31 > [ 962.068078] [] ? rw_verify_area+0xac/0xdb > [ 962.073949] [] do_readv_writev+0xbf/0x127 > [ 962.079741] [] ? getname_flags+0x2e/0x80 > [ 962.096720] [] ? compound_head+0x12/0x20 > [ 962.103380] [] ? need_resched+0x23/0x2d > [ 962.109497] [] ? should_resched+0xe/0x2f > [ 962.119919] [] ? _cond_resched+0xe/0x22 > [ 962.131022] [] ? mutex_lock+0x16/0x34 > [ 962.137370] [] vfs_writev+0x43/0x4e > [ 962.142531] [] sys_writev+0x4c/0x71 > [ 962.148853] [] system_call_fastpath+0x16/0x1b > [ 962.155452] INFO: task btrfs-endio-wri:23430 blocked for more than 120 > seconds. > [ 962.162885] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" > disables this message. > [ 962.170846] ffff8801dd943c20 0000000000000046 ffff8801dd943c00 > ffff8801df4f5ac0 > [ 962.178326] ffff880182060000 ffff880182060000 00000000000108c0 > ffff8801df4f5ac0 > [ 962.185821] ffff8801820603c0 0000000000000001 ffff8801dd943c50 > ffffffff813b0f69 > [ 962.193386] Call Trace: > [ 962.195837] [] schedule+0x164/0x19e > [ 962.201044] [] wait_current_trans+0xb8/0xec [btrfs] > [ 962.208020] [] ? list_del_init+0x21/0x21 > [ 962.213751] [] ? spin_lock+0xe/0x10 [btrfs] > [ 962.219713] [] start_transaction+0xd1/0x206 [btrfs] > [ 962.226247] [] btrfs_join_transaction+0x15/0x17 [btrfs] > [ 962.233127] [] btrfs_finish_ordered_io+0x135/0x2c1 > [btrfs] > [ 962.240308] [] btrfs_writepage_end_io_hook+0x3d/0x45 > [btrfs] > [ 962.247648] [] end_bio_extent_writepage+0xaa/0x15a > [btrfs] > [ 962.255291] [] bio_endio+0x2d/0x2f > [ 962.260613] [] end_workqueue_fn+0x72/0x7b [btrfs] > [ 962.267510] [] worker_loop+0x60/0x1f6 [btrfs] > [ 962.273543] [] ? calc_load_account_idle+0xe/0x1d > [ 962.279841] [] ? run_ordered_completions+0x99/0x99 > [btrfs] > [ 962.286968] [] kthread+0x72/0x7a > [ 962.291965] [] kernel_thread_helper+0x4/0x10 > [ 962.297888] [] ? retint_restore_args+0xe/0xe > [ 962.303801] [] ? kthread_bind+0x53/0x53 > [ 962.309278] [] ? gs_change+0xb/0xb > Ugh that's not good. I'll look at it tomorrow and send you something to test. Thanks, Josef