Hi Neil, We've found an md hang in our testing; it is easy to reproduce with the attached script. We've tried the 3.4 stable kernel and the latest mainline, and the hang still exists in both. It looks like the flush worker (bdi_writeback_workfn) races with md_stop. We have no idea how to fix it; could you kindly give us some suggestions? Best regards, Jack [ 186.777410] [ 241.951933] INFO: task kworker/u12:3:247 blocked for more than 120 seconds. [ 241.952001] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 241.952075] kworker/u12:3 D 0000000000000000 0 247 2 0x00000000 [ 241.952203] Workqueue: writeback (flush-9:1) [ 241.952319] ffff88020d331418 0000000000000046 0000000000001000 ffff88020d330000 [ 241.952512] ffff88020d331fd8 ffff88020d330000 ffff88020d330010 ffff88020d330000 [ 241.952701] ffff88020d331fd8 ffff88020d330000 ffff88020c10b7e0 ffff8802158ddd20 [ 241.952891] Call Trace: [ 241.952951] [] schedule+0x24/0x70 [ 241.953022] [] md_write_start+0xad/0x1d0 [md_mod] [ 241.953083] [] ? wake_up_bit+0x40/0x40 [ 241.953144] [] make_request+0x5f/0xe10 [raid1] [ 241.953204] [] ? blk_throtl_bio+0x114/0x580 [ 241.953264] [] ? sched_clock_cpu+0xc5/0x100 [ 241.953325] [] ? __lock_acquire+0x2be/0x780 [ 241.953384] [] ? sched_clock_cpu+0xc5/0x100 [ 241.953451] [] ? md_make_request+0x141/0x340 [md_mod] [ 241.953519] [] ? md_make_request+0x141/0x340 [md_mod] [ 241.953587] [] md_make_request+0x183/0x340 [md_mod] [ 241.953655] [] ? md_make_request+0x50/0x340 [md_mod] [ 241.953716] [] ? mempool_alloc_slab+0x10/0x20 [ 241.953774] [] ? mempool_alloc+0x5b/0x170 [ 241.953834] [] generic_make_request+0xc2/0x100 [ 241.953893] [] submit_bio+0x76/0x160 [ 241.954392] [] ? bio_alloc_bioset+0x9c/0x1c0 [ 241.954451] [] _submit_bh+0x140/0x200 [ 241.954510] [] submit_bh+0xb/0x10 [ 241.954568] [] __block_write_full_page+0x1cf/0x320 [ 241.954629] [] ? find_get_pages_tag+0x116/0x1e0 [ 241.954689] [] ? block_invalidatepage+0x140/0x140 [ 241.954748] [] ? I_BDEV+0x10/0x10 [ 241.954804] [] ?
I_BDEV+0x10/0x10 [ 241.954862] [] block_write_full_page_endio+0xc6/0x100 [ 241.954924] [] block_write_full_page+0x10/0x20 [ 241.954983] [] blkdev_writepage+0x13/0x20 [ 241.955041] [] __writepage+0x15/0x40 [ 241.955099] [] write_cache_pages+0x26d/0x540 [ 241.955159] [] ? set_page_dirty+0x60/0x60 [ 241.955219] [] generic_writepages+0x48/0x60 [ 241.955278] [] do_writepages+0x1e/0x40 [ 241.955335] [] __writeback_single_inode+0x44/0x2b0 [ 241.955395] [] writeback_sb_inodes+0x376/0x570 [ 241.955456] [] ? _raw_spin_unlock+0x26/0x40 [ 241.955513] [] __writeback_inodes_wb+0x96/0xc0 [ 241.955571] [] wb_writeback+0x223/0x330 [ 241.955630] [] wb_do_writeback+0x11a/0x250 [ 241.955688] [] bdi_writeback_workfn+0x80/0x200 [ 241.955748] [] process_one_work+0x1e6/0x5d0 [ 241.955806] [] ? process_one_work+0x171/0x5d0 [ 241.955865] [] worker_thread+0x11e/0x3e0 [ 241.955923] [] ? manage_workers+0x2b0/0x2b0 [ 241.955981] [] kthread+0xee/0x100 [ 241.956040] [] ? __init_kthread_worker+0x70/0x70 [ 241.956100] [] ret_from_fork+0x7c/0xb0 [ 241.956156] [] ? __init_kthread_worker+0x70/0x70 [ 241.956214] 3 locks held by kworker/u12:3/247: [ 241.956266] #0: (writeback){......}, at: [] process_one_work+0x171/0x5d0 [ 241.956486] #1: ((&(&wb->dwork)->work)){......}, at: [] process_one_work+0x171/0x5d0 [ 241.956706] #2: (&type->s_umount_key#21){......}, at: [] grab_super_passive+0x3e/0x90 [ 241.956975] INFO: task mdadm:2902 blocked for more than 120 seconds. [ 241.957030] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 241.957138] mdadm D 0000000000000000 0 2902 2885 0x00000004 [ 241.957255] ffff8802117f95e8 0000000000000046 0000000000001000 ffff8802117f8000 [ 241.957443] ffff8802117f9fd8 ffff8802117f8000 ffff8802117f8010 ffff8802117f8000 [ 241.957632] ffff8802117f9fd8 ffff8802117f8000 ffff88020c1dca80 ffff8802158ddd20 [ 241.957819] Call Trace: [ 241.957876] [] schedule+0x24/0x70 [ 241.957941] [] md_write_start+0xad/0x1d0 [md_mod] [ 241.958000] [] ? 
wake_up_bit+0x40/0x40 [ 241.958059] [] make_request+0x5f/0xe10 [raid1] [ 241.958119] [] ? blk_throtl_bio+0x114/0x580 [ 241.958179] [] ? sched_clock_cpu+0xc5/0x100 [ 241.958238] [] ? __lock_acquire+0x2be/0x780 [ 241.958297] [] ? sched_clock_cpu+0xc5/0x100 [ 241.958365] [] ? md_make_request+0x141/0x340 [md_mod] [ 241.958433] [] ? md_make_request+0x141/0x340 [md_mod] [ 241.958501] [] md_make_request+0x183/0x340 [md_mod] [ 241.958568] [] ? md_make_request+0x50/0x340 [md_mod] [ 241.958627] [] ? mempool_alloc_slab+0x10/0x20 [ 241.958685] [] ? mempool_alloc+0x5b/0x170 [ 241.958743] [] generic_make_request+0xc2/0x100 [ 241.958802] [] submit_bio+0x76/0x160 [ 241.958859] [] ? bio_alloc_bioset+0x9c/0x1c0 [ 241.958920] [] _submit_bh+0x140/0x200 [ 241.958978] [] submit_bh+0xb/0x10 [ 241.959036] [] __block_write_full_page+0x1cf/0x320 [ 241.959096] [] ? find_get_pages_tag+0x116/0x1e0 [ 241.959157] [] ? block_invalidatepage+0x140/0x140 [ 241.959215] [] ? I_BDEV+0x10/0x10 [ 241.959272] [] ? I_BDEV+0x10/0x10 [ 241.959330] [] block_write_full_page_endio+0xc6/0x100 [ 241.959391] [] block_write_full_page+0x10/0x20 [ 241.959449] [] blkdev_writepage+0x13/0x20 [ 241.959507] [] __writepage+0x15/0x40 [ 241.959566] [] write_cache_pages+0x26d/0x540 [ 241.959625] [] ? update_sd_lb_stats+0x133/0x670 [ 241.959685] [] ? set_page_dirty+0x60/0x60 [ 241.959745] [] generic_writepages+0x48/0x60 [ 241.959805] [] do_writepages+0x1e/0x40 [ 241.959864] [] __filemap_fdatawrite_range+0x51/0x60 [ 241.959925] [] filemap_fdatawrite+0x1a/0x20 [ 241.959985] [] filemap_write_and_wait+0x5d/0x80 [ 241.960044] [] __sync_blockdev+0x1c/0x40 [ 241.960102] [] sync_blockdev+0xe/0x10 [ 241.960167] [] do_md_stop+0x74/0x4e0 [md_mod] [ 241.960235] [] md_ioctl+0x784/0x16a0 [md_mod] [ 241.960294] [] ? sched_clock_cpu+0xc5/0x100 [ 241.960356] [] ? 
hrtimer_try_to_cancel+0x43/0xf0 [ 241.960416] [] __blkdev_driver_ioctl+0x23/0x30 [ 241.960476] [] blkdev_ioctl+0x21c/0x800 [ 241.960533] [] block_ioctl+0x3d/0x50 [ 241.960592] [] do_vfs_ioctl+0x9c/0x560 [ 241.960649] [] ? update_rmtp+0x80/0x80 [ 241.960709] [] ? hrtimer_start_range_ns+0xf/0x20 [ 241.960771] [] SyS_ioctl+0x91/0xa0 [ 241.960831] [] ? lockdep_sys_exit_thunk+0x35/0x67 [ 241.960897] [] system_call_fastpath+0x16/0x1b [ 241.960954] 2 locks held by mdadm/2902: [ 241.961004] #0: (&mddev->reconfig_mutex){......}, at: [] md_ioctl+0xee/0x16a0 [md_mod] [ 241.961235] #1: (&mddev->open_mutex){......}, at: [] do_md_stop+0x42/0x4e0 [md_mod] [ 361.888286] INFO: task kworker/u12:3:247 blocked for more than 120 seconds. [ 361.888389] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 361.888499] kworker/u12:3 D 0000000000000000 0 247 2 0x00000000 [ 361.888628] Workqueue: writeback bdi_writeback_workfn (flush-9:1) [ 361.888742] ffff88020d331418 0000000000000046 0000000000001000 ffff88020d330000 [ 361.888932] ffff88020d331fd8 ffff88020d330000 ffff88020d330010 ffff88020d330000 [ 361.889121] ffff88020d331fd8 ffff88020d330000 ffff88020c10b7e0 ffff8802158ddd20 [ 361.889308] Call Trace: [ 361.889368] [] schedule+0x24/0x70 [ 361.889438] [] md_write_start+0xad/0x1d0 [md_mod] [ 361.889499] [] ? wake_up_bit+0x40/0x40 [ 361.889560] [] make_request+0x5f/0xe10 [raid1] [ 361.889620] [] ? blk_throtl_bio+0x114/0x580 [ 361.889681] [] ? sched_clock_cpu+0xc5/0x100 [ 361.889741] [] ? __lock_acquire+0x2be/0x780 [ 361.889802] [] ? sched_clock_cpu+0xc5/0x100 [ 361.889870] [] ? md_make_request+0x141/0x340 [md_mod] [ 361.889937] [] ? md_make_request+0x141/0x340 [md_mod] [ 361.890005] [] md_make_request+0x183/0x340 [md_mod] [ 361.890072] [] ? md_make_request+0x50/0x340 [md_mod] [ 361.890133] [] ? mempool_alloc_slab+0x10/0x20 [ 361.890191] [] ? 
mempool_alloc+0x5b/0x170 [ 361.890251] [] generic_make_request+0xc2/0x100 [ 361.890310] [] submit_bio+0x76/0x160 [ 361.890369] [] ? bio_alloc_bioset+0x9c/0x1c0 [ 361.890428] [] _submit_bh+0x140/0x200 [ 361.890486] [] submit_bh+0xb/0x10 [ 361.890545] [] __block_write_full_page+0x1cf/0x320 [ 361.890606] [] ? find_get_pages_tag+0x116/0x1e0 [ 361.890666] [] ? block_invalidatepage+0x140/0x140 [ 361.890724] [] ? I_BDEV+0x10/0x10 [ 361.890781] [] ? I_BDEV+0x10/0x10 [ 361.890839] [] block_write_full_page_endio+0xc6/0x100 [ 361.890899] [] block_write_full_page+0x10/0x20 [ 361.890958] [] blkdev_writepage+0x13/0x20 [ 361.891017] [] __writepage+0x15/0x40 [ 361.891076] [] write_cache_pages+0x26d/0x540 [ 361.891135] [] ? set_page_dirty+0x60/0x60 [ 361.891195] [] generic_writepages+0x48/0x60 [ 361.891255] [] do_writepages+0x1e/0x40 [ 361.891312] [] __writeback_single_inode+0x44/0x2b0 [ 361.891371] [] writeback_sb_inodes+0x376/0x570 [ 361.891431] [] ? _raw_spin_unlock+0x26/0x40 [ 361.891490] [] __writeback_inodes_wb+0x96/0xc0 [ 361.891548] [] wb_writeback+0x223/0x330 [ 361.891606] [] wb_do_writeback+0x11a/0x250 [ 361.891665] [] bdi_writeback_workfn+0x80/0x200 [ 361.891725] [] process_one_work+0x1e6/0x5d0 [ 361.891784] [] ? process_one_work+0x171/0x5d0 [ 361.891843] [] worker_thread+0x11e/0x3e0 [ 361.891902] [] ? manage_workers+0x2b0/0x2b0 [ 361.891959] [] kthread+0xee/0x100 [ 361.892017] [] ? __init_kthread_worker+0x70/0x70 [ 361.892078] [] ret_from_fork+0x7c/0xb0 [ 361.892135] [] ? __init_kthread_worker+0x70/0x70 [ 361.892193] 3 locks held by kworker/u12:3/247: [ 361.892244] #0: (writeback){......}, at: [] process_one_work+0x171/0x5d0 [ 361.892464] #1: ((&(&wb->dwork)->work)){......}, at: [] process_one_work+0x171/0x5d0 [ 361.892687] #2: (&type->s_umount_key#21){......}, at: [] grab_super_passive+0x3e/0x90 [ 361.892956] INFO: task mdadm:2902 blocked for more than 120 seconds. [ 361.893011] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
[ 361.893119] mdadm D 0000000000000000 0 2902 2885 0x00000004 [ 361.893236] ffff8802117f95e8 0000000000000046 0000000000001000 ffff8802117f8000 [ 361.893423] ffff8802117f9fd8 ffff8802117f8000 ffff8802117f8010 ffff8802117f8000 [ 361.893611] ffff8802117f9fd8 ffff8802117f8000 ffff88020c1dca80 ffff8802158ddd20 [ 361.894239] Call Trace: [ 361.894294] [] schedule+0x24/0x70 [ 361.894360] [] md_write_start+0xad/0x1d0 [md_mod] [ 361.894419] [] ? wake_up_bit+0x40/0x40 [ 361.894478] [] make_request+0x5f/0xe10 [raid1] [ 361.894536] [] ? blk_throtl_bio+0x114/0x580 [ 361.894596] [] ? sched_clock_cpu+0xc5/0x100 [ 361.894655] [] ? __lock_acquire+0x2be/0x780 [ 361.894714] [] ? sched_clock_cpu+0xc5/0x100 [ 361.894781] [] ? md_make_request+0x141/0x340 [md_mod] [ 361.894849] [] ? md_make_request+0x141/0x340 [md_mod] [ 361.894917] [] md_make_request+0x183/0x340 [md_mod] [ 361.894984] [] ? md_make_request+0x50/0x340 [md_mod] [ 361.895043] [] ? mempool_alloc_slab+0x10/0x20 [ 361.895101] [] ? mempool_alloc+0x5b/0x170 [ 361.895161] [] generic_make_request+0xc2/0x100 [ 361.895220] [] submit_bio+0x76/0x160 [ 361.895277] [] ? bio_alloc_bioset+0x9c/0x1c0 [ 361.895337] [] _submit_bh+0x140/0x200 [ 361.895395] [] submit_bh+0xb/0x10 [ 361.895453] [] __block_write_full_page+0x1cf/0x320 [ 361.895513] [] ? find_get_pages_tag+0x116/0x1e0 [ 361.895573] [] ? block_invalidatepage+0x140/0x140 [ 361.895632] [] ? I_BDEV+0x10/0x10 [ 361.895688] [] ? I_BDEV+0x10/0x10 [ 361.895746] [] block_write_full_page_endio+0xc6/0x100 [ 361.895808] [] block_write_full_page+0x10/0x20 [ 361.895866] [] blkdev_writepage+0x13/0x20 [ 361.895924] [] __writepage+0x15/0x40 [ 361.895981] [] write_cache_pages+0x26d/0x540 [ 361.896041] [] ? update_sd_lb_stats+0x133/0x670 [ 361.896100] [] ? 
set_page_dirty+0x60/0x60 [ 361.896159] [] generic_writepages+0x48/0x60 [ 361.896218] [] do_writepages+0x1e/0x40 [ 361.896278] [] __filemap_fdatawrite_range+0x51/0x60 [ 361.896338] [] filemap_fdatawrite+0x1a/0x20 [ 361.896397] [] filemap_write_and_wait+0x5d/0x80 [ 361.896456] [] __sync_blockdev+0x1c/0x40 [ 361.896515] [] sync_blockdev+0xe/0x10 [ 361.896580] [] do_md_stop+0x74/0x4e0 [md_mod] [ 361.896647] [] md_ioctl+0x784/0x16a0 [md_mod] [ 361.896707] [] ? sched_clock_cpu+0xc5/0x100 [ 361.896767] [] ? hrtimer_try_to_cancel+0x43/0xf0 [ 361.896828] [] __blkdev_driver_ioctl+0x23/0x30 [ 361.896886] [] blkdev_ioctl+0x21c/0x800 [ 361.896943] [] block_ioctl+0x3d/0x50 [ 361.897001] [] do_vfs_ioctl+0x9c/0x560 [ 361.897059] [] ? update_rmtp+0x80/0x80 [ 361.897116] [] ? hrtimer_start_range_ns+0xf/0x20 [ 361.897175] [] SyS_ioctl+0x91/0xa0 [ 361.897233] [] ? lockdep_sys_exit_thunk+0x35/0x67 [ 361.897293] [] system_call_fastpath+0x16/0x1b [ 361.897350] 2 locks held by mdadm/2902: [ 361.897401] #0: (&mddev->reconfig_mutex){......}, at: [] md_ioctl+0xee/0x16a0 [md_mod] [ 361.897631] #1: (&mddev->open_mutex){......}, at: [] do_md_stop+0x42/0x4e0 [md_mod]