From: Sasha Levin
Subject: mm,numad,rcu: hang on OOM
Date: Fri, 29 Jun 2012 18:44:41 +0200
Message-ID: <1340988281.2936.58.camel@lappy>
To: paulmck, Andrew Morton, Ingo Molnar, Peter Zijlstra
Cc: linux-mm, linux-kernel@vger.kernel.org

Hi all,

While fuzzing using trinity on a KVM tools guest with today's linux-next, I've hit the following lockup:

[ 362.261729] INFO: task numad/2:27 blocked for more than 120 seconds.
[ 362.263974] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 362.271684] numad/2 D 0000000000000001 5672 27 2 0x00000000
[ 362.280052] ffff8800294c7c58 0000000000000046 ffff8800294c7c08 ffffffff81163dba
[ 362.294477] ffff8800294c6000 ffff8800294c6010 ffff8800294c7fd8 ffff8800294c6000
[ 362.306631] ffff8800294c6010 ffff8800294c7fd8 ffff88000d5c3000 ffff8800294c8000
[ 362.315395] Call Trace:
[ 362.318556] [] ? __lock_release+0x1ba/0x1d0
[ 362.325411] [] schedule+0x55/0x60
[ 362.328844] [] rwsem_down_failed_common+0xf5/0x130
[ 362.332501] [] ? put_lock_stats+0xe/0x40
[ 362.334496] [] ? __lock_contended+0x1f5/0x230
[ 362.336723] [] rwsem_down_read_failed+0x15/0x17
[ 362.339297] [] call_rwsem_down_read_failed+0x14/0x30
[ 362.341768] [] ? down_read+0x79/0xa0
[ 362.343669] [] ? lazy_migrate_process+0x22/0x60
[ 362.345616] [] lazy_migrate_process+0x22/0x60
[ 362.347464] [] process_mem_migrate+0x10/0x20
[ 362.349340] [] move_processes+0x190/0x230
[ 362.351398] [] numad_thread+0x7a/0x120
[ 362.353245] [] ? find_busiest_node+0x310/0x310
[ 362.355396] [] kthread+0xb2/0xc0
[ 362.356996] [] kernel_thread_helper+0x4/0x10
[ 362.359253] [] ? retint_restore_args+0x13/0x13
[ 362.361168] [] ? __init_kthread_worker+0x70/0x70
[ 362.363277] [] ? gs_change+0x13/0x13

I've hit sysrq-t to see what might be the cause, and it appears that an OOM was in progress and was stuck on RCU:

[ 578.086230] trinity-child69 D ffff8800277a54c8 3968 6658 6580 0x00000000
[ 578.086230] ffff880022c5f518 0000000000000046 ffff880022c5f4c8 ffff88001b9d6e00
[ 578.086230] ffff880022c5e000 ffff880022c5e010 ffff880022c5ffd8 ffff880022c5e000
[ 578.086230] ffff880022c5e010 ffff880022c5ffd8 ffff880023c08000 ffff880022c33000
[ 578.086230] Call Trace:
[ 578.086230] [] schedule+0x55/0x60
[ 578.086230] [] schedule_timeout+0x38/0x2c0
[ 578.086230] [] ? mark_held_locks+0xf6/0x120
[ 578.086230] [] ? __lock_release+0x1ba/0x1d0
[ 578.086230] [] ? _raw_spin_unlock_irq+0x2b/0x80
[ 578.086230] [] wait_for_common+0xff/0x170
[ 578.086230] [] ? try_to_wake_up+0x290/0x290
[ 578.086230] [] wait_for_completion+0x18/0x20
[ 578.086230] [] _rcu_barrier+0x4a7/0x4e0
[ 578.086230] [] ? sched_clock+0x1d/0x30
[ 578.086230] [] ? sched_clock_local+0x25/0x90
[ 578.086230] [] ? sched_clock_cpu+0x108/0x120
[ 578.086230] [] ? __lock_acquire+0x42c/0x4b0
[ 578.086230] [] ? rcu_barrier_func+0x70/0x70
[ 578.086230] [] ? put_lock_stats+0xe/0x40
[ 578.086230] [] ? __lock_acquired+0x2a4/0x2e0
[ 578.086230] [] rcu_barrier_bh+0x10/0x20
[ 578.086230] [] rcu_oom_notify+0x16/0x30
[ 578.086230] [] notifier_call_chain+0xee/0x130
[ 578.086230] [] __blocking_notifier_call_chain+0xa6/0xd0
[ 578.086230] [] blocking_notifier_call_chain+0x11/0x20
[ 578.086230] [] out_of_memory+0x44/0x240
[ 578.086230] [] ? _raw_spin_unlock+0x30/0x60
[ 578.086230] [] __alloc_pages_slowpath+0x55f/0x6a0
[ 578.086230] [] ? get_page_from_freelist+0x625/0x660
[ 578.086230] [] __alloc_pages_nodemask+0x246/0x330
[ 578.086230] [] alloc_pages_current+0xdd/0x110
[ 578.086230] [] __page_cache_alloc+0xc7/0xe0
[ 578.086230] [] filemap_fault+0x35f/0x4c0
[ 578.086230] [] __do_fault+0xae/0x560
[ 578.086230] [] handle_pte_fault+0x81/0x1f0
[ 578.086230] [] handle_mm_fault+0x329/0x350
[ 578.086230] [] do_page_fault+0x421/0x450
[ 578.086230] [] ? might_fault+0x4e/0xa0
[ 578.086230] [] ? might_fault+0x4e/0xa0
[ 578.086230] [] ? __lock_release+0x1ba/0x1d0
[ 578.086230] [] ? might_fault+0x4e/0xa0
[ 578.086230] [] do_async_page_fault+0x31/0xb0
[ 578.086230] [] async_page_fault+0x25/0x30

Other than that, there are several threads stuck in hugepage-related code, trying to allocate:

[ 578.086230] trinity-child72 D ffff880022cd84c8 3264 6661 6580 0x00000004
[ 578.086230] ffff880022ccd848 0000000000000046 ffff880022ccd7f8 ffffffff81163dba
[ 578.086230] ffff880022ccc000 ffff880022ccc010 ffff880022ccdfd8 ffff880022ccc000
[ 578.086230] ffff880022ccc010 ffff880022ccdfd8 ffff880027733000 ffff880022cd0000
[ 578.086230] Call Trace:
[ 578.086230] [] ? __lock_release+0x1ba/0x1d0
[ 578.086230] [] schedule+0x55/0x60
[ 578.086230] [] schedule_timeout+0x276/0x2c0
[ 578.086230] [] ? lock_timer_base+0x70/0x70
[ 578.086230] [] schedule_timeout_uninterruptible+0x19/0x20
[ 578.086230] [] __alloc_pages_slowpath+0x4ef/0x6a0
[ 578.086230] [] ? get_page_from_freelist+0x625/0x660
[ 578.086230] [] __alloc_pages_nodemask+0x246/0x330
[ 578.086230] [] alloc_pages_current+0xdd/0x110
[ 578.086230] [] pte_alloc_one+0x16/0x40
[ 578.086230] [] __pte_alloc+0x2d/0x1e0
[ 578.086230] [] do_huge_pmd_anonymous_page+0x151/0x230
[ 578.086230] [] handle_mm_fault+0x1e3/0x350
[ 578.086230] [] ? follow_page+0xe7/0x5a0
[ 578.086230] [] __get_user_pages+0x438/0x5d0
[ 578.086230] [] __mlock_vma_pages_range+0xc6/0xd0
[ 578.086230] [] mlock_vma_pages_range+0x75/0xb0
[ 578.086230] [] mmap_region+0x4bc/0x5f0
[ 578.086230] [] do_mmap_pgoff+0x2b9/0x350
[ 578.086230] [] ? vm_mmap_pgoff+0x6c/0xb0
[ 578.086230] [] vm_mmap_pgoff+0x84/0xb0
[ 578.086230] [] sys_mmap_pgoff+0x182/0x190
[ 578.086230] [] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 578.086230] [] sys_mmap+0x1d/0x20
[ 578.086230] [] system_call_fastpath+0x16/0x1b

And one more thread, trying to do the following:

[ 578.086230] trinity-child70 R running task 3440 6659 6580 0x00000004
[ 578.086230] ffff880022c7f5e8 0000000000000046 ffff880022c7f5b8 ffffffff81161d16
[ 578.086230] ffff880022c7e000 ffff880022c7e010 ffff880022c7ffd8 ffff880022c7e000
[ 578.086230] ffff880022c7e010 ffff880022c7ffd8 ffff880028e13000 ffff880022c80000
[ 578.086230] Call Trace:
[ 578.086230] [] ? mark_held_locks+0xf6/0x120
[ 578.086230] [] preempt_schedule_irq+0x94/0xd0
[ 578.086230] [] retint_kernel+0x26/0x30
[ 578.086230] [] ? shrink_zcache_memory+0xe5/0x110
[ 578.086230] [] shrink_slab+0xd0/0x520
[ 578.086230] [] ? shrink_zones+0x1f0/0x220
[ 578.086230] [] do_try_to_free_pages+0x1c9/0x3e0
[ 578.086230] [] try_to_free_pages+0x143/0x200
[ 578.086230] [] ? _raw_spin_unlock_irqrestore+0x65/0xc0
[ 578.086230] [] __perform_reclaim+0x8b/0xe0
[ 578.086230] [] __alloc_pages_slowpath+0x407/0x6a0
[ 578.086230] [] ? get_page_from_freelist+0x625/0x660
[ 578.086230] [] __alloc_pages_nodemask+0x246/0x330
[ 578.086230] [] alloc_pages_current+0xdd/0x110
[ 578.086230] [] pte_alloc_one+0x16/0x40
[ 578.086230] [] __pte_alloc+0x2d/0x1e0
[ 578.086230] [] do_huge_pmd_anonymous_page+0x151/0x230
[ 578.086230] [] handle_mm_fault+0x1e3/0x350
[ 578.086230] [] ? follow_page+0xe7/0x5a0
[ 578.086230] [] __get_user_pages+0x438/0x5d0
[ 578.086230] [] __mlock_vma_pages_range+0xc6/0xd0
[ 578.086230] [] mlock_vma_pages_range+0x75/0xb0
[ 578.086230] [] mmap_region+0x4bc/0x5f0
[ 578.086230] [] do_mmap_pgoff+0x2b9/0x350
[ 578.086230] [] ? vm_mmap_pgoff+0x6c/0xb0
[ 578.086230] [] vm_mmap_pgoff+0x84/0xb0
[ 578.086230] [] sys_mmap_pgoff+0x182/0x190
[ 578.086230] [] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 578.086230] [] sys_mmap+0x1d/0x20
[ 578.086230] [] system_call_fastpath+0x16/0x1b

The rest of the threads weren't particularly interesting, so I guess that the problem is in one of the above.
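To make it clearer where the trinity-child69 trace is waiting: out_of_memory() runs the OOM notifier chain through blocking_notifier_call_chain(), and the RCU notifier on that chain blocks until an RCU barrier completes. Below is a minimal sketch of a notifier with that shape, using the generic register_oom_notifier() API. It is only an illustration, not the actual rcu_oom_notify() from linux-next: the example_* names are made up, and it uses plain rcu_barrier() where the trace shows the _bh flavour.

/*
 * Minimal sketch of an OOM notifier that blocks on an RCU barrier --
 * an illustration of the shape suggested by the trace above, not the
 * actual linux-next rcu_oom_notify().  The example_* names are made up;
 * register_oom_notifier() and rcu_barrier() are the generic APIs.
 */
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>

static int example_oom_notify(struct notifier_block *nb,
			      unsigned long action, void *freed)
{
	/*
	 * out_of_memory() invokes this chain via
	 * blocking_notifier_call_chain(), so the allocating task sits
	 * here until every pending RCU callback has been invoked.
	 */
	rcu_barrier();
	return NOTIFY_OK;
}

static struct notifier_block example_oom_nb = {
	.notifier_call = example_oom_notify,
};

static int __init example_init(void)
{
	return register_oom_notifier(&example_oom_nb);
}

static void __exit example_exit(void)
{
	unregister_oom_notifier(&example_oom_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

If whatever has to run for that barrier to complete can't make progress because everything else is either looping in the allocator or queued behind mmap_sem (like numad above), the notifier never returns and out_of_memory() never gets to kill anything. That would match what sysrq-t shows, but it's only a guess.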