* 2.6.35->2.6.36 regression, vanilla kernel panic, ppp or hrtimers crashing @ 2010-10-25 9:22 Denys Fedoryshchenko 2010-10-28 7:05 ` Jarek Poplawski 0 siblings, 1 reply; 7+ messages in thread From: Denys Fedoryshchenko @ 2010-10-25 9:22 UTC (permalink / raw) To: Thomas Gleixner, Paul Mackerras, linux-kernel, netdev Hi Here is what i got from netconsole [ 259.238755] BUG: unable to handle kernel paging request at f8ba001c [ 259.238953] IP: [<c0199ebe>] do_select+0x2cc/0x502 [ 259.239008] *pdpt = 0000000000a55001 *pde = 0000000036dd6067 *pte = 0000000000000000 [ 259.239008] Oops: 0000 [#1] SMP [ 259.239008] last sysfs file: /sys/devices/pnp0/00:02/rtc/rtc0/dev [ 259.239008] Modules linked in: rtc_cmos rtc_core rtc_lib act_skbedit sch_ingress sch_prio configfs cls_flow cls_u32 em_meta cls_basic xt_dscp ipt_REJECT ts_bm xt_string xt_hl ifb cls_fw sch_tbf sch_htb act_ipt act_mirred pppoe pppox ppp_generic slhc ipt_REDIRECT ipt_MASQUERADE xt_TCPMSS xt_DSCP xt_mark xt_tcpudp iptable_mangle iptable_nat nf_nat nf_conntrack_ipv4 nf_conntrack nf_defrag_ipv4 iptable_filter ip_tables x_tables 8021q garp stp llc loop usb_storage iTCO_wdt iTCO_vendor_support ata_generic pata_acpi ata_piix libata 8139cp sr_mod cdrom tulip r8169 sky2 via_velocity via_rhine sis900 ne2k_pci 8390 skge tg3 libphy 8139too e1000 e100 usbhid ohci_hcd uhci_hcd ehci_hcd usbcore nls_base [ 259.239008] [ 259.239008] Pid: 3307, comm: pppd Not tainted 2.6.36-build-0053 #14 D915GAV / [ 259.239008] EIP: 0060:[<c0199ebe>] EFLAGS: 00010286 CPU: 0 [ 259.239008] EIP is at do_select+0x2cc/0x502 [ 259.239008] EAX: f610a600 EBX: 00000080 ECX: f8ba0000 EDX: f60f1808 [ 259.239008] ESI: 00000000 EDI: f610a600 EBP: f60eae4c ESP: f60eab64 [ 259.239008] DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 [ 259.239008] Process pppd (pid: 3307, ti=f60ea000 task=f66a0000 task.ti=f60ea000) [ 259.239008] Stack: [ 259.239008] 0e95bb42 0000003d 000000d0 00000000 f60eaf60 f66a0000 f60eae6c f60eae70 [ 259.239008] <0> f60eae74 f60eae60 
f60eae64 f60eae68 00000180 00000020 010eabdc 002dc698 [ 259.239008] <0> f60eae34 00000180 00000000 00000180 00000000 00000000 00000000 00000000 [ 259.239008] Call Trace: [ 259.239008] [<c019a484>] ? __pollwait+0x0/0xa5 [ 259.239008] [<c019a529>] ? pollwake+0x0/0x63 [ 259.239008] [<c019a529>] ? pollwake+0x0/0x63 [ 259.239008] [<c0143f37>] ? sched_clock_local+0x17/0x13d [ 259.239008] [<c0122ac4>] ? update_curr+0x84/0x15c [ 259.239008] [<c011f5a3>] ? kmap_atomic_prot+0xe1/0xe3 [ 259.239008] [<c0143f37>] ? sched_clock_local+0x17/0x13d [ 259.239008] [<c02044ee>] ? put_dec+0x2a/0x74 [ 259.239008] [<c0204689>] ? number+0x151/0x223 [ 259.239008] [<c012461f>] ? dequeue_task_fair+0x283/0x28b [ 259.239008] [<c0123845>] ? __dequeue_entity+0x23/0x27 [ 259.239008] [<c01238d6>] ? set_next_entity+0x8d/0xf7 [ 259.239008] [<c012889c>] ? finish_task_switch+0x33/0x7a [ 259.239008] [<c02f931c>] ? schedule+0x5c0/0x5ff [ 259.239008] [<c019a208>] ? core_sys_select+0x114/0x190 [ 259.239008] [<c015ce1c>] ? call_rcu_sched+0xd/0xf [ 259.239008] [<c015ce26>] ? call_rcu+0x8/0xa [ 259.239008] [<c012f80b>] ? release_task+0x28d/0x29d [ 259.239008] [<c012fc80>] ? wait_consider_task+0x465/0x7b7 [ 259.239008] [<c013fa7f>] ? spin_unlock_irqrestore+0x8/0xa [ 259.239008] [<c013fc10>] ? remove_wait_queue+0x31/0x36 [ 259.239008] [<f8bacff1>] ? ppp_read+0x101/0x145 [ppp_generic] [ 259.239008] [<c0145ceb>] ? timekeeping_get_ns+0x11/0x4f [ 259.239008] [<c014688a>] ? ktime_get_ts+0x89/0x93 [ 259.239008] [<c019a42e>] ? sys_select+0x68/0x84 [ 259.239008] [<c02fa625>] ? 
syscall_call+0x7/0xb [ 259.239008] Code: 85 9d 48 fd ff ff 0f 84 cb 00 00 00 8b 85 7c fd ff ff 8d 55 f0 e8 c1 55 ff ff 85 c0 89 c7 0f 84 b3 00 00 00 8b 48 10 85 c9 74 3a 79 1c 00 74 34 85 f6 74 25 c7 46 04 02 00 00 00 85 9d 5c fd [ 259.239008] EIP: [<c0199ebe>] do_select+0x2cc/0x502 SS:ESP 0068:f60eab64 [ 259.239008] CR2: 00000000f8ba001c [ 259.239008] ---[ end trace a6117b9e067aeb87 ]--- [ 259.239008] Kernel panic - not syncing: Fatal exception [ 259.239008] Pid: 3307, comm: pppd Tainted: G D 2.6.36-build-0053 #14 [ 259.239008] Call Trace: [ 259.239008] [<c02f8ace>] ? printk+0xf/0x11 [ 259.239008] [<c02f89cb>] panic+0x50/0x144 [ 259.239008] [<c0105061>] oops_end+0x8b/0x9a [ 259.239008] [<c011af70>] no_context+0x13e/0x148 [ 259.239008] [<c0142b8c>] ? hrtimer_try_to_cancel+0x60/0x69 [ 259.239008] [<c011b05f>] __bad_area_nosemaphore+0xe5/0xed [ 259.239008] [<c011b074>] bad_area_nosemaphore+0xd/0x10 [ 259.239008] [<c011b2d6>] do_page_fault+0xf6/0x230 [ 259.239008] [<c011b1e0>] ? do_page_fault+0x0/0x230 [ 259.239008] [<c02fac22>] error_code+0x5a/0x60 [ 259.239008] [<c019007b>] ? grab_super+0x30/0x56 [ 259.239008] [<c011b1e0>] ? do_page_fault+0x0/0x230 [ 259.239008] [<c0199ebe>] ? do_select+0x2cc/0x502 [ 259.239008] [<c019a484>] ? __pollwait+0x0/0xa5 [ 259.239008] [<c019a529>] ? pollwake+0x0/0x63 [ 259.239008] [<c019a529>] ? pollwake+0x0/0x63 [ 259.239008] [<c0143f37>] ? sched_clock_local+0x17/0x13d [ 259.239008] [<c0122ac4>] ? update_curr+0x84/0x15c [ 259.239008] [<c011f5a3>] ? kmap_atomic_prot+0xe1/0xe3 [ 259.239008] [<c0143f37>] ? sched_clock_local+0x17/0x13d [ 259.239008] [<c02044ee>] ? put_dec+0x2a/0x74 [ 259.239008] [<c0204689>] ? number+0x151/0x223 [ 259.239008] [<c012461f>] ? dequeue_task_fair+0x283/0x28b [ 259.239008] [<c0123845>] ? __dequeue_entity+0x23/0x27 [ 259.239008] [<c01238d6>] ? set_next_entity+0x8d/0xf7 [ 259.239008] [<c012889c>] ? finish_task_switch+0x33/0x7a [ 259.239008] [<c02f931c>] ? 
schedule+0x5c0/0x5ff [ 259.239008] [<c019a208>] core_sys_select+0x114/0x190 [ 259.239008] [<c015ce1c>] ? call_rcu_sched+0xd/0xf [ 259.239008] [<c015ce26>] ? call_rcu+0x8/0xa [ 259.239008] [<c012f80b>] ? release_task+0x28d/0x29d [ 259.239008] [<c012fc80>] ? wait_consider_task+0x465/0x7b7 [ 259.239008] [<c013fa7f>] ? spin_unlock_irqrestore+0x8/0xa [ 259.239008] [<c013fc10>] ? remove_wait_queue+0x31/0x36 [ 259.239008] [<f8bacff1>] ? ppp_read+0x101/0x145 [ppp_generic] [ 259.239008] [<c0145ceb>] ? timekeeping_get_ns+0x11/0x4f [ 259.239008] [<c014688a>] ? ktime_get_ts+0x89/0x93 [ 259.239008] [<c019a42e>] sys_select+0x68/0x84 [ 259.239008] [<c02fa625>] syscall_call+0x7/0xb [ 259.239008] Rebooting in 5 seconds.. It is not easy to do full git bisect(it is semi-embedded distro), but i can try reversing particular commits, if someone can give idea which one, and can try debug patches. ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: 2.6.35->2.6.36 regression, vanilla kernel panic, ppp or hrtimers crashing 2010-10-25 9:22 2.6.35->2.6.36 regression, vanilla kernel panic, ppp or hrtimers crashing Denys Fedoryshchenko @ 2010-10-28 7:05 ` Jarek Poplawski 2010-11-02 13:49 ` Denys Fedoryshchenko 0 siblings, 1 reply; 7+ messages in thread From: Jarek Poplawski @ 2010-10-28 7:05 UTC (permalink / raw) To: Denys Fedoryshchenko Cc: Thomas Gleixner, Paul Mackerras, linux-kernel, netdev On 2010-10-25 11:22, Denys Fedoryshchenko wrote: > Hi > > Here is what i got from netconsole > [ 259.238755] BUG: unable to handle kernel > paging request > at f8ba001c > [ 259.238953] IP: > [<c0199ebe>] do_select+0x2cc/0x502 ... > It is not easy to do full git bisect(it is semi-embedded distro), but i can > try reversing particular commits, if someone can give idea which one, and can > try debug patches. Hi, Nothing concrete, but you might try reverting this one: http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.36.y.git;a=commitdiff;h=15fd0cd9a2ad24a78fbee369dec8ca660979d57e Jarek P. ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: 2.6.35->2.6.36 regression, vanilla kernel panic, ppp or hrtimers crashing 2010-10-28 7:05 ` Jarek Poplawski @ 2010-11-02 13:49 ` Denys Fedoryshchenko 2010-11-03 7:38 ` Jarek Poplawski 0 siblings, 1 reply; 7+ messages in thread From: Denys Fedoryshchenko @ 2010-11-02 13:49 UTC (permalink / raw) To: Jarek Poplawski; +Cc: Thomas Gleixner, Paul Mackerras, linux-kernel, netdev I didn't try yet, but i enable more debugs and catch linked list corruption. Here is dumps from netconsole: http://www.nuclearcat.com/ll.txt http://www.nuclearcat.com/ll2.txt I have another PC, also fails to run 2.6.36, but netconsole don't give anything. Both PC's have strange issue with clock drifting away too much (on 2.6.35 and maybe even before). On Thursday 28 October 2010 10:05:50 Jarek Poplawski wrote: > On 2010-10-25 11:22, Denys Fedoryshchenko wrote: > > Hi > > > > Here is what i got from netconsole > > > > [ 259.238755] BUG: unable to handle kernel > > paging request > > at f8ba001c > > [ 259.238953] IP: > > [<c0199ebe>] do_select+0x2cc/0x502 > > ... > > > It is not easy to do full git bisect(it is semi-embedded distro), but i > > can try reversing particular commits, if someone can give idea which > > one, and can try debug patches. > > Hi, > Nothing concrete, but you might try reverting this one: > > http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.36.y.git;a=commi > tdiff;h=15fd0cd9a2ad24a78fbee369dec8ca660979d57e > > Jarek P. > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: 2.6.35->2.6.36 regression, vanilla kernel panic, ppp or hrtimers crashing 2010-11-02 13:49 ` Denys Fedoryshchenko @ 2010-11-03 7:38 ` Jarek Poplawski 2010-11-03 7:47 ` Denys Fedoryshchenko 0 siblings, 1 reply; 7+ messages in thread From: Jarek Poplawski @ 2010-11-03 7:38 UTC (permalink / raw) To: Denys Fedoryshchenko Cc: Thomas Gleixner, Paul Mackerras, linux-kernel, netdev On Tue, Nov 02, 2010 at 03:49:47PM +0200, Denys Fedoryshchenko wrote: > I didn't try yet, but i enable more debugs and catch linked list corruption. It should be very useful but it seems there were no significant changes in ppp locking between 2.6.35 and .36 except the patch I mentioned, so it would be nice to check this first and try to fix it properly later. Jarek P. > > Here is dumps from netconsole: > http://www.nuclearcat.com/ll.txt > http://www.nuclearcat.com/ll2.txt > > I have another PC, also fails to run 2.6.36, but netconsole don't give > anything. > Both PC's have strange issue with clock drifting away too much (on 2.6.35 and > maybe even before). > > > On Thursday 28 October 2010 10:05:50 Jarek Poplawski wrote: > > On 2010-10-25 11:22, Denys Fedoryshchenko wrote: > > > Hi > > > > > > Here is what i got from netconsole > > > > > > [ 259.238755] BUG: unable to handle kernel > > > paging request > > > at f8ba001c > > > [ 259.238953] IP: > > > [<c0199ebe>] do_select+0x2cc/0x502 > > > > ... > > > > > It is not easy to do full git bisect(it is semi-embedded distro), but i > > > can try reversing particular commits, if someone can give idea which > > > one, and can try debug patches. > > > > Hi, > > Nothing concrete, but you might try reverting this one: > > > > http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.36.y.git;a=commi > > tdiff;h=15fd0cd9a2ad24a78fbee369dec8ca660979d57e > > > > Jarek P. 
> > -- > > To unsubscribe from this list: send the line "unsubscribe netdev" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: 2.6.35->2.6.36 regression, vanilla kernel panic, ppp or hrtimers crashing 2010-11-03 7:38 ` Jarek Poplawski @ 2010-11-03 7:47 ` Denys Fedoryshchenko 2010-11-03 8:02 ` Jarek Poplawski 0 siblings, 1 reply; 7+ messages in thread From: Denys Fedoryshchenko @ 2010-11-03 7:47 UTC (permalink / raw) To: Jarek Poplawski; +Cc: Thomas Gleixner, Paul Mackerras, linux-kernel, netdev I try to reverse and got very weird lockups (no netconsole logs and no watchdog triggered reboot on that remote machine). I will try to cook something to reboot it, because it is very remote machine On Wednesday 03 November 2010 09:38:54 Jarek Poplawski wrote: > On Tue, Nov 02, 2010 at 03:49:47PM +0200, Denys Fedoryshchenko wrote: > > I didn't try yet, but i enable more debugs and catch linked list > > corruption. > > It should be very useful but it seems there were no significant changes > in ppp locking between 2.6.35 and .36 except the patch I mentioned, so > it would be nice to check this first and try to fix it properly later. > > Jarek P. > > > Here is dumps from netconsole: > > http://www.nuclearcat.com/ll.txt > > http://www.nuclearcat.com/ll2.txt > > > > I have another PC, also fails to run 2.6.36, but netconsole don't give > > anything. > > Both PC's have strange issue with clock drifting away too much (on 2.6.35 > > and maybe even before). > > > > On Thursday 28 October 2010 10:05:50 Jarek Poplawski wrote: > > > On 2010-10-25 11:22, Denys Fedoryshchenko wrote: > > > > Hi > > > > > > > > Here is what i got from netconsole > > > > > > > > [ 259.238755] BUG: unable to handle kernel > > > > paging request > > > > at f8ba001c > > > > [ 259.238953] IP: > > > > [<c0199ebe>] do_select+0x2cc/0x502 > > > > > > ... > > > > > > > It is not easy to do full git bisect(it is semi-embedded distro), but > > > > i can try reversing particular commits, if someone can give idea > > > > which one, and can try debug patches. 
> > > > > > Hi, > > > Nothing concrete, but you might try reverting this one: > > > > > > http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.36.y.git;a=c > > > ommi tdiff;h=15fd0cd9a2ad24a78fbee369dec8ca660979d57e > > > > > > Jarek P. > > > -- > > > To unsubscribe from this list: send the line "unsubscribe netdev" in > > > the body of a message to majordomo@vger.kernel.org > > > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: 2.6.35->2.6.36 regression, vanilla kernel panic, ppp or hrtimers crashing 2010-11-03 7:47 ` Denys Fedoryshchenko @ 2010-11-03 8:02 ` Jarek Poplawski [not found] ` <201011031018.21178.nuclearcat@nuclearcat.com> 0 siblings, 1 reply; 7+ messages in thread From: Jarek Poplawski @ 2010-11-03 8:02 UTC (permalink / raw) To: Denys Fedoryshchenko Cc: Thomas Gleixner, Paul Mackerras, linux-kernel, netdev On Wed, Nov 03, 2010 at 09:47:53AM +0200, Denys Fedoryshchenko wrote: > I try to reverse and got very weird lockups (no netconsole logs and no > watchdog triggered reboot on that remote machine). > I will try to cook something to reboot it, because it is very remote machine OK, I only wanted to know if reverting could be a fast fix. Since it isn't, please stay with 2.6.35 until there is some new idea (patch). Jarek P. ^ permalink raw reply [flat|nested] 7+ messages in thread
[parent not found: <201011031018.21178.nuclearcat@nuclearcat.com>]
* Re: 2.6.35->2.6.36 regression, vanilla kernel panic, ppp or hrtimers crashing [not found] ` <201011031018.21178.nuclearcat@nuclearcat.com> @ 2010-11-03 8:59 ` Jarek Poplawski 0 siblings, 0 replies; 7+ messages in thread From: Jarek Poplawski @ 2010-11-03 8:59 UTC (permalink / raw) To: Denys Fedoryshchenko Cc: Thomas Gleixner, Paul Mackerras, linux-kernel, netdev On Wed, Nov 03, 2010 at 10:18:20AM +0200, Denys Fedoryshchenko wrote: > > On Wednesday 03 November 2010 10:02:58 Jarek Poplawski wrote: > > On Wed, Nov 03, 2010 at 09:47:53AM +0200, Denys Fedoryshchenko wrote: > > > I try to reverse and got very weird lockups (no netconsole logs and no > > > watchdog triggered reboot on that remote machine). > > > I will try to cook something to reboot it, because it is very remote > > > machine > > > > OK, I only wanted to know if reverting could be a fast fix. Since it > > isn't, please stay with 2.6.35 until there is some new idea (patch). > > > Well, still i want to try (if i can) more debug, and maybe i'll catch > something, also i have around 145 NAS servers to go, to try 2.6.36 :-) I think the current debugging needs analyzing first. But here is a patch which probably could matter at least wrt your first oopses. (Please try this first on something you can easily reboot.) Jarek P. 
--- diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 09cf56d..1b98c4c 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -409,6 +409,8 @@ static ssize_t ppp_read(struct file *file, char __user *buf, if (!pf) return -ENXIO; + + atomic_inc(&pf->refcnt); add_wait_queue(&pf->rwait, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -440,6 +442,17 @@ static ssize_t ppp_read(struct file *file, char __user *buf, set_current_state(TASK_RUNNING); remove_wait_queue(&pf->rwait, &wait); + if (atomic_dec_and_test(&pf->refcnt)) { + switch (pf->kind) { + case INTERFACE: + ppp_destroy_interface(PF_TO_PPP(pf)); + break; + case CHANNEL: + ppp_destroy_channel(PF_TO_CHANNEL(pf)); + break; + } + } + if (!skb) goto out; @@ -504,6 +517,8 @@ static unsigned int ppp_poll(struct file *file, poll_table *wait) if (!pf) return 0; + + atomic_inc(&pf->refcnt); poll_wait(file, &pf->rwait, wait); mask = POLLOUT | POLLWRNORM; if (skb_peek(&pf->rq)) @@ -518,6 +533,17 @@ static unsigned int ppp_poll(struct file *file, poll_table *wait) mask |= POLLIN | POLLRDNORM; } + if (atomic_dec_and_test(&pf->refcnt)) { + switch (pf->kind) { + case INTERFACE: + ppp_destroy_interface(PF_TO_PPP(pf)); + break; + case CHANNEL: + ppp_destroy_channel(PF_TO_CHANNEL(pf)); + break; + } + } + return mask; } ^ permalink raw reply related [flat|nested] 7+ messages in thread
end of thread, other threads:[~2010-11-03 8:59 UTC | newest] Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2010-10-25 9:22 2.6.35->2.6.36 regression, vanilla kernel panic, ppp or hrtimers crashing Denys Fedoryshchenko 2010-10-28 7:05 ` Jarek Poplawski 2010-11-02 13:49 ` Denys Fedoryshchenko 2010-11-03 7:38 ` Jarek Poplawski 2010-11-03 7:47 ` Denys Fedoryshchenko 2010-11-03 8:02 ` Jarek Poplawski [not found] ` <201011031018.21178.nuclearcat@nuclearcat.com> 2010-11-03 8:59 ` Jarek Poplawski
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).