* [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU @ 2008-10-02 0:36 Paul E. McKenney 2008-10-02 8:07 ` Ingo Molnar 0 siblings, 1 reply; 17+ messages in thread From: Paul E. McKenney @ 2008-10-02 0:36 UTC (permalink / raw) To: linux-kernel Cc: mingo, akpm, mathieu.desnoyers, laijs, manfred, dipankar, niv, dvhltc, josht Hello! This patch adds stalled-CPU detection to Classic RCU. This capability is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which defaults disabled. This is a debugging feature, not something that non-kernel-hackers would be expected to care about. This feature can detect looping CPUs in !PREEMPT builds and looping CPUs with preemption disabled in PREEMPT builds. This is essentially a port of this functionality from the treercu patch. One current shortcoming: on some systems, stalls are detected during early boot, when we normally would not care about them. My thought is to add a call from late initialization to suppress stall detection until the system is well along its way to being booted, but thought I should check to see if there might already be something for this purpose. (Currently against 2.6.27-rc8, FYI.) Thoughts? Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- include/linux/rcuclassic.h | 9 ++++ kernel/rcuclassic.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 13 ++++++ 3 files changed, 110 insertions(+) diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 4ab8436..9b62e9a 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -40,6 +40,10 @@ #include <linux/cpumask.h> #include <linux/seqlock.h> +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK 3 /* for rcp->seconds_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK 30 /* for rcp->seconds_stall */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { @@ -52,6 +56,11 @@ struct rcu_ctrlblk { spinlock_t lock ____cacheline_internodealigned_in_smp; cpumask_t cpumask; /* CPUs that need to switch in order */ /* for current batch to proceed. */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started in jiffies. */ + unsigned long seconds_stall; + /* Time at which to check for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ } ____cacheline_internodealigned_in_smp; /* Is batch a before batch b ? */ diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index aad93cd..c092ba9 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -118,6 +118,87 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = get_seconds() - rcp->seconds_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), get_seconds(), + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(get_seconds() - rcp->seconds_stall) >= 0) + rcp->seconds_stall = + get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = get_seconds() - rcp->seconds_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -285,6 +366,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) */ smp_wmb(); rcp->cur++; + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -468,6 +550,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp); + /* This cpu has pending rcu entries and the grace period * for them has completed. */ @@ -558,6 +643,9 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { +#ifdef CONFIG_DEBUG_RCU_STALL + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ memset(rdp, 0, sizeof(*rdp)); rdp->curtail = &rdp->curlist; rdp->nxttail = &rdp->nxtlist; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0b50481..9fee969 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. +config RCU_CPU_STALL_DETECTOR + bool "Check for stalled CPUs delaying RCU grace periods" + depends on CLASSIC_RCU + default n + help + This option causes RCU to printk information on which + CPUs are delaying the current grace period, but only when + the grace period extends for excessive time periods. + + Say Y if you want RCU to perform such checks. + + Say N if you are unsure. + config KPROBES_SANITY_TEST bool "Kprobes sanity tests" depends on DEBUG_KERNEL ^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU 2008-10-02 0:36 [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney @ 2008-10-02 8:07 ` Ingo Molnar 2008-10-02 14:05 ` Paul E. McKenney 0 siblings, 1 reply; 17+ messages in thread From: Ingo Molnar @ 2008-10-02 8:07 UTC (permalink / raw) To: Paul E. McKenney Cc: linux-kernel, akpm, mathieu.desnoyers, laijs, manfred, dipankar, niv, dvhltc, josht * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote: > Hello! > > This patch adds stalled-CPU detection to Classic RCU. This capability > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > which defaults disabled. This is a debugging feature, not something > that non-kernel-hackers would be expected to care about. This feature > can detect looping CPUs in !PREEMPT builds and looping CPUs with > preemption disabled in PREEMPT builds. This is essentially a port of > this functionality from the treercu patch. > > One current shortcoming: on some systems, stalls are detected during > early boot, when we normally would not care about them. My thought is > to add a call from late initialization to suppress stall detection > until the system is well along its way to being booted, but thought I > should check to see if there might already be something for this > purpose. could you be a bit more specific, why do those warnings show up and why dont we care about them? There are things like networking that occasionally do an rcu_sync() and a stall could mean a bootup hang. > (Currently against 2.6.27-rc8, FYI.) > > Thoughts? > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> i think this is a very good idea in general - often the question comes up whether a hang seen in the RCU code is indeed caused by RCU or other factors. Could you perhaps rebase it against tip/core/rcu ? [or tip/master for convenience] Ingo ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU 2008-10-02 8:07 ` Ingo Molnar @ 2008-10-02 14:05 ` Paul E. McKenney 2008-10-02 15:10 ` Mathieu Desnoyers 2008-10-02 17:39 ` [PATCH,RFC] " Paul E. McKenney 0 siblings, 2 replies; 17+ messages in thread From: Paul E. McKenney @ 2008-10-02 14:05 UTC (permalink / raw) To: Ingo Molnar Cc: linux-kernel, akpm, mathieu.desnoyers, laijs, manfred, dipankar, niv, dvhltc, josht On Thu, Oct 02, 2008 at 10:07:26AM +0200, Ingo Molnar wrote: > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote: > > > Hello! > > > > This patch adds stalled-CPU detection to Classic RCU. This capability > > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > > which defaults disabled. This is a debugging feature, not something > > that non-kernel-hackers would be expected to care about. This feature > > can detect looping CPUs in !PREEMPT builds and looping CPUs with > > preemption disabled in PREEMPT builds. This is essentially a port of > > this functionality from the treercu patch. > > > > One current shortcoming: on some systems, stalls are detected during > > early boot, when we normally would not care about them. My thought is > > to add a call from late initialization to suppress stall detection > > until the system is well along its way to being booted, but thought I > > should check to see if there might already be something for this > > purpose. > > could you be a bit more specific, why do those warnings show up and why > dont we care about them? There are things like networking that > occasionally do an rcu_sync() and a stall could mean a bootup hang. Hmmm... Good point, I was just falling back on my old "we don't care about RCU stalls in boot-time code" rule from long ago. It is entirely possible (in fact reasonably likely) that this rule no longer applies to Linux as it exists today. So please see below for the console output. Not all systems report this stall. I have recently been running only on Power systems, will fire off on some x86s. My kneejerk reaction was that the "stall" was really due to the clock-setting operation -- RCU stalls are based on get_seconds(). So maybe I need to change the stall-detection code to use jiffies instead. Thoughts? > > (Currently against 2.6.27-rc8, FYI.) > > > > Thoughts? > > > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > > i think this is a very good idea in general - often the question comes > up whether a hang seen in the RCU code is indeed caused by RCU or other > factors. Could you perhaps rebase it against tip/core/rcu ? [or > tip/master for convenience] Will do! Thanx, Paul ------------------------------------------------------------------------ Console output: search for "RCU detected CPU 5 stall" near the end. ------------------------------------------------------------------------ Elapsed time since release of system processors: 7742 mins 58 secs Config file read, 2048 bytes Welcome Welcome to yaboot version 1.3.13-cas (Red Hat 1.3.13.cas-5) Enter "help" to get some basic usage information boot: autobench Please wait, loading kernel... Elf64 kernel loaded... Loading ramdisk... ramdisk loaded at 02b00000, size: 2331 Kbytes OF stdout device is: /vdevice/vty@30000000 Hypertas detected, assuming LPAR ! -\|/-\|/-\|/-\| Elapsed time since release of system processors: 7743 mins 29 secs Config file read, 2048 bytes Welcome Welcome to yaboot version 1.3.13-cas (Red Hat 1.3.13.cas-5) Enter "help" to get some basic usage information boot: autobench Please wait, loading kernel... Elf64 kernel loaded... Loading ramdisk... ramdisk loaded at 02b00000, size: 2331 Kbytes OF stdout device is: /vdevice/vty@30000000 Hypertas detected, assuming LPAR ! command line: ro console=hvc0 autobench_args: root=/dev/sda6 ABAT:1222922465 memory layout at init: alloc_bottom : 0000000002d47000 alloc_top : 0000000008000000 alloc_top_hi : 0000000008000000 rmo_top : 0000000008000000 ram_top : 0000000008000000 Looking for displays instantiating rtas at 0x000000000757a000 ... done boot cpu hw idx 0000000000000000 starting cpu hw idx 0000000000000002... done starting cpu hw idx 0000000000000004... done starting cpu hw idx 0000000000000006... done copying OF device tree ... Building dt strings... Building dt structure... Device tree strings 0x0000000002f48000 -> 0x0000000002f49537 Device tree struct 0x0000000002f4a000 -> 0x0000000002f58000 Calling quiesce ... returning from prom_init Using pSeries machine description Using 1TB segments Found initrd at 0xc000000002b00000:0xc000000002d46c00 console [udbg0] enabled Partition configured for 8 cpus. CPU maps initialized for 2 threads per core Starting Linux PPC64 #1 SMP PREEMPT Thu Oct 2 00:39:54 EDT 2008 ----------------------------------------------------- ppc64_pft_size = 0x19 physicalMemorySize = 0x80000000 htab_hash_mask = 0x3ffff ----------------------------------------------------- Initializing cgroup subsys cpuset Linux version 2.6.27-rc7-autokern1 (root@tundro2.rchland.ibm.com) (gcc version 4.1.2 20071124 (Red Hat 4.1.2-42)) #1 SMP PREEMPT Thu Oct 2 00:39:54 EDT 2008 [boot]0012 Setup Arch EEH: No capable adapters found PPC64 nvram contains 15360 bytes Zone PFN ranges: DMA 0x00000000 -> 0x00080000 Normal 0x00080000 -> 0x00080000 Movable zone start PFN for each node early_node_map[2] active PFN ranges 0: 0x00000000 -> 0x00044000 1: 0x00044000 -> 0x00080000 [boot]0015 Setup Done Built 2 zonelists in Node order, mobility grouping on. Total pages: 517120 Policy zone: DMA Kernel command line: ro console=hvc0 autobench_args: root=/dev/sda6 ABAT:1222922465 Experimental hierarchical RCU implementation. RCU-based detection of stalled CPUs is enabled. Experimental hierarchical RCU init done. [boot]0020 XICS Init [boot]0021 XICS Done PID hash table entries: 4096 (order: 12, 32768 bytes) clocksource: timebase mult[7d0000] shift[22] registered Console: colour dummy device 80x25 console handover: boot [udbg0] -> real [hvc0] Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes) Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes) freeing bootmem node 0 freeing bootmem node 1 Memory: 2038156k/2097152k available (7200k kernel code, 58996k reserved, 1052k data, 513k bss, 292k init) SLUB: Genslabs=13, HWalign=128, Order=0-3, MinObjects=0, CPUs=8, Nodes=16 Calibrating delay loop... 915.45 BogoMIPS (lpj=1830912) Mount-cache hash table entries: 256 Initializing cgroup subsys ns Initializing cgroup subsys cpuacct Processor 1 found. Processor 2 found. Processor 3 found. Processor 4 found. Processor 5 found. Processor 6 found. Processor 7 found. Brought up 8 CPUs net_namespace: 1152 bytes NET: Registered protocol family 16 IBM eBus Device Driver PCI: Probing PCI hardware SCSI subsystem initialized usbcore: registered new interface driver usbfs usbcore: registered new interface driver hub usbcore: registered new device driver usb NET: Registered protocol family 2 IP route cache hash table entries: 65536 (order: 7, 524288 bytes) TCP established hash table entries: 262144 (order: 10, 4194304 bytes) TCP bind hash table entries: 65536 (order: 8, 1048576 bytes) TCP: Hash tables configured (established 262144 bind 65536) TCP reno registered NET: Registered protocol family 1 checking if image is initramfs... it is Freeing initrd memory: 2331k freed IOMMU table initialized, virtual merging enabled audit: initializing netlink socket (disabled) type=2000 audit(1222908206.468:1): initialized HugeTLB registered 16 MB page size, pre-allocated 0 pages HugeTLB registered 16 GB page size, pre-allocated 0 pages HugeTLB registered 64 KB page size, pre-allocated 0 pages Installing knfsd (copyright (C) 1996 okir@monad.swb.de). msgmni has been set to 3985 Block layer SCSI generic (bsg) driver version 0.4 loaded (major 254) io scheduler noop registered io scheduler anticipatory registered (default) io scheduler deadline registered io scheduler cfq registered Generic RTC Driver v1.07 Serial: 8250/16550 driver4 ports, IRQ sharing disabled brd: module loaded loop: module loaded Intel(R) PRO/1000 Network Driver - version 7.3.20-k3-NAPI Copyright (c) 1999-2006 Intel Corporation. pcnet32.c:v1.35 21.Apr.2008 tsbogend@alpha.franken.de e100: Intel(R) PRO/100 Network Driver, 3.5.23-k4-NAPI e100: Copyright(c) 1999-2006 Intel Corporation console [netcon0] enabled netconsole: network logging started Uniform Multi-Platform E-IDE driver ipr: IBM Power RAID SCSI Device Driver version: 2.4.1 (April 24, 2007) ibmvscsi 30000003: SRP_VERSION: 16.a scsi0 : IBM POWER Virtual SCSI Adapter 1.5.8 ibmvscsi 30000003: partner initialization complete ibmvscsi 30000003: sent SRP login ibmvscsi 30000003: SRP_LOGIN succeeded ibmvscsi 30000003: host srp version: 16.a, host partition tundro1 (1), OS 2, max io 262144 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi 0:0:1:0: Direct-Access IBM VDASD blkdev 0001 PQ: 0 ANSI: 4 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 scsi scan: INQUIRY result too short (5), using 36 st: Version 20080504, fixed bufsize 32768, s/g segs 256 Driver 'st' needs updating - please use bus_type methods Driver 'sd' needs updating - please use bus_type methods sd 0:0:1:0: [sda] 73400922 512-byte hardware sectors (37581 MB) sd 0:0:1:0: [sda] Write Protect is off sd 0:0:1:0: [sda] Write cache: disabled, read cache: disabled, doesn't support DPO or FUA sd 0:0:1:0: [sda] 73400922 512-byte hardware sectors (37581 MB) sd 0:0:1:0: [sda] Write Protect is off sd 0:0:1:0: [sda] Write cache: disabled, read cache: disabled, doesn't support DPO or FUA sda: sda1 sda2 sda3 sda4 < sda5 sda6 sda7 > sd 0:0:1:0: [sda] Attached SCSI disk Driver 'sr' needs updating - please use bus_type methods sd 0:0:1:0: Attached scsi generic sg0 type 0 Initializing USB Mass Storage driver... usbcore: registered new interface driver usb-storage USB Mass Storage support registered. mice: PS/2 mouse device common for all mice md: linear personality registered for level -1 md: raid0 personality registered for level 0 md: raid1 personality registered for level 1 device-mapper: ioctl: 4.14.0-ioctl (2008-04-23) initialised: dm-devel@redhat.com usbcore: registered new interface driver hiddev usbcore: registered new interface driver usbhid usbhid: v2.6:USB HID core driver IPv4 over IPv4 tunneling driver TCP cubic registered NET: Registered protocol family 17 RPC: Registered udp transport module. RPC: Registered tcp transport module. registered taskstats version 1 Freeing unused kernel memory: 292k freed Red Hat nash version 5.1.19.6 starting Mounting proc filesystem Mounting sysfs filesystem Creating /dev Creating initial device nodes Setting up hotplug. Creating block device nodes. Loading ehea.ko module IBM eHEA ethernet device driver (Release EHEA_0092) Scanning and configuring dmraid supported devices Creating root device. Mounting root filesystem. kjournald starting. Commit interval 5 seconds EXT3-fs: mounted filesystem with ordered data mode. Setting up other filesystems. Setting up new root fs no fstab.sys, mounting internal defaults Switching to new root and running init. unmounting old /dev unmounting old /proc unmounting old /sys INIT: version 2.86 booting Welcome to Red Hat Enterprise Linux Server Press 'I' to enter interactive startup. RCU detected CPU 5 stall (t=1222922609/2) Call Trace: [c000000042d03920] [c00000000000fdec] .show_stack+0x70/0x184 (unreliable) [c000000042d039d0] [c0000000000a1de0] .__rcu_pending+0x98/0x2ec [c000000042d03a70] [c0000000000a206c] .rcu_pending+0x38/0x88 [c000000042d03af0] [c00000000006363c] .update_process_times+0x48/0x94 [c000000042d03b80] [c00000000007bae4] .tick_sched_timer+0xbc/0x118 [c000000042d03c20] [c000000000073a88] .__run_hrtimer+0x78/0x118 [c000000042d03cc0] [c000000000074b3c] .hrtimer_interrupt+0x128/0x1e4 [c000000042d03d90] [c00000000002304c] .timer_interrupt+0xe0/0x15c [c000000042d03e30] [c000000000003700] decrementer_common+0x100/0x180 Setting clock (localtime): Thu Oct 2 00:43:30 EDT 2008 [ OK ] Starting udev: [ OK ] Setting hostname tundro2.rchland.ibm.com: [ OK ] No devices found Setting up Logical Volume Management: No volume groups found [ OK ] Checking filesystems Checking all file systems. [/sbin/fsck.ext3 (1) -- /] fsck.ext3 -a /dev/sda6 /dev/sda6: Superblock last mount time is in the future. FIXED. /dev/sda6: clean, 134758/1281696 files, 853722/2560336 blocks [ OK ] Remounting root filesystem in read-write mode: [ OK ] Mounting local filesystems: [ OK ] Enabling local filesystem quotas: quotaon: Warning: No quota format detected in the kernel. [ OK ] Enabling /etc/fstab swaps: swapon: /dev/sda5: Invalid argument [FAILED] INIT: Entering runlevel: 3 Entering non-interactive startup Starting boot.loadmodules: [ OK ] Starting background readahead: [ OK ] Bringing up loopback interface: [ OK ] Bringing up interface eth0: [ OK ] Starting auditd: [ OK ] Starting system logger: [ OK ] Starting kernel logger: [ OK ] Starting irqbalance: [ OK ] Starting RPC idmapd: FATAL: Module sunrpc not found. FATAL: Error running install command for sunrpc Error: RPC MTAB does not exist. Starting system message bus: [ OK ] Starting Bluetooth services:[ OK ][ OK ]Can't open RFCOMM control socket: Address family not supported by protocol Mounting other filesystems: [ OK ] Starting PC/SC smart card daemon (pcscd): [ OK ] Starting hidd: Can't open HIDP control socket: Address family not supported by protocol [FAILED] Starting autofs: Loading autofs4: [ OK ] Starting automount: [ OK ] [ OK ] Starting iprinit: Starting ipr initialization daemon[ OK ] [ OK ] Starting iprupdate: Checking ipr microcode levels Completed ipr microcode updates[ OK ] [ OK ] Starting iprdump: Starting ipr dump daemon[ OK ] [ OK ] Starting sshd: [ OK ] Starting cups: [ OK ] Starting xinetd: [ OK ] Starting vpdupdate: [ OK ] Starting console mouse services: [ OK ] Starting crond: [ OK ] Starting xfs: [ OK ] Starting anacron: [ OK ] Starting atd: [ OK ] Starting yum-updatesd: [ OK ] Starting Avahi daemon... [ OK ] Starting HAL daemon: [ OK ] Starting smartd: [ OK ] Red Hat Enterprise Linux Server release 5.2 (Tikanga) Kernel 2.6.27-rc7-autokern1 on an ppc64 tundro2.rchland.ibm.com login:-- 0:conmux-control -- time-stamp -- Oct/01/08 21:45:47 -- -- 0:conmux-control -- time-stamp -- Oct/01/08 22:01:03 -- (bot:conmon-payload) disconnected ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU 2008-10-02 14:05 ` Paul E. McKenney @ 2008-10-02 15:10 ` Mathieu Desnoyers 2008-10-02 17:43 ` Paul E. McKenney 2008-10-02 17:39 ` [PATCH,RFC] " Paul E. McKenney 1 sibling, 1 reply; 17+ messages in thread From: Mathieu Desnoyers @ 2008-10-02 15:10 UTC (permalink / raw) To: Paul E. McKenney Cc: Ingo Molnar, linux-kernel, akpm, laijs, manfred, dipankar, niv, dvhltc, josht * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote: > On Thu, Oct 02, 2008 at 10:07:26AM +0200, Ingo Molnar wrote: > > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote: > > > > > Hello! > > > > > > This patch adds stalled-CPU detection to Classic RCU. This capability > > > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > > > which defaults disabled. This is a debugging feature, not something > > > that non-kernel-hackers would be expected to care about. This feature > > > can detect looping CPUs in !PREEMPT builds and looping CPUs with > > > preemption disabled in PREEMPT builds. This is essentially a port of > > > this functionality from the treercu patch. > > > > > > One current shortcoming: on some systems, stalls are detected during > > > early boot, when we normally would not care about them. My thought is > > > to add a call from late initialization to suppress stall detection > > > until the system is well along its way to being booted, but thought I > > > should check to see if there might already be something for this > > > purpose. > > > > could you be a bit more specific, why do those warnings show up and why > > dont we care about them? There are things like networking that > > occasionally do an rcu_sync() and a stall could mean a bootup hang. > > Hmmm... > > Good point, I was just falling back on my old "we don't care about RCU > stalls in boot-time code" rule from long ago. It is entirely possible > (in fact reasonably likely) that this rule no longer applies to Linux as > it exists today. So please see below for the console output. Not all > systems report this stall. I have recently been running only on Power > systems, will fire off on some x86s. My kneejerk reaction was that the > "stall" was really due to the clock-setting operation -- RCU stalls are > based on get_seconds(). > > So maybe I need to change the stall-detection code to use jiffies > instead. > > Thoughts? > You could possibly use get monotonic time, which should hopefully be a bit better than its NTP-corrected alternatives. Also, do you have all the tools you need to poinpoint the precise source of RCU stalls ? I haven't looked at RCU trace, but I should say adding a new tracepoint to instrument RCU callback execution at boot time and to trace it with LTTng is close to trivial. (actually, very early kernel boot is not so trivial as we would have to create a small kernel module to activate lttng tracing, but tracing the early phases of userspace bootup is trivial given we can use the userspace API). Mathieu > > > (Currently against 2.6.27-rc8, FYI.) > > > > > > Thoughts? > > > > > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > > > > i think this is a very good idea in general - often the question comes > > up whether a hang seen in the RCU code is indeed caused by RCU or other > > factors. Could you perhaps rebase it against tip/core/rcu ? [or > > tip/master for convenience] > > Will do! > > Thanx, Paul > > ------------------------------------------------------------------------ > Console output: search for "RCU detected CPU 5 stall" near the end. > ------------------------------------------------------------------------ > > Elapsed time since release of system processors: 7742 mins 58 secs > > Config file read, 2048 bytes > Welcome > Welcome to yaboot version 1.3.13-cas (Red Hat 1.3.13.cas-5) > Enter "help" to get some basic usage information > boot: autobench > Please wait, loading kernel... > Elf64 kernel loaded... > Loading ramdisk... > ramdisk loaded at 02b00000, size: 2331 Kbytes > OF stdout device is: /vdevice/vty@30000000 > Hypertas detected, assuming LPAR ! > -\|/-\|/-\|/-\| > Elapsed time since release of system processors: 7743 mins 29 secs > > Config file read, 2048 bytes > Welcome > Welcome to yaboot version 1.3.13-cas (Red Hat 1.3.13.cas-5) > Enter "help" to get some basic usage information > boot: autobench > Please wait, loading kernel... > Elf64 kernel loaded... > Loading ramdisk... > ramdisk loaded at 02b00000, size: 2331 Kbytes > OF stdout device is: /vdevice/vty@30000000 > Hypertas detected, assuming LPAR ! > command line: ro console=hvc0 autobench_args: root=/dev/sda6 > ABAT:1222922465 > memory layout at init: > alloc_bottom : 0000000002d47000 > alloc_top : 0000000008000000 > alloc_top_hi : 0000000008000000 > rmo_top : 0000000008000000 > ram_top : 0000000008000000 > Looking for displays > instantiating rtas at 0x000000000757a000 ... done > boot cpu hw idx 0000000000000000 > starting cpu hw idx 0000000000000002... done > starting cpu hw idx 0000000000000004... done > starting cpu hw idx 0000000000000006... done > copying OF device tree ... > Building dt strings... > Building dt structure... > Device tree strings 0x0000000002f48000 -> 0x0000000002f49537 > Device tree struct 0x0000000002f4a000 -> 0x0000000002f58000 > Calling quiesce ... > returning from prom_init > Using pSeries machine description > Using 1TB segments > Found initrd at 0xc000000002b00000:0xc000000002d46c00 > console [udbg0] enabled > Partition configured for 8 cpus. > CPU maps initialized for 2 threads per core > Starting Linux PPC64 #1 SMP PREEMPT Thu Oct 2 00:39:54 EDT 2008 > ----------------------------------------------------- > ppc64_pft_size = 0x19 > physicalMemorySize = 0x80000000 > htab_hash_mask = 0x3ffff > ----------------------------------------------------- > Initializing cgroup subsys cpuset > Linux version 2.6.27-rc7-autokern1 (root@tundro2.rchland.ibm.com) (gcc > version 4.1.2 20071124 (Red Hat 4.1.2-42)) #1 SMP PREEMPT Thu Oct 2 > 00:39:54 EDT 2008 > [boot]0012 Setup Arch > EEH: No capable adapters found > PPC64 nvram contains 15360 bytes > Zone PFN ranges: > DMA 0x00000000 -> 0x00080000 > Normal 0x00080000 -> 0x00080000 > Movable zone start PFN for each node > early_node_map[2] active PFN ranges > 0: 0x00000000 -> 0x00044000 > 1: 0x00044000 -> 0x00080000 > [boot]0015 Setup Done > Built 2 zonelists in Node order, mobility grouping on. Total pages: > 517120 > Policy zone: DMA > Kernel command line: ro console=hvc0 autobench_args: root=/dev/sda6 > ABAT:1222922465 > Experimental hierarchical RCU implementation. > RCU-based detection of stalled CPUs is enabled. > Experimental hierarchical RCU init done. > [boot]0020 XICS Init > [boot]0021 XICS Done > PID hash table entries: 4096 (order: 12, 32768 bytes) > clocksource: timebase mult[7d0000] shift[22] registered > Console: colour dummy device 80x25 > console handover: boot [udbg0] -> real [hvc0] > Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes) > Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes) > freeing bootmem node 0 > freeing bootmem node 1 > Memory: 2038156k/2097152k available (7200k kernel code, 58996k reserved, > 1052k data, 513k bss, 292k init) > SLUB: Genslabs=13, HWalign=128, Order=0-3, MinObjects=0, CPUs=8, > Nodes=16 > Calibrating delay loop... 915.45 BogoMIPS (lpj=1830912) > Mount-cache hash table entries: 256 > Initializing cgroup subsys ns > Initializing cgroup subsys cpuacct > Processor 1 found. > Processor 2 found. > Processor 3 found. > Processor 4 found. > Processor 5 found. > Processor 6 found. > Processor 7 found. > Brought up 8 CPUs > net_namespace: 1152 bytes > NET: Registered protocol family 16 > IBM eBus Device Driver > PCI: Probing PCI hardware > SCSI subsystem initialized > usbcore: registered new interface driver usbfs > usbcore: registered new interface driver hub > usbcore: registered new device driver usb > NET: Registered protocol family 2 > IP route cache hash table entries: 65536 (order: 7, 524288 bytes) > TCP established hash table entries: 262144 (order: 10, 4194304 bytes) > TCP bind hash table entries: 65536 (order: 8, 1048576 bytes) > TCP: Hash tables configured (established 262144 bind 65536) > TCP reno registered > NET: Registered protocol family 1 > checking if image is initramfs... it is > Freeing initrd memory: 2331k freed > IOMMU table initialized, virtual merging enabled > audit: initializing netlink socket (disabled) > type=2000 audit(1222908206.468:1): initialized > HugeTLB registered 16 MB page size, pre-allocated 0 pages > HugeTLB registered 16 GB page size, pre-allocated 0 pages > HugeTLB registered 64 KB page size, pre-allocated 0 pages > Installing knfsd (copyright (C) 1996 okir@monad.swb.de). > msgmni has been set to 3985 > Block layer SCSI generic (bsg) driver version 0.4 loaded (major 254) > io scheduler noop registered > io scheduler anticipatory registered (default) > io scheduler deadline registered > io scheduler cfq registered > Generic RTC Driver v1.07 > Serial: 8250/16550 driver4 ports, IRQ sharing disabled > brd: module loaded > loop: module loaded > Intel(R) PRO/1000 Network Driver - version 7.3.20-k3-NAPI > Copyright (c) 1999-2006 Intel Corporation. > pcnet32.c:v1.35 21.Apr.2008 tsbogend@alpha.franken.de > e100: Intel(R) PRO/100 Network Driver, 3.5.23-k4-NAPI > e100: Copyright(c) 1999-2006 Intel Corporation > console [netcon0] enabled > netconsole: network logging started > Uniform Multi-Platform E-IDE driver > ipr: IBM Power RAID SCSI Device Driver version: 2.4.1 (April 24, 2007) > ibmvscsi 30000003: SRP_VERSION: 16.a > scsi0 : IBM POWER Virtual SCSI Adapter 1.5.8 > ibmvscsi 30000003: partner initialization complete > ibmvscsi 30000003: sent SRP login > ibmvscsi 30000003: SRP_LOGIN succeeded > ibmvscsi 30000003: host srp version: 16.a, host partition tundro1 (1), > OS 2, max io 262144 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi 0:0:1:0: Direct-Access IBM VDASD blkdev 0001 PQ: 0 > ANSI: 4 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > scsi scan: INQUIRY result too short (5), using 36 > st: Version 20080504, fixed bufsize 32768, s/g segs 256 > Driver 'st' needs updating - please use bus_type methods > Driver 'sd' needs updating - please use bus_type methods > sd 0:0:1:0: [sda] 73400922 512-byte hardware sectors (37581 MB) > sd 0:0:1:0: [sda] Write Protect is off > sd 0:0:1:0: [sda] Write cache: disabled, read cache: disabled, doesn't > support DPO or FUA > sd 0:0:1:0: [sda] 73400922 512-byte hardware sectors (37581 MB) > sd 0:0:1:0: [sda] Write Protect is off > sd 0:0:1:0: [sda] Write cache: disabled, read cache: disabled, doesn't > support DPO or FUA > sda: sda1 sda2 sda3 sda4 < sda5 sda6 sda7 > > sd 0:0:1:0: [sda] Attached SCSI disk > Driver 'sr' needs updating - please use bus_type methods > sd 0:0:1:0: Attached scsi generic sg0 type 0 > Initializing USB Mass Storage driver... > usbcore: registered new interface driver usb-storage > USB Mass Storage support registered. > mice: PS/2 mouse device common for all mice > md: linear personality registered for level -1 > md: raid0 personality registered for level 0 > md: raid1 personality registered for level 1 > device-mapper: ioctl: 4.14.0-ioctl (2008-04-23) initialised: > dm-devel@redhat.com > usbcore: registered new interface driver hiddev > usbcore: registered new interface driver usbhid > usbhid: v2.6:USB HID core driver > IPv4 over IPv4 tunneling driver > TCP cubic registered > NET: Registered protocol family 17 > RPC: Registered udp transport module. > RPC: Registered tcp transport module. > registered taskstats version 1 > Freeing unused kernel memory: 292k freed > Red Hat nash version 5.1.19.6 starting > Mounting proc filesystem > Mounting sysfs filesystem > Creating /dev > Creating initial device nodes > Setting up hotplug. > Creating block device nodes. > Loading ehea.ko module > IBM eHEA ethernet device driver (Release EHEA_0092) > Scanning and configuring dmraid supported devices > Creating root device. > Mounting root filesystem. > kjournald starting. Commit interval 5 seconds > EXT3-fs: mounted filesystem with ordered data mode. > Setting up other filesystems. > Setting up new root fs > no fstab.sys, mounting internal defaults > Switching to new root and running init. > unmounting old /dev > unmounting old /proc > unmounting old /sys > INIT: version 2.86 booting > Welcome to Red Hat Enterprise Linux Server > Press 'I' to enter interactive startup. > RCU detected CPU 5 stall (t=1222922609/2) > Call Trace: > [c000000042d03920] [c00000000000fdec] .show_stack+0x70/0x184 > (unreliable) > [c000000042d039d0] [c0000000000a1de0] .__rcu_pending+0x98/0x2ec > [c000000042d03a70] [c0000000000a206c] .rcu_pending+0x38/0x88 > [c000000042d03af0] [c00000000006363c] .update_process_times+0x48/0x94 > [c000000042d03b80] [c00000000007bae4] .tick_sched_timer+0xbc/0x118 > [c000000042d03c20] [c000000000073a88] .__run_hrtimer+0x78/0x118 > [c000000042d03cc0] [c000000000074b3c] .hrtimer_interrupt+0x128/0x1e4 > [c000000042d03d90] [c00000000002304c] .timer_interrupt+0xe0/0x15c > [c000000042d03e30] [c000000000003700] decrementer_common+0x100/0x180 > Setting clock (localtime): Thu Oct 2 00:43:30 EDT 2008 [ OK ] > Starting udev: [ OK ] > Setting hostname tundro2.rchland.ibm.com: [ OK ] > No devices found > Setting up Logical Volume Management: No volume groups found > [ OK ] > Checking filesystems > Checking all file systems. > [/sbin/fsck.ext3 (1) -- /] fsck.ext3 -a /dev/sda6 > /dev/sda6: Superblock last mount time is in the future. FIXED. > /dev/sda6: clean, 134758/1281696 files, 853722/2560336 blocks > [ OK ] > Remounting root filesystem in read-write mode: [ OK ] > Mounting local filesystems: [ OK ] > Enabling local filesystem quotas: quotaon: Warning: No quota format > detected in the kernel. > [ OK ] > Enabling /etc/fstab swaps: swapon: /dev/sda5: Invalid argument > [FAILED] > INIT: Entering runlevel: 3 > Entering non-interactive startup > Starting boot.loadmodules: [ OK ] > Starting background readahead: [ OK ] > Bringing up loopback interface: [ OK ] > Bringing up interface eth0: [ OK ] > Starting auditd: [ OK ] > Starting system logger: [ OK ] > Starting kernel logger: [ OK ] > Starting irqbalance: [ OK ] > Starting RPC idmapd: FATAL: Module sunrpc not found. > FATAL: Error running install command for sunrpc > Error: RPC MTAB does not exist. > Starting system message bus: [ OK ] > Starting Bluetooth services:[ OK ][ OK ]Can't open RFCOMM control > socket: Address family not supported by protocol > > Mounting other filesystems: [ OK ] > Starting PC/SC smart card daemon (pcscd): [ OK ] > Starting hidd: Can't open HIDP control socket: Address family not > supported by protocol > [FAILED] > Starting autofs: Loading autofs4: [ OK ] > Starting automount: [ OK ] > [ OK ] > Starting iprinit: Starting ipr initialization daemon[ OK ] > [ OK ] > Starting iprupdate: Checking ipr microcode levels > Completed ipr microcode updates[ OK ] > [ OK ] > Starting iprdump: Starting ipr dump daemon[ OK ] > [ OK ] > Starting sshd: [ OK ] > Starting cups: [ OK ] > Starting xinetd: [ OK ] > Starting vpdupdate: [ OK ] > Starting console mouse services: [ OK ] > Starting crond: [ OK ] > Starting xfs: [ OK ] > Starting anacron: [ OK ] > Starting atd: [ OK ] > Starting yum-updatesd: [ OK ] > Starting Avahi daemon... [ OK ] > Starting HAL daemon: [ OK ] > Starting smartd: [ OK ] > > Red Hat Enterprise Linux Server release 5.2 (Tikanga) > Kernel 2.6.27-rc7-autokern1 on an ppc64 > > tundro2.rchland.ibm.com login:-- 0:conmux-control -- time-stamp -- > Oct/01/08 21:45:47 -- > -- 0:conmux-control -- time-stamp -- Oct/01/08 22:01:03 -- > (bot:conmon-payload) disconnected -- Mathieu Desnoyers OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU 2008-10-02 15:10 ` Mathieu Desnoyers @ 2008-10-02 17:43 ` Paul E. McKenney 2008-10-02 18:51 ` [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU (LTTng support) Mathieu Desnoyers 0 siblings, 1 reply; 17+ messages in thread From: Paul E. McKenney @ 2008-10-02 17:43 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Ingo Molnar, linux-kernel, akpm, laijs, manfred, dipankar, niv, dvhltc, josht On Thu, Oct 02, 2008 at 11:10:46AM -0400, Mathieu Desnoyers wrote: > * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote: > > On Thu, Oct 02, 2008 at 10:07:26AM +0200, Ingo Molnar wrote: > > > > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote: > > > > > > > Hello! > > > > > > > > This patch adds stalled-CPU detection to Classic RCU. This capability > > > > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > > > > which defaults disabled. This is a debugging feature, not something > > > > that non-kernel-hackers would be expected to care about. This feature > > > > can detect looping CPUs in !PREEMPT builds and looping CPUs with > > > > preemption disabled in PREEMPT builds. This is essentially a port of > > > > this functionality from the treercu patch. > > > > > > > > One current shortcoming: on some systems, stalls are detected during > > > > early boot, when we normally would not care about them. My thought is > > > > to add a call from late initialization to suppress stall detection > > > > until the system is well along its way to being booted, but thought I > > > > should check to see if there might already be something for this > > > > purpose. > > > > > > could you be a bit more specific, why do those warnings show up and why > > > dont we care about them? There are things like networking that > > > occasionally do an rcu_sync() and a stall could mean a bootup hang. > > > > Hmmm... > > > > Good point, I was just falling back on my old "we don't care about RCU > > stalls in boot-time code" rule from long ago. It is entirely possible > > (in fact reasonably likely) that this rule no longer applies to Linux as > > it exists today. So please see below for the console output. Not all > > systems report this stall. I have recently been running only on Power > > systems, will fire off on some x86s. My kneejerk reaction was that the > > "stall" was really due to the clock-setting operation -- RCU stalls are > > based on get_seconds(). > > > > So maybe I need to change the stall-detection code to use jiffies > > instead. > > > > Thoughts? > > > > You could possibly use get monotonic time, which should hopefully be a > bit better than its NTP-corrected alternatives. OK. I thought jiffies were monotonic (though not perfectly correlated to the passage of time). This is a warning timeout, so does not need better than a few percent accuracy, as long as time never jumps too far. But what API did you have in mind? > Also, do you have all the tools you need to poinpoint the precise source > of RCU stalls ? I haven't looked at RCU trace, but I should say adding a > new tracepoint to instrument RCU callback execution at boot time and to > trace it with LTTng is close to trivial. (actually, very early kernel > boot is not so trivial as we would have to create a small kernel module > to activate lttng tracing, but tracing the early phases of userspace > bootup is trivial given we can use the userspace API). I just dump stack, which historically worked pretty well. Could you please tell me more about what you think should be added? Thanx, Paul > Mathieu > > > > > (Currently against 2.6.27-rc8, FYI.) > > > > > > > > Thoughts? > > > > > > > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > > > > > > i think this is a very good idea in general - often the question comes > > > up whether a hang seen in the RCU code is indeed caused by RCU or other > > > factors. Could you perhaps rebase it against tip/core/rcu ? [or > > > tip/master for convenience] > > > > Will do! > > > > Thanx, Paul > > > > ------------------------------------------------------------------------ > > Console output: search for "RCU detected CPU 5 stall" near the end. > > ------------------------------------------------------------------------ > > > > Elapsed time since release of system processors: 7742 mins 58 secs > > > > Config file read, 2048 bytes > > Welcome > > Welcome to yaboot version 1.3.13-cas (Red Hat 1.3.13.cas-5) > > Enter "help" to get some basic usage information > > boot: autobench > > Please wait, loading kernel... > > Elf64 kernel loaded... > > Loading ramdisk... > > ramdisk loaded at 02b00000, size: 2331 Kbytes > > OF stdout device is: /vdevice/vty@30000000 > > Hypertas detected, assuming LPAR ! > > -\|/-\|/-\|/-\| > > Elapsed time since release of system processors: 7743 mins 29 secs > > > > Config file read, 2048 bytes > > Welcome > > Welcome to yaboot version 1.3.13-cas (Red Hat 1.3.13.cas-5) > > Enter "help" to get some basic usage information > > boot: autobench > > Please wait, loading kernel... > > Elf64 kernel loaded... > > Loading ramdisk... > > ramdisk loaded at 02b00000, size: 2331 Kbytes > > OF stdout device is: /vdevice/vty@30000000 > > Hypertas detected, assuming LPAR ! > > command line: ro console=hvc0 autobench_args: root=/dev/sda6 > > ABAT:1222922465 > > memory layout at init: > > alloc_bottom : 0000000002d47000 > > alloc_top : 0000000008000000 > > alloc_top_hi : 0000000008000000 > > rmo_top : 0000000008000000 > > ram_top : 0000000008000000 > > Looking for displays > > instantiating rtas at 0x000000000757a000 ... done > > boot cpu hw idx 0000000000000000 > > starting cpu hw idx 0000000000000002... done > > starting cpu hw idx 0000000000000004... done > > starting cpu hw idx 0000000000000006... done > > copying OF device tree ... > > Building dt strings... > > Building dt structure... > > Device tree strings 0x0000000002f48000 -> 0x0000000002f49537 > > Device tree struct 0x0000000002f4a000 -> 0x0000000002f58000 > > Calling quiesce ... > > returning from prom_init > > Using pSeries machine description > > Using 1TB segments > > Found initrd at 0xc000000002b00000:0xc000000002d46c00 > > console [udbg0] enabled > > Partition configured for 8 cpus. > > CPU maps initialized for 2 threads per core > > Starting Linux PPC64 #1 SMP PREEMPT Thu Oct 2 00:39:54 EDT 2008 > > ----------------------------------------------------- > > ppc64_pft_size = 0x19 > > physicalMemorySize = 0x80000000 > > htab_hash_mask = 0x3ffff > > ----------------------------------------------------- > > Initializing cgroup subsys cpuset > > Linux version 2.6.27-rc7-autokern1 (root@tundro2.rchland.ibm.com) (gcc > > version 4.1.2 20071124 (Red Hat 4.1.2-42)) #1 SMP PREEMPT Thu Oct 2 > > 00:39:54 EDT 2008 > > [boot]0012 Setup Arch > > EEH: No capable adapters found > > PPC64 nvram contains 15360 bytes > > Zone PFN ranges: > > DMA 0x00000000 -> 0x00080000 > > Normal 0x00080000 -> 0x00080000 > > Movable zone start PFN for each node > > early_node_map[2] active PFN ranges > > 0: 0x00000000 -> 0x00044000 > > 1: 0x00044000 -> 0x00080000 > > [boot]0015 Setup Done > > Built 2 zonelists in Node order, mobility grouping on. Total pages: > > 517120 > > Policy zone: DMA > > Kernel command line: ro console=hvc0 autobench_args: root=/dev/sda6 > > ABAT:1222922465 > > Experimental hierarchical RCU implementation. > > RCU-based detection of stalled CPUs is enabled. > > Experimental hierarchical RCU init done. > > [boot]0020 XICS Init > > [boot]0021 XICS Done > > PID hash table entries: 4096 (order: 12, 32768 bytes) > > clocksource: timebase mult[7d0000] shift[22] registered > > Console: colour dummy device 80x25 > > console handover: boot [udbg0] -> real [hvc0] > > Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes) > > Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes) > > freeing bootmem node 0 > > freeing bootmem node 1 > > Memory: 2038156k/2097152k available (7200k kernel code, 58996k reserved, > > 1052k data, 513k bss, 292k init) > > SLUB: Genslabs=13, HWalign=128, Order=0-3, MinObjects=0, CPUs=8, > > Nodes=16 > > Calibrating delay loop... 915.45 BogoMIPS (lpj=1830912) > > Mount-cache hash table entries: 256 > > Initializing cgroup subsys ns > > Initializing cgroup subsys cpuacct > > Processor 1 found. > > Processor 2 found. > > Processor 3 found. > > Processor 4 found. > > Processor 5 found. > > Processor 6 found. > > Processor 7 found. > > Brought up 8 CPUs > > net_namespace: 1152 bytes > > NET: Registered protocol family 16 > > IBM eBus Device Driver > > PCI: Probing PCI hardware > > SCSI subsystem initialized > > usbcore: registered new interface driver usbfs > > usbcore: registered new interface driver hub > > usbcore: registered new device driver usb > > NET: Registered protocol family 2 > > IP route cache hash table entries: 65536 (order: 7, 524288 bytes) > > TCP established hash table entries: 262144 (order: 10, 4194304 bytes) > > TCP bind hash table entries: 65536 (order: 8, 1048576 bytes) > > TCP: Hash tables configured (established 262144 bind 65536) > > TCP reno registered > > NET: Registered protocol family 1 > > checking if image is initramfs... it is > > Freeing initrd memory: 2331k freed > > IOMMU table initialized, virtual merging enabled > > audit: initializing netlink socket (disabled) > > type=2000 audit(1222908206.468:1): initialized > > HugeTLB registered 16 MB page size, pre-allocated 0 pages > > HugeTLB registered 16 GB page size, pre-allocated 0 pages > > HugeTLB registered 64 KB page size, pre-allocated 0 pages > > Installing knfsd (copyright (C) 1996 okir@monad.swb.de). > > msgmni has been set to 3985 > > Block layer SCSI generic (bsg) driver version 0.4 loaded (major 254) > > io scheduler noop registered > > io scheduler anticipatory registered (default) > > io scheduler deadline registered > > io scheduler cfq registered > > Generic RTC Driver v1.07 > > Serial: 8250/16550 driver4 ports, IRQ sharing disabled > > brd: module loaded > > loop: module loaded > > Intel(R) PRO/1000 Network Driver - version 7.3.20-k3-NAPI > > Copyright (c) 1999-2006 Intel Corporation. > > pcnet32.c:v1.35 21.Apr.2008 tsbogend@alpha.franken.de > > e100: Intel(R) PRO/100 Network Driver, 3.5.23-k4-NAPI > > e100: Copyright(c) 1999-2006 Intel Corporation > > console [netcon0] enabled > > netconsole: network logging started > > Uniform Multi-Platform E-IDE driver > > ipr: IBM Power RAID SCSI Device Driver version: 2.4.1 (April 24, 2007) > > ibmvscsi 30000003: SRP_VERSION: 16.a > > scsi0 : IBM POWER Virtual SCSI Adapter 1.5.8 > > ibmvscsi 30000003: partner initialization complete > > ibmvscsi 30000003: sent SRP login > > ibmvscsi 30000003: SRP_LOGIN succeeded > > ibmvscsi 30000003: host srp version: 16.a, host partition tundro1 (1), > > OS 2, max io 262144 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi 0:0:1:0: Direct-Access IBM VDASD blkdev 0001 PQ: 0 > > ANSI: 4 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > scsi scan: INQUIRY result too short (5), using 36 > > st: Version 20080504, fixed bufsize 32768, s/g segs 256 > > Driver 'st' needs updating - please use bus_type methods > > Driver 'sd' needs updating - please use bus_type methods > > sd 0:0:1:0: [sda] 73400922 512-byte hardware sectors (37581 MB) > > sd 0:0:1:0: [sda] Write Protect is off > > sd 0:0:1:0: [sda] Write cache: disabled, read cache: disabled, doesn't > > support DPO or FUA > > sd 0:0:1:0: [sda] 73400922 512-byte hardware sectors (37581 MB) > > sd 0:0:1:0: [sda] Write Protect is off > > sd 0:0:1:0: [sda] Write cache: disabled, read cache: disabled, doesn't > > support DPO or FUA > > sda: sda1 sda2 sda3 sda4 < sda5 sda6 sda7 > > > sd 0:0:1:0: [sda] Attached SCSI disk > > Driver 'sr' needs updating - please use bus_type methods > > sd 0:0:1:0: Attached scsi generic sg0 type 0 > > Initializing USB Mass Storage driver... > > usbcore: registered new interface driver usb-storage > > USB Mass Storage support registered. > > mice: PS/2 mouse device common for all mice > > md: linear personality registered for level -1 > > md: raid0 personality registered for level 0 > > md: raid1 personality registered for level 1 > > device-mapper: ioctl: 4.14.0-ioctl (2008-04-23) initialised: > > dm-devel@redhat.com > > usbcore: registered new interface driver hiddev > > usbcore: registered new interface driver usbhid > > usbhid: v2.6:USB HID core driver > > IPv4 over IPv4 tunneling driver > > TCP cubic registered > > NET: Registered protocol family 17 > > RPC: Registered udp transport module. > > RPC: Registered tcp transport module. > > registered taskstats version 1 > > Freeing unused kernel memory: 292k freed > > Red Hat nash version 5.1.19.6 starting > > Mounting proc filesystem > > Mounting sysfs filesystem > > Creating /dev > > Creating initial device nodes > > Setting up hotplug. > > Creating block device nodes. > > Loading ehea.ko module > > IBM eHEA ethernet device driver (Release EHEA_0092) > > Scanning and configuring dmraid supported devices > > Creating root device. > > Mounting root filesystem. > > kjournald starting. Commit interval 5 seconds > > EXT3-fs: mounted filesystem with ordered data mode. > > Setting up other filesystems. > > Setting up new root fs > > no fstab.sys, mounting internal defaults > > Switching to new root and running init. > > unmounting old /dev > > unmounting old /proc > > unmounting old /sys > > INIT: version 2.86 booting > > Welcome to Red Hat Enterprise Linux Server > > Press 'I' to enter interactive startup. > > RCU detected CPU 5 stall (t=1222922609/2) > > Call Trace: > > [c000000042d03920] [c00000000000fdec] .show_stack+0x70/0x184 > > (unreliable) > > [c000000042d039d0] [c0000000000a1de0] .__rcu_pending+0x98/0x2ec > > [c000000042d03a70] [c0000000000a206c] .rcu_pending+0x38/0x88 > > [c000000042d03af0] [c00000000006363c] .update_process_times+0x48/0x94 > > [c000000042d03b80] [c00000000007bae4] .tick_sched_timer+0xbc/0x118 > > [c000000042d03c20] [c000000000073a88] .__run_hrtimer+0x78/0x118 > > [c000000042d03cc0] [c000000000074b3c] .hrtimer_interrupt+0x128/0x1e4 > > [c000000042d03d90] [c00000000002304c] .timer_interrupt+0xe0/0x15c > > [c000000042d03e30] [c000000000003700] decrementer_common+0x100/0x180 > > Setting clock (localtime): Thu Oct 2 00:43:30 EDT 2008 [ OK ] > > Starting udev: [ OK ] > > Setting hostname tundro2.rchland.ibm.com: [ OK ] > > No devices found > > Setting up Logical Volume Management: No volume groups found > > [ OK ] > > Checking filesystems > > Checking all file systems. > > [/sbin/fsck.ext3 (1) -- /] fsck.ext3 -a /dev/sda6 > > /dev/sda6: Superblock last mount time is in the future. FIXED. > > /dev/sda6: clean, 134758/1281696 files, 853722/2560336 blocks > > [ OK ] > > Remounting root filesystem in read-write mode: [ OK ] > > Mounting local filesystems: [ OK ] > > Enabling local filesystem quotas: quotaon: Warning: No quota format > > detected in the kernel. > > [ OK ] > > Enabling /etc/fstab swaps: swapon: /dev/sda5: Invalid argument > > [FAILED] > > INIT: Entering runlevel: 3 > > Entering non-interactive startup > > Starting boot.loadmodules: [ OK ] > > Starting background readahead: [ OK ] > > Bringing up loopback interface: [ OK ] > > Bringing up interface eth0: [ OK ] > > Starting auditd: [ OK ] > > Starting system logger: [ OK ] > > Starting kernel logger: [ OK ] > > Starting irqbalance: [ OK ] > > Starting RPC idmapd: FATAL: Module sunrpc not found. > > FATAL: Error running install command for sunrpc > > Error: RPC MTAB does not exist. > > Starting system message bus: [ OK ] > > Starting Bluetooth services:[ OK ][ OK ]Can't open RFCOMM control > > socket: Address family not supported by protocol > > > > Mounting other filesystems: [ OK ] > > Starting PC/SC smart card daemon (pcscd): [ OK ] > > Starting hidd: Can't open HIDP control socket: Address family not > > supported by protocol > > [FAILED] > > Starting autofs: Loading autofs4: [ OK ] > > Starting automount: [ OK ] > > [ OK ] > > Starting iprinit: Starting ipr initialization daemon[ OK ] > > [ OK ] > > Starting iprupdate: Checking ipr microcode levels > > Completed ipr microcode updates[ OK ] > > [ OK ] > > Starting iprdump: Starting ipr dump daemon[ OK ] > > [ OK ] > > Starting sshd: [ OK ] > > Starting cups: [ OK ] > > Starting xinetd: [ OK ] > > Starting vpdupdate: [ OK ] > > Starting console mouse services: [ OK ] > > Starting crond: [ OK ] > > Starting xfs: [ OK ] > > Starting anacron: [ OK ] > > Starting atd: [ OK ] > > Starting yum-updatesd: [ OK ] > > Starting Avahi daemon... [ OK ] > > Starting HAL daemon: [ OK ] > > Starting smartd: [ OK ] > > > > Red Hat Enterprise Linux Server release 5.2 (Tikanga) > > Kernel 2.6.27-rc7-autokern1 on an ppc64 > > > > tundro2.rchland.ibm.com login:-- 0:conmux-control -- time-stamp -- > > Oct/01/08 21:45:47 -- > > -- 0:conmux-control -- time-stamp -- Oct/01/08 22:01:03 -- > > (bot:conmon-payload) disconnected > > -- > Mathieu Desnoyers > OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU (LTTng support) 2008-10-02 17:43 ` Paul E. McKenney @ 2008-10-02 18:51 ` Mathieu Desnoyers 2008-10-02 22:55 ` Paul E. McKenney 0 siblings, 1 reply; 17+ messages in thread From: Mathieu Desnoyers @ 2008-10-02 18:51 UTC (permalink / raw) To: Paul E. McKenney Cc: Ingo Molnar, linux-kernel, akpm, laijs, manfred, dipankar, niv, dvhltc, josht, ltt-dev * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote: > On Thu, Oct 02, 2008 at 11:10:46AM -0400, Mathieu Desnoyers wrote: > > * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote: > > > On Thu, Oct 02, 2008 at 10:07:26AM +0200, Ingo Molnar wrote: > > > > > > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote: > > > > > > > > > Hello! > > > > > > > > > > This patch adds stalled-CPU detection to Classic RCU. This capability > > > > > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > > > > > which defaults disabled. This is a debugging feature, not something > > > > > that non-kernel-hackers would be expected to care about. This feature > > > > > can detect looping CPUs in !PREEMPT builds and looping CPUs with > > > > > preemption disabled in PREEMPT builds. This is essentially a port of > > > > > this functionality from the treercu patch. > > > > > > > > > > One current shortcoming: on some systems, stalls are detected during > > > > > early boot, when we normally would not care about them. My thought is > > > > > to add a call from late initialization to suppress stall detection > > > > > until the system is well along its way to being booted, but thought I > > > > > should check to see if there might already be something for this > > > > > purpose. > > > > > > > > could you be a bit more specific, why do those warnings show up and why > > > > dont we care about them? There are things like networking that > > > > occasionally do an rcu_sync() and a stall could mean a bootup hang. > > > > > > Hmmm... > > > > > > Good point, I was just falling back on my old "we don't care about RCU > > > stalls in boot-time code" rule from long ago. It is entirely possible > > > (in fact reasonably likely) that this rule no longer applies to Linux as > > > it exists today. So please see below for the console output. Not all > > > systems report this stall. I have recently been running only on Power > > > systems, will fire off on some x86s. My kneejerk reaction was that the > > > "stall" was really due to the clock-setting operation -- RCU stalls are > > > based on get_seconds(). > > > > > > So maybe I need to change the stall-detection code to use jiffies > > > instead. > > > > > > Thoughts? > > > > > > > You could possibly use get monotonic time, which should hopefully be a > > bit better than its NTP-corrected alternatives. > > OK. I thought jiffies were monotonic (though not perfectly correlated > to the passage of time). This is a warning timeout, so does not need > better than a few percent accuracy, as long as time never jumps too far. > Well, jiffies should also be monotonic, but I don't know their exact status wrt dyntick systems. > But what API did you have in mind? > Something like do_posix_clock_monotonic_gettime() in linux/time.h. I think it should be ok after early boot. > > Also, do you have all the tools you need to poinpoint the precise source > > of RCU stalls ? I haven't looked at RCU trace, but I should say adding a > > new tracepoint to instrument RCU callback execution at boot time and to > > trace it with LTTng is close to trivial. (actually, very early kernel > > boot is not so trivial as we would have to create a small kernel module > > to activate lttng tracing, but tracing the early phases of userspace > > bootup is trivial given we can use the userspace API). > > I just dump stack, which historically worked pretty well. > > Could you please tell me more about what you think should be added? > I just created two patches which instrument RCU callbacks and call_rcu in classic and preempt RCU to the -lttng tree to show how simple it is once the LTTng infrastructure is in place. Basically, all you need to do is to pull this tree (branch 2.6.27-rc8-lttng-0.31) : git://git.kernel.org/pub/scm/linux/kernel/git/compudj/linux-2.6-lttng.git And follow : http://ltt.polymtl.ca/svn/trunk/lttv/QUICKSTART Which will basically tell you to enable markers/tracepoints and leave most ltt-related options to default (all in general setup menu). wget http://ltt.polymtl.ca/lttng/ltt-control-0.53-02102008.tar.gz extract, ./configure, make, make install wget http://ltt.polymtl.ca/packages/lttv-0.10.0-pre15-12082008.tar.gz extract, ./configure, make, make install (see QUICKSTART for package dependencies. Will be automatically checked by configure) You may have to add a mount point for debugfs and load some kernel modules (as explained in the QUICKSTART). Don't forget rcu-trace.ko. To take a trace, once you are booted in the kernel : ltt-armall (check that the rcu_* markers are enabled by looking at the "state" field of cat /proc/ltt | grep rcu) lttctl -n trace -d -l /mnt/debugfs/ltt -t /tmp/trace [Reproduce behavior to trace] lttctl -n trace -R # Produce a text dump lttv -m textDump -t /tmp/trace # Produce a text dump filtering only rcu events : lttv -m textDump -t /tmp/trace \ -e "event.name=rcu_classic_callback|event.name=rcu_classic_call_rcu|\ event.name=rcu_classic_call_rcu_bh|event.name=rcu_preempt_callback|\ event.name=rcu_preempt_call_rcu_sched|event.name=rcu_preempt_call_rcu" Then some scripting could be required to turn the function pointers into symbols. Keep you kernel vmlinux around to perform symbol resolution. A quick trick is to use objdump -S --start-address=0x.... vmlinux|less but I am certain there are much much better ways to do it. ;) There is also the gui you can use to dig into the trace : lttv-gui -t /tmp/trace That should help you pinpoint which are the functions that took so much time and which is their associated call_rcu caller IP. If you run a trace with preempt rcu, the textdump should give you an output like this : [...] rcu_preempt_call_rcu: 130.773191675 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } rcu_preempt_call_rcu: 130.773194003 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } rcu_preempt_call_rcu: 130.773195929 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } rcu_preempt_call_rcu: 130.773197399 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } rcu_preempt_call_rcu: 130.773200429 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802c6670, ip = 0xFFFFFFFF8023A45B } rcu_preempt_call_rcu: 130.773213907 (/tmp/trace/cpu_5), 3961, 3961, bash, , 3960, 0x0, SYSCALL { func = 0xffffffff8024a980, ip = 0xFFFFFFFF8023937D } rcu_preempt_call_rcu: 130.773215527 (/tmp/trace/cpu_5), 3961, 3961, bash, , 3960, 0x0, SYSCALL { func = 0xffffffff80238550, ip = 0xFFFFFFFF80239985 } rcu_preempt_callback: 130.789025735 (/tmp/trace/cpu_2), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 130.789026731 (/tmp/trace/cpu_2), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 130.789050610 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 130.789051096 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802c6670 } rcu_preempt_callback: 130.789052083 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff8024a980 } rcu_preempt_callback: 130.789052713 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff80238550 } rcu_preempt_callback: 130.805051097 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 130.805052318 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 130.805052822 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 130.805053434 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 130.805054226 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802c6670 } rcu_preempt_callback: 130.805055339 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff8024a980 } rcu_preempt_callback: 130.805056014 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff80238550 } rcu_preempt_call_rcu: 131.442990122 (/tmp/trace/cpu_7), 3685, 3685, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } rcu_preempt_call_rcu: 131.443615323 (/tmp/trace/cpu_6), 3687, 3687, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } rcu_preempt_call_rcu: 131.446861604 (/tmp/trace/cpu_7), 3683, 3683, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } rcu_preempt_call_rcu: 131.447487425 (/tmp/trace/cpu_5), 3681, 3681, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } rcu_preempt_callback: 131.473052317 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 131.473078626 (/tmp/trace/cpu_6), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 131.477001607 (/tmp/trace/cpu_7), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 131.477002435 (/tmp/trace/cpu_7), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_call_rcu: 133.443017217 (/tmp/trace/cpu_5), 3681, 3681, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } rcu_preempt_call_rcu: 133.443636900 (/tmp/trace/cpu_6), 3687, 3687, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } rcu_preempt_call_rcu: 133.446887423 (/tmp/trace/cpu_7), 3683, 3683, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } rcu_preempt_call_rcu: 133.447509515 (/tmp/trace/cpu_7), 3685, 3685, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } rcu_preempt_callback: 133.473044696 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 133.473070649 (/tmp/trace/cpu_6), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 133.476994292 (/tmp/trace/cpu_7), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_callback: 133.476994961 (/tmp/trace/cpu_7), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } rcu_preempt_call_rcu: 134.085511140 (/tmp/trace/cpu_5), 3961, 3961, bash, , 3960, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } rcu_preempt_call_rcu: 134.085545237 (/tmp/trace/cpu_5), 3981, 3981, bash, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } rcu_preempt_call_rcu: 134.086124869 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802B8E2D } rcu_preempt_call_rcu: 134.086135078 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802B8E2D } rcu_preempt_call_rcu: 134.086143519 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802B8E2D } rcu_preempt_call_rcu: 134.086206323 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802B8E2D } rcu_preempt_call_rcu: 134.086526166 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } [...] Basically, once you have the LTTng infrastructure in place, all that needs to be added to RCU to do that is this (which is already in the -lttng tree), plus a small LTTng "probe" module. If you want, you could add a tracepoint in your stalled CPU detector and send the data through LTTng so you have a starting point to dig from in large traces. From that point, finding the root cause of long RCU-based CPU stalls becomes trivial. RCU : tracepoint instrumentation Instrument RCU classic and RCU preempt callback execution and call_rcu. Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> --- include/trace/rcu.h | 31 +++++++++++++++++++++++++++++++ kernel/rcuclassic.c | 4 ++++ kernel/rcupreempt.c | 4 ++++ 3 files changed, 39 insertions(+) Index: linux-2.6-lttng/include/trace/rcu.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6-lttng/include/trace/rcu.h 2008-10-02 14:18:05.000000000 -0400 @@ -0,0 +1,31 @@ +#ifndef _TRACE_RCU_H +#define _TRACE_RCU_H + +#include <linux/tracepoint.h> +#include <linux/rcupdate.h> + +DEFINE_TRACE(rcu_classic_callback, + TPPROTO(struct rcu_head *head), + TPARGS(head)); + +DEFINE_TRACE(rcu_classic_call_rcu, + TPPROTO(struct rcu_head *head, unsigned long ip), + TPARGS(head, ip)); + +DEFINE_TRACE(rcu_classic_call_rcu_bh, + TPPROTO(struct rcu_head *head, unsigned long ip), + TPARGS(head, ip)); + +DEFINE_TRACE(rcu_preempt_callback, + TPPROTO(struct rcu_head *head), + TPARGS(head)); + +DEFINE_TRACE(rcu_preempt_call_rcu, + TPPROTO(struct rcu_head *head, unsigned long ip), + TPARGS(head, ip)); + +DEFINE_TRACE(rcu_preempt_call_rcu_sched, + TPPROTO(struct rcu_head *head, unsigned long ip), + TPARGS(head, ip)); + +#endif Index: linux-2.6-lttng/kernel/rcuclassic.c =================================================================== --- linux-2.6-lttng.orig/kernel/rcuclassic.c 2008-10-02 13:44:35.000000000 -0400 +++ linux-2.6-lttng/kernel/rcuclassic.c 2008-10-02 14:16:45.000000000 -0400 @@ -47,6 +47,7 @@ #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> +#include <trace/rcu.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -138,6 +139,7 @@ void call_rcu(struct rcu_head *head, head->func = func; head->next = NULL; local_irq_save(flags); + trace_rcu_classic_call_rcu(head, _RET_IP_); rdp = &__get_cpu_var(rcu_data); *rdp->nxttail = head; rdp->nxttail = &head->next; @@ -174,6 +176,7 @@ void call_rcu_bh(struct rcu_head *head, head->func = func; head->next = NULL; local_irq_save(flags); + trace_rcu_classic_call_rcu_bh(head, _RET_IP_); rdp = &__get_cpu_var(rcu_bh_data); *rdp->nxttail = head; rdp->nxttail = &head->next; @@ -232,6 +235,7 @@ static void rcu_do_batch(struct rcu_data while (list) { next = list->next; prefetch(next); + trace_rcu_classic_callback(list); list->func(list); list = next; if (++count >= rdp->blimit) Index: linux-2.6-lttng/kernel/rcupreempt.c =================================================================== --- linux-2.6-lttng.orig/kernel/rcupreempt.c 2008-10-02 13:53:41.000000000 -0400 +++ linux-2.6-lttng/kernel/rcupreempt.c 2008-10-02 14:15:47.000000000 -0400 @@ -57,6 +57,7 @@ #include <linux/byteorder/swabb.h> #include <linux/cpumask.h> #include <linux/rcupreempt_trace.h> +#include <trace/rcu.h> /* * Macro that prevents the compiler from reordering accesses, but does @@ -1108,6 +1109,7 @@ static void rcu_process_callbacks(struct spin_unlock_irqrestore(&rdp->lock, flags); while (list) { next = list->next; + trace_rcu_preempt_callback(list); list->func(list); list = next; RCU_TRACE_ME(rcupreempt_trace_invoke); @@ -1122,6 +1124,7 @@ void call_rcu(struct rcu_head *head, voi head->func = func; head->next = NULL; local_irq_save(flags); + trace_rcu_preempt_call_rcu(head, _RET_IP_); rdp = RCU_DATA_ME(); spin_lock(&rdp->lock); __rcu_advance_callbacks(rdp); @@ -1141,6 +1144,7 @@ void call_rcu_sched(struct rcu_head *hea head->func = func; head->next = NULL; local_irq_save(flags); + trace_rcu_preempt_call_rcu_sched(head, _RET_IP_); rdp = RCU_DATA_ME(); spin_lock(&rdp->lock); *rdp->nextschedtail = head; The LTTng probe module looks like this : LTTng trace RCU probe RCU callback/call_rcu LTTng probe. Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> --- ltt/probes/Makefile | 2 - ltt/probes/rcu-trace.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) Index: linux-2.6-lttng/ltt/probes/rcu-trace.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6-lttng/ltt/probes/rcu-trace.c 2008-10-02 14:23:17.000000000 -0400 @@ -0,0 +1,54 @@ +/* + * ltt/probes/rcu-trace.c + * + * RCU tracepoint probes. + */ + +#include <linux/module.h> +#include <trace/rcu.h> + +#ifdef CONFIG_CLASSIC_RCU +void probe_rcu_classic_callback(struct rcu_head *head) +{ + trace_mark_tp(rcu_classic_callback, rcu_classic_callback, + probe_rcu_classic_callback, "func %p", head->func); +} + +void probe_rcu_classic_call_rcu(struct rcu_head *head, unsigned long ip) +{ + trace_mark_tp(rcu_classic_call_rcu, rcu_classic_call_rcu, + probe_rcu_classic_call_rcu, "func %p ip 0x%lX", head->func, ip); +} + +void probe_rcu_classic_call_rcu_bh(struct rcu_head *head, unsigned long ip) +{ + trace_mark_tp(rcu_classic_call_rcu_bh, rcu_classic_call_rcu_bh, + probe_rcu_classic_call_rcu_bh, "func %p ip 0x%lX", + head->func, ip); +} +#endif + +#ifdef CONFIG_PREEMPT_RCU +void probe_rcu_preempt_callback(struct rcu_head *head) +{ + trace_mark_tp(rcu_preempt_callback, rcu_preempt_callback, + probe_rcu_preempt_callback, "func %p", head->func); +} + +void probe_rcu_preempt_call_rcu(struct rcu_head *head, unsigned long ip) +{ + trace_mark_tp(rcu_preempt_call_rcu, rcu_preempt_call_rcu, + probe_rcu_preempt_call_rcu, "func %p ip 0x%lX", head->func, ip); +} + +void probe_rcu_preempt_call_rcu_sched(struct rcu_head *head, unsigned long ip) +{ + trace_mark_tp(rcu_preempt_call_rcu_sched, rcu_preempt_call_rcu_sched, + probe_rcu_preempt_call_rcu_sched, "func %p ip 0x%lX", + head->func, ip); +} +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("RCU Tracepoint Probes"); Index: linux-2.6-lttng/ltt/probes/Makefile =================================================================== --- linux-2.6-lttng.orig/ltt/probes/Makefile 2008-10-02 14:11:20.000000000 -0400 +++ linux-2.6-lttng/ltt/probes/Makefile 2008-10-02 14:18:16.000000000 -0400 @@ -9,7 +9,7 @@ CFLAGS_REMOVE_lockdep-trace.o = -pg endif obj-$(CONFIG_LTT_TRACEPROBES) += kernel-trace.o mm-trace.o fs-trace.o \ - ipc-trace.o lockdep-trace.o + ipc-trace.o lockdep-trace.o rcu-trace.o ifeq ($(CONFIG_NET),y) CFLAGS_REMOVE_net-trace.o = -pg obj-$(CONFIG_TRACEPROBES) += net-trace.o Mathieu -- Mathieu Desnoyers OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU (LTTng support) 2008-10-02 18:51 ` [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU (LTTng support) Mathieu Desnoyers @ 2008-10-02 22:55 ` Paul E. McKenney 2008-10-02 23:06 ` [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney 0 siblings, 1 reply; 17+ messages in thread From: Paul E. McKenney @ 2008-10-02 22:55 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Ingo Molnar, linux-kernel, akpm, laijs, manfred, dipankar, niv, dvhltc, josht, ltt-dev On Thu, Oct 02, 2008 at 02:51:15PM -0400, Mathieu Desnoyers wrote: > * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote: > > On Thu, Oct 02, 2008 at 11:10:46AM -0400, Mathieu Desnoyers wrote: > > > * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote: > > > > On Thu, Oct 02, 2008 at 10:07:26AM +0200, Ingo Molnar wrote: > > > > > > > > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote: > > > > > > > > > > > Hello! > > > > > > > > > > > > This patch adds stalled-CPU detection to Classic RCU. This capability > > > > > > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > > > > > > which defaults disabled. This is a debugging feature, not something > > > > > > that non-kernel-hackers would be expected to care about. This feature > > > > > > can detect looping CPUs in !PREEMPT builds and looping CPUs with > > > > > > preemption disabled in PREEMPT builds. This is essentially a port of > > > > > > this functionality from the treercu patch. > > > > > > > > > > > > One current shortcoming: on some systems, stalls are detected during > > > > > > early boot, when we normally would not care about them. My thought is > > > > > > to add a call from late initialization to suppress stall detection > > > > > > until the system is well along its way to being booted, but thought I > > > > > > should check to see if there might already be something for this > > > > > > purpose. > > > > > > > > > > could you be a bit more specific, why do those warnings show up and why > > > > > dont we care about them? There are things like networking that > > > > > occasionally do an rcu_sync() and a stall could mean a bootup hang. > > > > > > > > Hmmm... > > > > > > > > Good point, I was just falling back on my old "we don't care about RCU > > > > stalls in boot-time code" rule from long ago. It is entirely possible > > > > (in fact reasonably likely) that this rule no longer applies to Linux as > > > > it exists today. So please see below for the console output. Not all > > > > systems report this stall. I have recently been running only on Power > > > > systems, will fire off on some x86s. My kneejerk reaction was that the > > > > "stall" was really due to the clock-setting operation -- RCU stalls are > > > > based on get_seconds(). > > > > > > > > So maybe I need to change the stall-detection code to use jiffies > > > > instead. > > > > > > > > Thoughts? > > > > > > > > > > You could possibly use get monotonic time, which should hopefully be a > > > bit better than its NTP-corrected alternatives. > > > > OK. I thought jiffies were monotonic (though not perfectly correlated > > to the passage of time). This is a warning timeout, so does not need > > better than a few percent accuracy, as long as time never jumps too far. > > > > Well, jiffies should also be monotonic, but I don't know their exact > status wrt dyntick systems. There will be at least one non-dyntick CPU if there are RCU callbacks pending. If there are no RCU callbacks pending, then RCU won't be doing anything, hence will not be detecting stalls. > > But what API did you have in mind? > > > > Something like do_posix_clock_monotonic_gettime() in linux/time.h. I > think it should be ok after early boot. I like jiffies for this non-super-exacting purpose. ;-) > > > Also, do you have all the tools you need to poinpoint the precise source > > > of RCU stalls ? I haven't looked at RCU trace, but I should say adding a > > > new tracepoint to instrument RCU callback execution at boot time and to > > > trace it with LTTng is close to trivial. (actually, very early kernel > > > boot is not so trivial as we would have to create a small kernel module > > > to activate lttng tracing, but tracing the early phases of userspace > > > bootup is trivial given we can use the userspace API). > > > > I just dump stack, which historically worked pretty well. > > > > Could you please tell me more about what you think should be added? > > > > I just created two patches which instrument RCU callbacks and call_rcu > in classic and preempt RCU to the -lttng tree to show how simple it is > once the LTTng infrastructure is in place. OK, I think I see the disconnection here... When RCU detects a stall, that often indicates that some CPU is spinning in the kernel (for !CONFIG_PREEMPT) or that some CPU is spinning in the kernel with preemption disabled (for CONFIG_PREEMPT). The stack trace will normally show what is spinning. Tracing the RCU activity in this case will not help -- it is not RCU that is spinning, but rather that RCU is stalled waiting for some other part of the kernel that is spinning. (In a past life, RCU once complained about a specific CPU that had managed to fail in a way that left the system running! Except that RCU grace periods never ended...) FWIW, what I normally use for debugging RCU itself is counters rather than log messages. The reason I avoid log messages is that the race conditions I often find myself chasing are reasonably narrow, so that I would have a very large number of log messages to pile through. Nevertheless, I will keep track of these patches in case I come across some situation that is more friendly to log-based debugging -- and thank you for putting them together!!! Thanx, Paul > Basically, all you need to do is to pull this tree (branch > 2.6.27-rc8-lttng-0.31) : > > git://git.kernel.org/pub/scm/linux/kernel/git/compudj/linux-2.6-lttng.git > > And follow : > http://ltt.polymtl.ca/svn/trunk/lttv/QUICKSTART > > Which will basically tell you to enable markers/tracepoints and leave > most ltt-related options to default (all in general setup menu). > > wget http://ltt.polymtl.ca/lttng/ltt-control-0.53-02102008.tar.gz > extract, ./configure, make, make install > > wget http://ltt.polymtl.ca/packages/lttv-0.10.0-pre15-12082008.tar.gz > extract, ./configure, make, make install > (see QUICKSTART for package dependencies. Will be automatically checked > by configure) > > You may have to add a mount point for debugfs and load some kernel > modules (as explained in the QUICKSTART). Don't forget rcu-trace.ko. > > To take a trace, once you are booted in the kernel : > > ltt-armall > (check that the rcu_* markers are enabled by looking at the "state" > field of cat /proc/ltt | grep rcu) > lttctl -n trace -d -l /mnt/debugfs/ltt -t /tmp/trace > [Reproduce behavior to trace] > lttctl -n trace -R > # Produce a text dump > lttv -m textDump -t /tmp/trace > > # Produce a text dump filtering only rcu events : > lttv -m textDump -t /tmp/trace \ > -e "event.name=rcu_classic_callback|event.name=rcu_classic_call_rcu|\ > event.name=rcu_classic_call_rcu_bh|event.name=rcu_preempt_callback|\ > event.name=rcu_preempt_call_rcu_sched|event.name=rcu_preempt_call_rcu" > > Then some scripting could be required to turn the function pointers into > symbols. Keep you kernel vmlinux around to perform symbol resolution. > A quick trick is to use objdump -S --start-address=0x.... vmlinux|less > but I am certain there are much much better ways to do it. ;) > > There is also the gui you can use to dig into the trace : > lttv-gui -t /tmp/trace > > That should help you pinpoint which are the functions that took so much > time and which is their associated call_rcu caller IP. > > If you run a trace with preempt rcu, the textdump should give you an > output like this : > > [...] > rcu_preempt_call_rcu: 130.773191675 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } > rcu_preempt_call_rcu: 130.773194003 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } > rcu_preempt_call_rcu: 130.773195929 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } > rcu_preempt_call_rcu: 130.773197399 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } > rcu_preempt_call_rcu: 130.773200429 (/tmp/trace/cpu_5), 3977, 3977, lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802c6670, ip = 0xFFFFFFFF8023A45B } > rcu_preempt_call_rcu: 130.773213907 (/tmp/trace/cpu_5), 3961, 3961, bash, , 3960, 0x0, SYSCALL { func = 0xffffffff8024a980, ip = 0xFFFFFFFF8023937D } > rcu_preempt_call_rcu: 130.773215527 (/tmp/trace/cpu_5), 3961, 3961, bash, , 3960, 0x0, SYSCALL { func = 0xffffffff80238550, ip = 0xFFFFFFFF80239985 } > rcu_preempt_callback: 130.789025735 (/tmp/trace/cpu_2), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 130.789026731 (/tmp/trace/cpu_2), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 130.789050610 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 130.789051096 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802c6670 } > rcu_preempt_callback: 130.789052083 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff8024a980 } > rcu_preempt_callback: 130.789052713 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff80238550 } > rcu_preempt_callback: 130.805051097 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 130.805052318 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 130.805052822 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 130.805053434 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 130.805054226 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802c6670 } > rcu_preempt_callback: 130.805055339 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff8024a980 } > rcu_preempt_callback: 130.805056014 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff80238550 } > rcu_preempt_call_rcu: 131.442990122 (/tmp/trace/cpu_7), 3685, 3685, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } > rcu_preempt_call_rcu: 131.443615323 (/tmp/trace/cpu_6), 3687, 3687, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } > rcu_preempt_call_rcu: 131.446861604 (/tmp/trace/cpu_7), 3683, 3683, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } > rcu_preempt_call_rcu: 131.447487425 (/tmp/trace/cpu_5), 3681, 3681, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } > rcu_preempt_callback: 131.473052317 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 131.473078626 (/tmp/trace/cpu_6), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 131.477001607 (/tmp/trace/cpu_7), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 131.477002435 (/tmp/trace/cpu_7), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_call_rcu: 133.443017217 (/tmp/trace/cpu_5), 3681, 3681, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } > rcu_preempt_call_rcu: 133.443636900 (/tmp/trace/cpu_6), 3687, 3687, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } > rcu_preempt_call_rcu: 133.446887423 (/tmp/trace/cpu_7), 3683, 3683, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } > rcu_preempt_call_rcu: 133.447509515 (/tmp/trace/cpu_7), 3685, 3685, hald-addon-stor, , 3634, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AA8D9 } > rcu_preempt_callback: 133.473044696 (/tmp/trace/cpu_5), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 133.473070649 (/tmp/trace/cpu_6), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 133.476994292 (/tmp/trace/cpu_7), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_callback: 133.476994961 (/tmp/trace/cpu_7), 0, 0, swapper, , 0, 0x0, SOFTIRQ { func = 0xffffffff802addc0 } > rcu_preempt_call_rcu: 134.085511140 (/tmp/trace/cpu_5), 3961, 3961, bash, , 3960, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } > rcu_preempt_call_rcu: 134.085545237 (/tmp/trace/cpu_5), 3981, 3981, bash, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } > rcu_preempt_call_rcu: 134.086124869 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802B8E2D } > rcu_preempt_call_rcu: 134.086135078 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802B8E2D } > rcu_preempt_call_rcu: 134.086143519 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802B8E2D } > rcu_preempt_call_rcu: 134.086206323 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802B8E2D } > rcu_preempt_call_rcu: 134.086526166 (/tmp/trace/cpu_2), 3981, 3981, ./lttctl, , 3961, 0x0, SYSCALL { func = 0xffffffff802addc0, ip = 0xFFFFFFFF802AE105 } > [...] > > Basically, once you have the LTTng infrastructure in place, all that > needs to be added to RCU to do that is this (which is already in the > -lttng tree), plus a small LTTng "probe" module. If you want, you could > add a tracepoint in your stalled CPU detector and send the data through > LTTng so you have a starting point to dig from in large traces. From > that point, finding the root cause of long RCU-based CPU stalls becomes > trivial. > > > RCU : tracepoint instrumentation > > Instrument RCU classic and RCU preempt callback execution and call_rcu. > > Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> > CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> > --- > include/trace/rcu.h | 31 +++++++++++++++++++++++++++++++ > kernel/rcuclassic.c | 4 ++++ > kernel/rcupreempt.c | 4 ++++ > 3 files changed, 39 insertions(+) > > Index: linux-2.6-lttng/include/trace/rcu.h > =================================================================== > --- /dev/null 1970-01-01 00:00:00.000000000 +0000 > +++ linux-2.6-lttng/include/trace/rcu.h 2008-10-02 14:18:05.000000000 -0400 > @@ -0,0 +1,31 @@ > +#ifndef _TRACE_RCU_H > +#define _TRACE_RCU_H > + > +#include <linux/tracepoint.h> > +#include <linux/rcupdate.h> > + > +DEFINE_TRACE(rcu_classic_callback, > + TPPROTO(struct rcu_head *head), > + TPARGS(head)); > + > +DEFINE_TRACE(rcu_classic_call_rcu, > + TPPROTO(struct rcu_head *head, unsigned long ip), > + TPARGS(head, ip)); > + > +DEFINE_TRACE(rcu_classic_call_rcu_bh, > + TPPROTO(struct rcu_head *head, unsigned long ip), > + TPARGS(head, ip)); > + > +DEFINE_TRACE(rcu_preempt_callback, > + TPPROTO(struct rcu_head *head), > + TPARGS(head)); > + > +DEFINE_TRACE(rcu_preempt_call_rcu, > + TPPROTO(struct rcu_head *head, unsigned long ip), > + TPARGS(head, ip)); > + > +DEFINE_TRACE(rcu_preempt_call_rcu_sched, > + TPPROTO(struct rcu_head *head, unsigned long ip), > + TPARGS(head, ip)); > + > +#endif > Index: linux-2.6-lttng/kernel/rcuclassic.c > =================================================================== > --- linux-2.6-lttng.orig/kernel/rcuclassic.c 2008-10-02 13:44:35.000000000 -0400 > +++ linux-2.6-lttng/kernel/rcuclassic.c 2008-10-02 14:16:45.000000000 -0400 > @@ -47,6 +47,7 @@ > #include <linux/notifier.h> > #include <linux/cpu.h> > #include <linux/mutex.h> > +#include <trace/rcu.h> > > #ifdef CONFIG_DEBUG_LOCK_ALLOC > static struct lock_class_key rcu_lock_key; > @@ -138,6 +139,7 @@ void call_rcu(struct rcu_head *head, > head->func = func; > head->next = NULL; > local_irq_save(flags); > + trace_rcu_classic_call_rcu(head, _RET_IP_); > rdp = &__get_cpu_var(rcu_data); > *rdp->nxttail = head; > rdp->nxttail = &head->next; > @@ -174,6 +176,7 @@ void call_rcu_bh(struct rcu_head *head, > head->func = func; > head->next = NULL; > local_irq_save(flags); > + trace_rcu_classic_call_rcu_bh(head, _RET_IP_); > rdp = &__get_cpu_var(rcu_bh_data); > *rdp->nxttail = head; > rdp->nxttail = &head->next; > @@ -232,6 +235,7 @@ static void rcu_do_batch(struct rcu_data > while (list) { > next = list->next; > prefetch(next); > + trace_rcu_classic_callback(list); > list->func(list); > list = next; > if (++count >= rdp->blimit) > Index: linux-2.6-lttng/kernel/rcupreempt.c > =================================================================== > --- linux-2.6-lttng.orig/kernel/rcupreempt.c 2008-10-02 13:53:41.000000000 -0400 > +++ linux-2.6-lttng/kernel/rcupreempt.c 2008-10-02 14:15:47.000000000 -0400 > @@ -57,6 +57,7 @@ > #include <linux/byteorder/swabb.h> > #include <linux/cpumask.h> > #include <linux/rcupreempt_trace.h> > +#include <trace/rcu.h> > > /* > * Macro that prevents the compiler from reordering accesses, but does > @@ -1108,6 +1109,7 @@ static void rcu_process_callbacks(struct > spin_unlock_irqrestore(&rdp->lock, flags); > while (list) { > next = list->next; > + trace_rcu_preempt_callback(list); > list->func(list); > list = next; > RCU_TRACE_ME(rcupreempt_trace_invoke); > @@ -1122,6 +1124,7 @@ void call_rcu(struct rcu_head *head, voi > head->func = func; > head->next = NULL; > local_irq_save(flags); > + trace_rcu_preempt_call_rcu(head, _RET_IP_); > rdp = RCU_DATA_ME(); > spin_lock(&rdp->lock); > __rcu_advance_callbacks(rdp); > @@ -1141,6 +1144,7 @@ void call_rcu_sched(struct rcu_head *hea > head->func = func; > head->next = NULL; > local_irq_save(flags); > + trace_rcu_preempt_call_rcu_sched(head, _RET_IP_); > rdp = RCU_DATA_ME(); > spin_lock(&rdp->lock); > *rdp->nextschedtail = head; > > The LTTng probe module looks like this : > > > LTTng trace RCU probe > > RCU callback/call_rcu LTTng probe. > > Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> > CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> > --- > ltt/probes/Makefile | 2 - > ltt/probes/rcu-trace.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 55 insertions(+), 1 deletion(-) > > Index: linux-2.6-lttng/ltt/probes/rcu-trace.c > =================================================================== > --- /dev/null 1970-01-01 00:00:00.000000000 +0000 > +++ linux-2.6-lttng/ltt/probes/rcu-trace.c 2008-10-02 14:23:17.000000000 -0400 > @@ -0,0 +1,54 @@ > +/* > + * ltt/probes/rcu-trace.c > + * > + * RCU tracepoint probes. > + */ > + > +#include <linux/module.h> > +#include <trace/rcu.h> > + > +#ifdef CONFIG_CLASSIC_RCU > +void probe_rcu_classic_callback(struct rcu_head *head) > +{ > + trace_mark_tp(rcu_classic_callback, rcu_classic_callback, > + probe_rcu_classic_callback, "func %p", head->func); > +} > + > +void probe_rcu_classic_call_rcu(struct rcu_head *head, unsigned long ip) > +{ > + trace_mark_tp(rcu_classic_call_rcu, rcu_classic_call_rcu, > + probe_rcu_classic_call_rcu, "func %p ip 0x%lX", head->func, ip); > +} > + > +void probe_rcu_classic_call_rcu_bh(struct rcu_head *head, unsigned long ip) > +{ > + trace_mark_tp(rcu_classic_call_rcu_bh, rcu_classic_call_rcu_bh, > + probe_rcu_classic_call_rcu_bh, "func %p ip 0x%lX", > + head->func, ip); > +} > +#endif > + > +#ifdef CONFIG_PREEMPT_RCU > +void probe_rcu_preempt_callback(struct rcu_head *head) > +{ > + trace_mark_tp(rcu_preempt_callback, rcu_preempt_callback, > + probe_rcu_preempt_callback, "func %p", head->func); > +} > + > +void probe_rcu_preempt_call_rcu(struct rcu_head *head, unsigned long ip) > +{ > + trace_mark_tp(rcu_preempt_call_rcu, rcu_preempt_call_rcu, > + probe_rcu_preempt_call_rcu, "func %p ip 0x%lX", head->func, ip); > +} > + > +void probe_rcu_preempt_call_rcu_sched(struct rcu_head *head, unsigned long ip) > +{ > + trace_mark_tp(rcu_preempt_call_rcu_sched, rcu_preempt_call_rcu_sched, > + probe_rcu_preempt_call_rcu_sched, "func %p ip 0x%lX", > + head->func, ip); > +} > +#endif > + > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR("Mathieu Desnoyers"); > +MODULE_DESCRIPTION("RCU Tracepoint Probes"); > Index: linux-2.6-lttng/ltt/probes/Makefile > =================================================================== > --- linux-2.6-lttng.orig/ltt/probes/Makefile 2008-10-02 14:11:20.000000000 -0400 > +++ linux-2.6-lttng/ltt/probes/Makefile 2008-10-02 14:18:16.000000000 -0400 > @@ -9,7 +9,7 @@ CFLAGS_REMOVE_lockdep-trace.o = -pg > endif > > obj-$(CONFIG_LTT_TRACEPROBES) += kernel-trace.o mm-trace.o fs-trace.o \ > - ipc-trace.o lockdep-trace.o > + ipc-trace.o lockdep-trace.o rcu-trace.o > ifeq ($(CONFIG_NET),y) > CFLAGS_REMOVE_net-trace.o = -pg > obj-$(CONFIG_TRACEPROBES) += net-trace.o > > > Mathieu > > -- > Mathieu Desnoyers > OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU 2008-10-02 22:55 ` Paul E. McKenney @ 2008-10-02 23:06 ` Paul E. McKenney 2008-10-03 4:41 ` Mathieu Desnoyers 2008-10-03 8:39 ` Ingo Molnar 0 siblings, 2 replies; 17+ messages in thread From: Paul E. McKenney @ 2008-10-02 23:06 UTC (permalink / raw) To: linux-kernel Cc: Ingo Molnar, mathieu.desnoyers, akpm, laijs, manfred, dipankar, niv, dvhltc, josht, ltt-dev Hello! This patch adds stalled-CPU detection to Classic RCU. This capability is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which defaults disabled. This is a debugging feature to detect infinite loops in kernel code, not something that non-kernel-hackers would be expected to care about. This feature can detect looping CPUs in !PREEMPT builds and looping CPUs with preemption disabled in PREEMPT builds. This is essentially a port of this functionality from the treercu patch, replacing the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4). The changes from the patch in tip/core/rcu include making the config variable name match that in treercu, changing from seconds to jiffies to avoid spurious warnings, and printing a boot message when this feature is enabled. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- include/linux/rcuclassic.h | 12 ++- kernel/rcuclassic.c | 166 +++++++++++++++++++++++---------------------- lib/Kconfig.debug | 2 3 files changed, 96 insertions(+), 84 deletions(-) diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 29bf528..2d72d20 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -40,15 +40,21 @@ #include <linux/cpumask.h> #include <linux/seqlock.h> +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK 3 * HZ /* for rcp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK 30 * HZ /* for rcp->jiffies_stall */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { long cur; /* Current batch number. */ long completed; /* Number of the last completed batch */ long pending; /* Number of the last pending batch */ -#ifdef CONFIG_DEBUG_RCU_STALL - unsigned long gp_check; /* Time grace period should end, in seconds. */ -#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started in jiffies. */ + unsigned long jiffies_stall; + /* Time at which to check for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ int signaled; diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index ed15128..eae2fb6 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, } } +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp) * period (if necessary). */ -#ifdef CONFIG_DEBUG_RCU_STALL - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_check = get_seconds() + 3; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = get_seconds() - rcp->gp_check; - if (delta < 2L || cpus_empty(rcp->cpumask)) { - spin_unlock(&rcp->lock); - return; - } - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... */ - - printk(KERN_ERR "RCU detected CPU stalls:"); - for_each_cpu_mask(cpu, rcp->cpumask) - printk(" %d", cpu); - printk(" (detected by %d, t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(get_seconds() - rcp->gp_check) >= 0L) - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - long delta; - - delta = get_seconds() - rcp->gp_check; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { - - /* We haven't checked in, so go dump stack. */ - - print_cpu_stall(rcp); - - } else { - if (!cpus_empty(rcp->cpumask) && delta >= 2L) { - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } - } -} - -#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void -check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ -} - -#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ - /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; - record_gp_check_time(rcp); + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rcp, rdp); + check_cpu_stall(rcp); if (rdp->nxtlist) { long completed_snap = ACCESS_ONCE(rcp->completed); @@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4e921a8..e0e0582 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -616,7 +616,7 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. -config RCU_CPU_STALL +config RCU_CPU_STALL_DETECTOR bool "Check for stalled CPUs delaying RCU grace periods" depends on CLASSIC_RCU default n ^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU 2008-10-02 23:06 ` [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney @ 2008-10-03 4:41 ` Mathieu Desnoyers 2008-10-03 5:02 ` Paul E. McKenney 2008-10-03 8:39 ` Ingo Molnar 1 sibling, 1 reply; 17+ messages in thread From: Mathieu Desnoyers @ 2008-10-03 4:41 UTC (permalink / raw) To: Paul E. McKenney Cc: linux-kernel, Ingo Molnar, akpm, laijs, manfred, dipankar, niv, dvhltc, josht, ltt-dev * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote: > Hello! > > This patch adds stalled-CPU detection to Classic RCU. This capability > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which > defaults disabled. This is a debugging feature to detect infinite loops > in kernel code, not something that non-kernel-hackers would be expected > to care about. This feature can detect looping CPUs in !PREEMPT builds > and looping CPUs with preemption disabled in PREEMPT builds. This is > essentially a port of this functionality from the treercu patch, replacing > the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4). > > The changes from the patch in tip/core/rcu include making the config > variable name match that in treercu, changing from seconds to jiffies to > avoid spurious warnings, and printing a boot message when this feature > is enabled. > Hi Paul, Thanks for the previous explanations. Out of curiosity, what can this patch do that the nmi watchdog can't do ? Mathieu > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > --- > > include/linux/rcuclassic.h | 12 ++- > kernel/rcuclassic.c | 166 +++++++++++++++++++++++---------------------- > lib/Kconfig.debug | 2 > 3 files changed, 96 insertions(+), 84 deletions(-) > > diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h > index 29bf528..2d72d20 100644 > --- a/include/linux/rcuclassic.h > +++ b/include/linux/rcuclassic.h > @@ -40,15 +40,21 @@ > #include <linux/cpumask.h> > #include <linux/seqlock.h> > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > +#define RCU_SECONDS_TILL_STALL_CHECK 3 * HZ /* for rcp->jiffies_stall */ > +#define RCU_SECONDS_TILL_STALL_RECHECK 30 * HZ /* for rcp->jiffies_stall */ > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > /* Global control variables for rcupdate callback mechanism. */ > struct rcu_ctrlblk { > long cur; /* Current batch number. */ > long completed; /* Number of the last completed batch */ > long pending; /* Number of the last pending batch */ > -#ifdef CONFIG_DEBUG_RCU_STALL > - unsigned long gp_check; /* Time grace period should end, in seconds. */ > -#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > + unsigned long gp_start; /* Time at which GP started in jiffies. */ > + unsigned long jiffies_stall; > + /* Time at which to check for CPU stalls. */ > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > int signaled; > > diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c > index ed15128..eae2fb6 100644 > --- a/kernel/rcuclassic.c > +++ b/kernel/rcuclassic.c > @@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, > } > } > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > + > +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) > +{ > + rcp->gp_start = jiffies; > + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; > +} > + > +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) > +{ > + int cpu; > + long delta; > + unsigned long flags; > + > + /* Only let one CPU complain about others per time interval. */ > + > + spin_lock_irqsave(&rcp->lock, flags); > + delta = jiffies - rcp->jiffies_stall; > + if (delta < 2 || rcp->cur != rcp->completed) { > + spin_unlock_irqrestore(&rcp->lock, flags); > + return; > + } > + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; > + spin_unlock_irqrestore(&rcp->lock, flags); > + > + /* OK, time to rat on our buddy... */ > + > + printk(KERN_ERR "RCU detected CPU stalls:"); > + for_each_possible_cpu(cpu) { > + if (cpu_isset(cpu, rcp->cpumask)) > + printk(" %d", cpu); > + } > + printk(" (detected by %d, t=%ld jiffies)\n", > + smp_processor_id(), (long)(jiffies - rcp->gp_start)); > +} > + > +static void print_cpu_stall(struct rcu_ctrlblk *rcp) > +{ > + unsigned long flags; > + > + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", > + smp_processor_id(), jiffies, > + jiffies - rcp->gp_start); > + dump_stack(); > + spin_lock_irqsave(&rcp->lock, flags); > + if ((long)(jiffies - rcp->jiffies_stall) >= 0) > + rcp->jiffies_stall = > + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; > + spin_unlock_irqrestore(&rcp->lock, flags); > + set_need_resched(); /* kick ourselves to get things going. */ > +} > + > +static void check_cpu_stall(struct rcu_ctrlblk *rcp) > +{ > + long delta; > + > + delta = jiffies - rcp->jiffies_stall; > + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { > + > + /* We haven't checked in, so go dump stack. */ > + print_cpu_stall(rcp); > + > + } else if (rcp->cur != rcp->completed && delta >= 2) { > + > + /* They had two seconds to dump stack, so complain. */ > + print_other_cpu_stall(rcp); > + } > +} > + > +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > + > +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) > +{ > +} > + > +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > +{ > +} > + > +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > + > /** > * call_rcu - Queue an RCU callback for invocation after a grace period. > * @head: structure to be used for queueing the RCU updates. > @@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp) > * period (if necessary). > */ > > -#ifdef CONFIG_DEBUG_RCU_STALL > - > -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) > -{ > - rcp->gp_check = get_seconds() + 3; > -} > - > -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) > -{ > - int cpu; > - long delta; > - unsigned long flags; > - > - /* Only let one CPU complain about others per time interval. */ > - > - spin_lock_irqsave(&rcp->lock, flags); > - delta = get_seconds() - rcp->gp_check; > - if (delta < 2L || cpus_empty(rcp->cpumask)) { > - spin_unlock(&rcp->lock); > - return; > - } > - rcp->gp_check = get_seconds() + 30; > - spin_unlock_irqrestore(&rcp->lock, flags); > - > - /* OK, time to rat on our buddy... */ > - > - printk(KERN_ERR "RCU detected CPU stalls:"); > - for_each_cpu_mask(cpu, rcp->cpumask) > - printk(" %d", cpu); > - printk(" (detected by %d, t=%lu/%lu)\n", > - smp_processor_id(), get_seconds(), rcp->gp_check); > -} > - > -static void print_cpu_stall(struct rcu_ctrlblk *rcp) > -{ > - unsigned long flags; > - > - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", > - smp_processor_id(), get_seconds(), rcp->gp_check); > - dump_stack(); > - spin_lock_irqsave(&rcp->lock, flags); > - if ((long)(get_seconds() - rcp->gp_check) >= 0L) > - rcp->gp_check = get_seconds() + 30; > - spin_unlock_irqrestore(&rcp->lock, flags); > -} > - > -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > -{ > - long delta; > - > - delta = get_seconds() - rcp->gp_check; > - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { > - > - /* We haven't checked in, so go dump stack. */ > - > - print_cpu_stall(rcp); > - > - } else { > - if (!cpus_empty(rcp->cpumask) && delta >= 2L) { > - /* They had two seconds to dump stack, so complain. */ > - print_other_cpu_stall(rcp); > - } > - } > -} > - > -#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ > - > -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) > -{ > -} > - > -static inline void > -check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > -{ > -} > - > -#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ > - > /* > * Register a new batch of callbacks, and start it up if there is currently no > * active batch and the batch to be registered has not already occurred. > @@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) > if (rcp->cur != rcp->pending && > rcp->completed == rcp->cur) { > rcp->cur++; > - record_gp_check_time(rcp); > + record_gp_stall_check_time(rcp); > > /* > * Accessing nohz_cpu_mask before incrementing rcp->cur needs a > @@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) > static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > { > /* Check for CPU stalls, if enabled. */ > - check_cpu_stall(rcp, rdp); > + check_cpu_stall(rcp); > > if (rdp->nxtlist) { > long completed_snap = ACCESS_ONCE(rcp->completed); > @@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { > */ > void __init __rcu_init(void) > { > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, > (void *)(long)smp_processor_id()); > /* Register notifier for non-boot CPUs */ > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > index 4e921a8..e0e0582 100644 > --- a/lib/Kconfig.debug > +++ b/lib/Kconfig.debug > @@ -616,7 +616,7 @@ config RCU_TORTURE_TEST_RUNNABLE > Say N here if you want the RCU torture tests to start only > after being manually enabled via /proc. > > -config RCU_CPU_STALL > +config RCU_CPU_STALL_DETECTOR > bool "Check for stalled CPUs delaying RCU grace periods" > depends on CLASSIC_RCU > default n -- Mathieu Desnoyers OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU 2008-10-03 4:41 ` Mathieu Desnoyers @ 2008-10-03 5:02 ` Paul E. McKenney 0 siblings, 0 replies; 17+ messages in thread From: Paul E. McKenney @ 2008-10-03 5:02 UTC (permalink / raw) To: Mathieu Desnoyers Cc: linux-kernel, Ingo Molnar, akpm, laijs, manfred, dipankar, niv, dvhltc, josht, ltt-dev On Fri, Oct 03, 2008 at 12:41:27AM -0400, Mathieu Desnoyers wrote: > * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote: > > Hello! > > > > This patch adds stalled-CPU detection to Classic RCU. This capability > > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which > > defaults disabled. This is a debugging feature to detect infinite loops > > in kernel code, not something that non-kernel-hackers would be expected > > to care about. This feature can detect looping CPUs in !PREEMPT builds > > and looping CPUs with preemption disabled in PREEMPT builds. This is > > essentially a port of this functionality from the treercu patch, replacing > > the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4). > > > > The changes from the patch in tip/core/rcu include making the config > > variable name match that in treercu, changing from seconds to jiffies to > > avoid spurious warnings, and printing a boot message when this feature > > is enabled. > > > > Hi Paul, > > Thanks for the previous explanations. Out of curiosity, what can this > patch do that the nmi watchdog can't do ? Hello, Mathieu, Operate on architectures that don't have NMIs. Possibly allow shorter timeouts than the NMI watchdog. Operate on a per-CPU basis (or does the NMI watchdog do that these days?). Thanx, Paul > Mathieu > > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > > --- > > > > include/linux/rcuclassic.h | 12 ++- > > kernel/rcuclassic.c | 166 +++++++++++++++++++++++---------------------- > > lib/Kconfig.debug | 2 > > 3 files changed, 96 insertions(+), 84 deletions(-) > > > > diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h > > index 29bf528..2d72d20 100644 > > --- a/include/linux/rcuclassic.h > > +++ b/include/linux/rcuclassic.h > > @@ -40,15 +40,21 @@ > > #include <linux/cpumask.h> > > #include <linux/seqlock.h> > > > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > > +#define RCU_SECONDS_TILL_STALL_CHECK 3 * HZ /* for rcp->jiffies_stall */ > > +#define RCU_SECONDS_TILL_STALL_RECHECK 30 * HZ /* for rcp->jiffies_stall */ > > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > > > /* Global control variables for rcupdate callback mechanism. */ > > struct rcu_ctrlblk { > > long cur; /* Current batch number. */ > > long completed; /* Number of the last completed batch */ > > long pending; /* Number of the last pending batch */ > > -#ifdef CONFIG_DEBUG_RCU_STALL > > - unsigned long gp_check; /* Time grace period should end, in seconds. */ > > -#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > > + unsigned long gp_start; /* Time at which GP started in jiffies. */ > > + unsigned long jiffies_stall; > > + /* Time at which to check for CPU stalls. */ > > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > > > int signaled; > > > > diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c > > index ed15128..eae2fb6 100644 > > --- a/kernel/rcuclassic.c > > +++ b/kernel/rcuclassic.c > > @@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, > > } > > } > > > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > > + > > +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) > > +{ > > + rcp->gp_start = jiffies; > > + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; > > +} > > + > > +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) > > +{ > > + int cpu; > > + long delta; > > + unsigned long flags; > > + > > + /* Only let one CPU complain about others per time interval. */ > > + > > + spin_lock_irqsave(&rcp->lock, flags); > > + delta = jiffies - rcp->jiffies_stall; > > + if (delta < 2 || rcp->cur != rcp->completed) { > > + spin_unlock_irqrestore(&rcp->lock, flags); > > + return; > > + } > > + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; > > + spin_unlock_irqrestore(&rcp->lock, flags); > > + > > + /* OK, time to rat on our buddy... */ > > + > > + printk(KERN_ERR "RCU detected CPU stalls:"); > > + for_each_possible_cpu(cpu) { > > + if (cpu_isset(cpu, rcp->cpumask)) > > + printk(" %d", cpu); > > + } > > + printk(" (detected by %d, t=%ld jiffies)\n", > > + smp_processor_id(), (long)(jiffies - rcp->gp_start)); > > +} > > + > > +static void print_cpu_stall(struct rcu_ctrlblk *rcp) > > +{ > > + unsigned long flags; > > + > > + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", > > + smp_processor_id(), jiffies, > > + jiffies - rcp->gp_start); > > + dump_stack(); > > + spin_lock_irqsave(&rcp->lock, flags); > > + if ((long)(jiffies - rcp->jiffies_stall) >= 0) > > + rcp->jiffies_stall = > > + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; > > + spin_unlock_irqrestore(&rcp->lock, flags); > > + set_need_resched(); /* kick ourselves to get things going. */ > > +} > > + > > +static void check_cpu_stall(struct rcu_ctrlblk *rcp) > > +{ > > + long delta; > > + > > + delta = jiffies - rcp->jiffies_stall; > > + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { > > + > > + /* We haven't checked in, so go dump stack. */ > > + print_cpu_stall(rcp); > > + > > + } else if (rcp->cur != rcp->completed && delta >= 2) { > > + > > + /* They had two seconds to dump stack, so complain. */ > > + print_other_cpu_stall(rcp); > > + } > > +} > > + > > +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > + > > +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) > > +{ > > +} > > + > > +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > > +{ > > +} > > + > > +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > + > > /** > > * call_rcu - Queue an RCU callback for invocation after a grace period. > > * @head: structure to be used for queueing the RCU updates. > > @@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp) > > * period (if necessary). > > */ > > > > -#ifdef CONFIG_DEBUG_RCU_STALL > > - > > -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) > > -{ > > - rcp->gp_check = get_seconds() + 3; > > -} > > - > > -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) > > -{ > > - int cpu; > > - long delta; > > - unsigned long flags; > > - > > - /* Only let one CPU complain about others per time interval. */ > > - > > - spin_lock_irqsave(&rcp->lock, flags); > > - delta = get_seconds() - rcp->gp_check; > > - if (delta < 2L || cpus_empty(rcp->cpumask)) { > > - spin_unlock(&rcp->lock); > > - return; > > - } > > - rcp->gp_check = get_seconds() + 30; > > - spin_unlock_irqrestore(&rcp->lock, flags); > > - > > - /* OK, time to rat on our buddy... */ > > - > > - printk(KERN_ERR "RCU detected CPU stalls:"); > > - for_each_cpu_mask(cpu, rcp->cpumask) > > - printk(" %d", cpu); > > - printk(" (detected by %d, t=%lu/%lu)\n", > > - smp_processor_id(), get_seconds(), rcp->gp_check); > > -} > > - > > -static void print_cpu_stall(struct rcu_ctrlblk *rcp) > > -{ > > - unsigned long flags; > > - > > - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", > > - smp_processor_id(), get_seconds(), rcp->gp_check); > > - dump_stack(); > > - spin_lock_irqsave(&rcp->lock, flags); > > - if ((long)(get_seconds() - rcp->gp_check) >= 0L) > > - rcp->gp_check = get_seconds() + 30; > > - spin_unlock_irqrestore(&rcp->lock, flags); > > -} > > - > > -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > > -{ > > - long delta; > > - > > - delta = get_seconds() - rcp->gp_check; > > - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { > > - > > - /* We haven't checked in, so go dump stack. */ > > - > > - print_cpu_stall(rcp); > > - > > - } else { > > - if (!cpus_empty(rcp->cpumask) && delta >= 2L) { > > - /* They had two seconds to dump stack, so complain. */ > > - print_other_cpu_stall(rcp); > > - } > > - } > > -} > > - > > -#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ > > - > > -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) > > -{ > > -} > > - > > -static inline void > > -check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > > -{ > > -} > > - > > -#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ > > - > > /* > > * Register a new batch of callbacks, and start it up if there is currently no > > * active batch and the batch to be registered has not already occurred. > > @@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) > > if (rcp->cur != rcp->pending && > > rcp->completed == rcp->cur) { > > rcp->cur++; > > - record_gp_check_time(rcp); > > + record_gp_stall_check_time(rcp); > > > > /* > > * Accessing nohz_cpu_mask before incrementing rcp->cur needs a > > @@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) > > static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > > { > > /* Check for CPU stalls, if enabled. */ > > - check_cpu_stall(rcp, rdp); > > + check_cpu_stall(rcp); > > > > if (rdp->nxtlist) { > > long completed_snap = ACCESS_ONCE(rcp->completed); > > @@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { > > */ > > void __init __rcu_init(void) > > { > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > > + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); > > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, > > (void *)(long)smp_processor_id()); > > /* Register notifier for non-boot CPUs */ > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > > index 4e921a8..e0e0582 100644 > > --- a/lib/Kconfig.debug > > +++ b/lib/Kconfig.debug > > @@ -616,7 +616,7 @@ config RCU_TORTURE_TEST_RUNNABLE > > Say N here if you want the RCU torture tests to start only > > after being manually enabled via /proc. > > > > -config RCU_CPU_STALL > > +config RCU_CPU_STALL_DETECTOR > > bool "Check for stalled CPUs delaying RCU grace periods" > > depends on CLASSIC_RCU > > default n > > -- > Mathieu Desnoyers > OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU 2008-10-02 23:06 ` [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney 2008-10-03 4:41 ` Mathieu Desnoyers @ 2008-10-03 8:39 ` Ingo Molnar 2008-10-03 8:42 ` [PATCH] rcu: RCU-based detection of stalled CPUs for Classic RCU, fix Ingo Molnar 2008-10-03 15:13 ` [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney 1 sibling, 2 replies; 17+ messages in thread From: Ingo Molnar @ 2008-10-03 8:39 UTC (permalink / raw) To: Paul E. McKenney Cc: linux-kernel, mathieu.desnoyers, akpm, laijs, manfred, dipankar, niv, dvhltc, josht, ltt-dev * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote: > Hello! > > This patch adds stalled-CPU detection to Classic RCU. This capability > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > which defaults disabled. This is a debugging feature to detect > infinite loops in kernel code, not something that non-kernel-hackers > would be expected to care about. This feature can detect looping CPUs > in !PREEMPT builds and looping CPUs with preemption disabled in > PREEMPT builds. This is essentially a port of this functionality from > the treercu patch, replacing the stall debug patch that is already in > tip/core/rcu (commit 67182ae1c4). > > The changes from the patch in tip/core/rcu include making the config > variable name match that in treercu, changing from seconds to jiffies > to avoid spurious warnings, and printing a boot message when this > feature is enabled. > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> applied to tip/core/rcu, thanks Paul! i tidied up two tiny details noticed by scripts/checkpatch.pl: > +#define RCU_SECONDS_TILL_STALL_CHECK 3 * HZ /* for rcp->jiffies_stall */ > +#define RCU_SECONDS_TILL_STALL_RECHECK 30 * HZ /* for rcp->jiffies_stall */ adding paranthesis around the expression. > + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { > + and have removed the two tailing tabs from this line. Ingo ^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH] rcu: RCU-based detection of stalled CPUs for Classic RCU, fix 2008-10-03 8:39 ` Ingo Molnar @ 2008-10-03 8:42 ` Ingo Molnar 2008-10-03 15:14 ` Paul E. McKenney 2008-10-03 15:13 ` [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney 1 sibling, 1 reply; 17+ messages in thread From: Ingo Molnar @ 2008-10-03 8:42 UTC (permalink / raw) To: Paul E. McKenney Cc: linux-kernel, mathieu.desnoyers, akpm, laijs, manfred, dipankar, niv, dvhltc, josht, ltt-dev -tip testing found a minor build bug - find the fix below. Ingo --------------------> >From 2ec2b482b10a1ed3493c224f1893cddd3d33833b Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 3 Oct 2008 10:41:00 +0200 Subject: [PATCH] rcu: RCU-based detection of stalled CPUs for Classic RCU, fix fix the !CONFIG_RCU_CPU_STALL_DETECTOR path: kernel/rcuclassic.c: In function '__rcu_pending': kernel/rcuclassic.c:609: error: too few arguments to function 'check_cpu_stall' Signed-off-by: Ingo Molnar <mingo@elte.hu> --- kernel/rcuclassic.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 0d07e6e..37f72e5 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -239,7 +239,7 @@ static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) { } -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) { } ^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH] rcu: RCU-based detection of stalled CPUs for Classic RCU, fix 2008-10-03 8:42 ` [PATCH] rcu: RCU-based detection of stalled CPUs for Classic RCU, fix Ingo Molnar @ 2008-10-03 15:14 ` Paul E. McKenney 0 siblings, 0 replies; 17+ messages in thread From: Paul E. McKenney @ 2008-10-03 15:14 UTC (permalink / raw) To: Ingo Molnar Cc: linux-kernel, mathieu.desnoyers, akpm, laijs, manfred, dipankar, niv, dvhltc, josht, ltt-dev On Fri, Oct 03, 2008 at 10:42:33AM +0200, Ingo Molnar wrote: > > -tip testing found a minor build bug - find the fix below. > > Ingo > > --------------------> > >From 2ec2b482b10a1ed3493c224f1893cddd3d33833b Mon Sep 17 00:00:00 2001 > From: Ingo Molnar <mingo@elte.hu> > Date: Fri, 3 Oct 2008 10:41:00 +0200 > Subject: [PATCH] rcu: RCU-based detection of stalled CPUs for Classic RCU, fix > > fix the !CONFIG_RCU_CPU_STALL_DETECTOR path: > > kernel/rcuclassic.c: In function '__rcu_pending': > kernel/rcuclassic.c:609: error: too few arguments to function 'check_cpu_stall' > > Signed-off-by: Ingo Molnar <mingo@elte.hu> > --- > kernel/rcuclassic.c | 2 +- > 1 files changed, 1 insertions(+), 1 deletions(-) > > diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c > index 0d07e6e..37f72e5 100644 > --- a/kernel/rcuclassic.c > +++ b/kernel/rcuclassic.c > @@ -239,7 +239,7 @@ static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) > { > } > > -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) > { > } I should have learned by now to test with all relevant config parameter combinations... My apologies for the hassle!!! Thanx, Paul ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU 2008-10-03 8:39 ` Ingo Molnar 2008-10-03 8:42 ` [PATCH] rcu: RCU-based detection of stalled CPUs for Classic RCU, fix Ingo Molnar @ 2008-10-03 15:13 ` Paul E. McKenney 1 sibling, 0 replies; 17+ messages in thread From: Paul E. McKenney @ 2008-10-03 15:13 UTC (permalink / raw) To: Ingo Molnar Cc: linux-kernel, mathieu.desnoyers, akpm, laijs, manfred, dipankar, niv, dvhltc, josht, ltt-dev On Fri, Oct 03, 2008 at 10:39:47AM +0200, Ingo Molnar wrote: > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote: > > > Hello! > > > > This patch adds stalled-CPU detection to Classic RCU. This capability > > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > > which defaults disabled. This is a debugging feature to detect > > infinite loops in kernel code, not something that non-kernel-hackers > > would be expected to care about. This feature can detect looping CPUs > > in !PREEMPT builds and looping CPUs with preemption disabled in > > PREEMPT builds. This is essentially a port of this functionality from > > the treercu patch, replacing the stall debug patch that is already in > > tip/core/rcu (commit 67182ae1c4). > > > > The changes from the patch in tip/core/rcu include making the config > > variable name match that in treercu, changing from seconds to jiffies > > to avoid spurious warnings, and printing a boot message when this > > feature is enabled. > > > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > > applied to tip/core/rcu, thanks Paul! > > i tidied up two tiny details noticed by scripts/checkpatch.pl: > > > +#define RCU_SECONDS_TILL_STALL_CHECK 3 * HZ /* for rcp->jiffies_stall */ > > +#define RCU_SECONDS_TILL_STALL_RECHECK 30 * HZ /* for rcp->jiffies_stall */ > > adding paranthesis around the expression. > > > + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { > > + > > and have removed the two tailing tabs from this line. <red face> checkpatch.pl... I have heard of it! :-) Thanx, Paul ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU 2008-10-02 14:05 ` Paul E. McKenney 2008-10-02 15:10 ` Mathieu Desnoyers @ 2008-10-02 17:39 ` Paul E. McKenney 2008-10-03 10:12 ` Lai Jiangshan 1 sibling, 1 reply; 17+ messages in thread From: Paul E. McKenney @ 2008-10-02 17:39 UTC (permalink / raw) To: Ingo Molnar Cc: linux-kernel, akpm, mathieu.desnoyers, laijs, manfred, dipankar, niv, dvhltc, josht Hello again! This patch adds stalled-CPU detection to Classic RCU. This capability is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which defaults disabled. This is a debugging feature, not something that non-kernel-hackers would be expected to care about. This feature can detect looping CPUs in !PREEMPT builds and looping CPUs with preemption disabled in PREEMPT builds. This is essentially a port of this functionality from the treercu patch. This version uses jiffies rather than get_seconds(), which eliminates the spurious boot-time CPU stall warnings seen on some systems with the previous patch. This is still against v2.6.27-rc8 -- I will do a version against tip this evening (Pacific Time) when I get back to the combination of better bandwidth and AC power. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- include/linux/rcuclassic.h | 9 ++++ kernel/rcuclassic.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 13 ++++++ 3 files changed, 110 insertions(+) diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 4ab8436..cab055b 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -40,6 +40,10 @@ #include <linux/cpumask.h> #include <linux/seqlock.h> +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK 3 * HZ /* for rcp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK 30 * HZ /* for rcp->jiffies_stall */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { @@ -52,6 +56,11 @@ struct rcu_ctrlblk { spinlock_t lock ____cacheline_internodealigned_in_smp; cpumask_t cpumask; /* CPUs that need to switch in order */ /* for current batch to proceed. */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started in jiffies. */ + unsigned long jiffies_stall; + /* Time at which to check for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ } ____cacheline_internodealigned_in_smp; /* Is batch a before batch b ? */ diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index aad93cd..a299876 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -118,6 +118,87 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -285,6 +366,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) */ smp_wmb(); rcp->cur++; + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -468,6 +550,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp); + /* This cpu has pending rcu entries and the grace period * for them has completed. */ @@ -558,6 +643,9 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { +#ifdef CONFIG_DEBUG_RCU_STALL + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ memset(rdp, 0, sizeof(*rdp)); rdp->curtail = &rdp->curlist; rdp->nxttail = &rdp->nxtlist; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0b50481..9fee969 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. +config RCU_CPU_STALL_DETECTOR + bool "Check for stalled CPUs delaying RCU grace periods" + depends on CLASSIC_RCU + default n + help + This option causes RCU to printk information on which + CPUs are delaying the current grace period, but only when + the grace period extends for excessive time periods. + + Say Y if you want RCU to perform such checks. + + Say N if you are unsure. + config KPROBES_SANITY_TEST bool "Kprobes sanity tests" depends on DEBUG_KERNEL ^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU 2008-10-02 17:39 ` [PATCH,RFC] " Paul E. McKenney @ 2008-10-03 10:12 ` Lai Jiangshan 2008-10-03 15:12 ` Paul E. McKenney 0 siblings, 1 reply; 17+ messages in thread From: Lai Jiangshan @ 2008-10-03 10:12 UTC (permalink / raw) To: paulmck Cc: Ingo Molnar, linux-kernel, akpm, mathieu.desnoyers, manfred, dipankar, niv, dvhltc, josht I found a magic number in it. Paul E. McKenney wrote: > Hello again! > > This patch adds stalled-CPU detection to Classic RCU. This capability > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > which defaults disabled. This is a debugging feature, not something > that non-kernel-hackers would be expected to care about. This feature > can detect looping CPUs in !PREEMPT builds and looping CPUs with > preemption disabled in PREEMPT builds. This is essentially a port of > this functionality from the treercu patch. > > This version uses jiffies rather than get_seconds(), which eliminates > the spurious boot-time CPU stall warnings seen on some systems with > the previous patch. > > This is still against v2.6.27-rc8 -- I will do a version against tip > this evening (Pacific Time) when I get back to the combination of better > bandwidth and AC power. > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > --- > > include/linux/rcuclassic.h | 9 ++++ > kernel/rcuclassic.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ > lib/Kconfig.debug | 13 ++++++ > 3 files changed, 110 insertions(+) > > diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h > index 4ab8436..cab055b 100644 > --- a/include/linux/rcuclassic.h > +++ b/include/linux/rcuclassic.h > @@ -40,6 +40,10 @@ > #include <linux/cpumask.h> > #include <linux/seqlock.h> > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > +#define RCU_SECONDS_TILL_STALL_CHECK 3 * HZ /* for rcp->jiffies_stall */ > +#define RCU_SECONDS_TILL_STALL_RECHECK 30 * HZ /* for rcp->jiffies_stall */ > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > /* Global control variables for rcupdate callback mechanism. */ > struct rcu_ctrlblk { > @@ -52,6 +56,11 @@ struct rcu_ctrlblk { > spinlock_t lock ____cacheline_internodealigned_in_smp; > cpumask_t cpumask; /* CPUs that need to switch in order */ > /* for current batch to proceed. */ > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > + unsigned long gp_start; /* Time at which GP started in jiffies. */ > + unsigned long jiffies_stall; > + /* Time at which to check for CPU stalls. */ > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > } ____cacheline_internodealigned_in_smp; > > /* Is batch a before batch b ? */ > diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c > index aad93cd..a299876 100644 > --- a/kernel/rcuclassic.c > +++ b/kernel/rcuclassic.c > @@ -118,6 +118,87 @@ static inline void force_quiescent_state(struct rcu_data *rdp, > } > #endif > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > + > +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) > +{ > + rcp->gp_start = jiffies; > + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; > +} > + > +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) > +{ > + int cpu; > + long delta; > + unsigned long flags; > + > + /* Only let one CPU complain about others per time interval. */ > + > + spin_lock_irqsave(&rcp->lock, flags); > + delta = jiffies - rcp->jiffies_stall; > + if (delta < 2 || rcp->cur != rcp->completed) { Is it (2 * HZ)? should it be defined as macro? > + spin_unlock_irqrestore(&rcp->lock, flags); > + return; > + } > + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; > + spin_unlock_irqrestore(&rcp->lock, flags); > + > + /* OK, time to rat on our buddy... */ > + > + printk(KERN_ERR "RCU detected CPU stalls:"); > + for_each_possible_cpu(cpu) { > + if (cpu_isset(cpu, rcp->cpumask)) > + printk(" %d", cpu); > + } > + printk(" (detected by %d, t=%ld jiffies)\n", > + smp_processor_id(), (long)(jiffies - rcp->gp_start)); > +} > + > +static void print_cpu_stall(struct rcu_ctrlblk *rcp) > +{ > + unsigned long flags; > + > + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", > + smp_processor_id(), jiffies, > + jiffies - rcp->gp_start); > + dump_stack(); > + spin_lock_irqsave(&rcp->lock, flags); > + if ((long)(jiffies - rcp->jiffies_stall) >= 0) > + rcp->jiffies_stall = > + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; > + spin_unlock_irqrestore(&rcp->lock, flags); > + set_need_resched(); /* kick ourselves to get things going. */ > +} > + > +static void check_cpu_stall(struct rcu_ctrlblk *rcp) > +{ > + long delta; > + > + delta = jiffies - rcp->jiffies_stall; > + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { > + > + /* We haven't checked in, so go dump stack. */ > + print_cpu_stall(rcp); > + > + } else if (rcp->cur != rcp->completed && delta >= 2) { > + > + /* They had two seconds to dump stack, so complain. */ appear here again! and it's inconsistent with comment. > + print_other_cpu_stall(rcp); > + } > +} > + > +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > + > +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) > +{ > +} > + > +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > +{ > +} > + > +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > + > /** > * call_rcu - Queue an RCU callback for invocation after a grace period. > * @head: structure to be used for queueing the RCU updates. > @@ -285,6 +366,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) > */ > smp_wmb(); > rcp->cur++; > + record_gp_stall_check_time(rcp); > > /* > * Accessing nohz_cpu_mask before incrementing rcp->cur needs a > @@ -468,6 +550,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) > > static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > { > + /* Check for CPU stalls, if enabled. */ > + check_cpu_stall(rcp); > + > /* This cpu has pending rcu entries and the grace period > * for them has completed. > */ > @@ -558,6 +643,9 @@ void rcu_check_callbacks(int cpu, int user) > static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, > struct rcu_data *rdp) > { > +#ifdef CONFIG_DEBUG_RCU_STALL > + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); > +#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ > memset(rdp, 0, sizeof(*rdp)); > rdp->curtail = &rdp->curlist; > rdp->nxttail = &rdp->nxtlist; > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > index 0b50481..9fee969 100644 > --- a/lib/Kconfig.debug > +++ b/lib/Kconfig.debug > @@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE > Say N here if you want the RCU torture tests to start only > after being manually enabled via /proc. > > +config RCU_CPU_STALL_DETECTOR > + bool "Check for stalled CPUs delaying RCU grace periods" > + depends on CLASSIC_RCU > + default n > + help > + This option causes RCU to printk information on which > + CPUs are delaying the current grace period, but only when > + the grace period extends for excessive time periods. > + > + Say Y if you want RCU to perform such checks. > + > + Say N if you are unsure. > + > config KPROBES_SANITY_TEST > bool "Kprobes sanity tests" > depends on DEBUG_KERNEL > > > ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU 2008-10-03 10:12 ` Lai Jiangshan @ 2008-10-03 15:12 ` Paul E. McKenney 0 siblings, 0 replies; 17+ messages in thread From: Paul E. McKenney @ 2008-10-03 15:12 UTC (permalink / raw) To: Lai Jiangshan Cc: Ingo Molnar, linux-kernel, akpm, mathieu.desnoyers, manfred, dipankar, niv, dvhltc, josht On Fri, Oct 03, 2008 at 06:12:28PM +0800, Lai Jiangshan wrote: > I found a magic number in it. Good catch, Jiangshan!!! See below for responses, and please see the end of this email for an untested patch. > Paul E. McKenney wrote: > > Hello again! > > > > This patch adds stalled-CPU detection to Classic RCU. This capability > > is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, > > which defaults disabled. This is a debugging feature, not something > > that non-kernel-hackers would be expected to care about. This feature > > can detect looping CPUs in !PREEMPT builds and looping CPUs with > > preemption disabled in PREEMPT builds. This is essentially a port of > > this functionality from the treercu patch. > > > > This version uses jiffies rather than get_seconds(), which eliminates > > the spurious boot-time CPU stall warnings seen on some systems with > > the previous patch. > > > > This is still against v2.6.27-rc8 -- I will do a version against tip > > this evening (Pacific Time) when I get back to the combination of better > > bandwidth and AC power. > > > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > > --- > > > > include/linux/rcuclassic.h | 9 ++++ > > kernel/rcuclassic.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ > > lib/Kconfig.debug | 13 ++++++ > > 3 files changed, 110 insertions(+) > > > > diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h > > index 4ab8436..cab055b 100644 > > --- a/include/linux/rcuclassic.h > > +++ b/include/linux/rcuclassic.h > > @@ -40,6 +40,10 @@ > > #include <linux/cpumask.h> > > #include <linux/seqlock.h> > > > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > > +#define RCU_SECONDS_TILL_STALL_CHECK 3 * HZ /* for rcp->jiffies_stall */ > > +#define RCU_SECONDS_TILL_STALL_RECHECK 30 * HZ /* for rcp->jiffies_stall */ > > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > > > /* Global control variables for rcupdate callback mechanism. */ > > struct rcu_ctrlblk { > > @@ -52,6 +56,11 @@ struct rcu_ctrlblk { > > spinlock_t lock ____cacheline_internodealigned_in_smp; > > cpumask_t cpumask; /* CPUs that need to switch in order */ > > /* for current batch to proceed. */ > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > > + unsigned long gp_start; /* Time at which GP started in jiffies. */ > > + unsigned long jiffies_stall; > > + /* Time at which to check for CPU stalls. */ > > +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > } ____cacheline_internodealigned_in_smp; > > > > /* Is batch a before batch b ? */ > > diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c > > index aad93cd..a299876 100644 > > --- a/kernel/rcuclassic.c > > +++ b/kernel/rcuclassic.c > > @@ -118,6 +118,87 @@ static inline void force_quiescent_state(struct rcu_data *rdp, > > } > > #endif > > > > +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR > > + > > +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) > > +{ > > + rcp->gp_start = jiffies; > > + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; > > +} > > + > > +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) > > +{ > > + int cpu; > > + long delta; > > + unsigned long flags; > > + > > + /* Only let one CPU complain about others per time interval. */ > > + > > + spin_lock_irqsave(&rcp->lock, flags); > > + delta = jiffies - rcp->jiffies_stall; > > + if (delta < 2 || rcp->cur != rcp->completed) { > > Is it (2 * HZ)? > should it be defined as macro? It should be "2", though a macro might be good. The reason for "2" is that it guarantees that the other CPU had a full period to respond, so it would be "2" regardless of the time units. > > + spin_unlock_irqrestore(&rcp->lock, flags); > > + return; > > + } > > + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; > > + spin_unlock_irqrestore(&rcp->lock, flags); > > + > > + /* OK, time to rat on our buddy... */ > > + > > + printk(KERN_ERR "RCU detected CPU stalls:"); > > + for_each_possible_cpu(cpu) { > > + if (cpu_isset(cpu, rcp->cpumask)) > > + printk(" %d", cpu); > > + } > > + printk(" (detected by %d, t=%ld jiffies)\n", > > + smp_processor_id(), (long)(jiffies - rcp->gp_start)); > > +} > > + > > +static void print_cpu_stall(struct rcu_ctrlblk *rcp) > > +{ > > + unsigned long flags; > > + > > + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", > > + smp_processor_id(), jiffies, > > + jiffies - rcp->gp_start); > > + dump_stack(); > > + spin_lock_irqsave(&rcp->lock, flags); > > + if ((long)(jiffies - rcp->jiffies_stall) >= 0) > > + rcp->jiffies_stall = > > + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; > > + spin_unlock_irqrestore(&rcp->lock, flags); > > + set_need_resched(); /* kick ourselves to get things going. */ > > +} > > + > > +static void check_cpu_stall(struct rcu_ctrlblk *rcp) > > +{ > > + long delta; > > + > > + delta = jiffies - rcp->jiffies_stall; > > + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { > > + > > + /* We haven't checked in, so go dump stack. */ > > + print_cpu_stall(rcp); > > + > > + } else if (rcp->cur != rcp->completed && delta >= 2) { > > + > > + /* They had two seconds to dump stack, so complain. */ > > appear here again! and it's inconsistent with comment. Indeed! The comment is wrong. > > + print_other_cpu_stall(rcp); > > + } > > +} > > + > > +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > + > > +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) > > +{ > > +} > > + > > +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > > +{ > > +} > > + > > +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ > > + > > /** > > * call_rcu - Queue an RCU callback for invocation after a grace period. > > * @head: structure to be used for queueing the RCU updates. > > @@ -285,6 +366,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) > > */ > > smp_wmb(); > > rcp->cur++; > > + record_gp_stall_check_time(rcp); > > > > /* > > * Accessing nohz_cpu_mask before incrementing rcp->cur needs a > > @@ -468,6 +550,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) > > > > static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) > > { > > + /* Check for CPU stalls, if enabled. */ > > + check_cpu_stall(rcp); > > + > > /* This cpu has pending rcu entries and the grace period > > * for them has completed. > > */ > > @@ -558,6 +643,9 @@ void rcu_check_callbacks(int cpu, int user) > > static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, > > struct rcu_data *rdp) > > { > > +#ifdef CONFIG_DEBUG_RCU_STALL > > + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); > > +#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ > > memset(rdp, 0, sizeof(*rdp)); > > rdp->curtail = &rdp->curlist; > > rdp->nxttail = &rdp->nxtlist; > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > > index 0b50481..9fee969 100644 > > --- a/lib/Kconfig.debug > > +++ b/lib/Kconfig.debug > > @@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE > > Say N here if you want the RCU torture tests to start only > > after being manually enabled via /proc. > > > > +config RCU_CPU_STALL_DETECTOR > > + bool "Check for stalled CPUs delaying RCU grace periods" > > + depends on CLASSIC_RCU > > + default n > > + help > > + This option causes RCU to printk information on which > > + CPUs are delaying the current grace period, but only when > > + the grace period extends for excessive time periods. > > + > > + Say Y if you want RCU to perform such checks. > > + > > + Say N if you are unsure. > > + > > config KPROBES_SANITY_TEST > > bool "Kprobes sanity tests" > > depends on DEBUG_KERNEL Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- include/linux/rcuclassic.h | 4 ++++ kernel/rcuclassic.c | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 8388647..69e1929 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -43,6 +43,10 @@ #ifdef CONFIG_RCU_CPU_STALL_DETECTOR #define RCU_SECONDS_TILL_STALL_CHECK (3 * HZ) /* for rcp->jiffies_stall */ #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* Global control variables for rcupdate callback mechanism. */ diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 0ce0802..8c2d15d 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -136,7 +136,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) spin_lock_irqsave(&rcp->lock, flags); delta = jiffies - rcp->seconds_stall; - if (delta < 2 || rcp->cur != rcp->completed) { + if (delta < RCU_STALL_RAT_DELAY || rcp->cur != rcp->completed) { spin_unlock_irqrestore(&rcp->lock, flags); return; } @@ -180,9 +180,9 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(rcp); - } else if (rcp->cur != rcp->completed && delta >= 2) { + } else if (rcp->cur != rcp->completed && delta >= RCU_STALL_RAT_DELAY) { - /* They had two seconds to dump stack, so complain. */ + /* They had two time units to dump stack, so complain. */ print_other_cpu_stall(rcp); } } ^ permalink raw reply related [flat|nested] 17+ messages in thread
end of thread, other threads:[~2008-10-03 15:14 UTC | newest] Thread overview: 17+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-10-02 0:36 [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney 2008-10-02 8:07 ` Ingo Molnar 2008-10-02 14:05 ` Paul E. McKenney 2008-10-02 15:10 ` Mathieu Desnoyers 2008-10-02 17:43 ` Paul E. McKenney 2008-10-02 18:51 ` [PATCH,RFC] RCU-based detection of stalled CPUs for Classic RCU (LTTng support) Mathieu Desnoyers 2008-10-02 22:55 ` Paul E. McKenney 2008-10-02 23:06 ` [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney 2008-10-03 4:41 ` Mathieu Desnoyers 2008-10-03 5:02 ` Paul E. McKenney 2008-10-03 8:39 ` Ingo Molnar 2008-10-03 8:42 ` [PATCH] rcu: RCU-based detection of stalled CPUs for Classic RCU, fix Ingo Molnar 2008-10-03 15:14 ` Paul E. McKenney 2008-10-03 15:13 ` [PATCH tip/master] RCU-based detection of stalled CPUs for Classic RCU Paul E. McKenney 2008-10-02 17:39 ` [PATCH,RFC] " Paul E. McKenney 2008-10-03 10:12 ` Lai Jiangshan 2008-10-03 15:12 ` Paul E. McKenney
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox