* [Linux-ia64] 2.4.5 hangs in smp_call_function.
@ 2001-06-07 22:00 Jack Steiner
2001-06-08 18:49 ` Seth, Rohit
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Jack Steiner @ 2001-06-07 22:00 UTC (permalink / raw)
To: linux-ia64
Since upgrading to 2.4.5, we have seen several system hangs
where multiple cpus were spinning in smp_call_function.
The problem appears to be caused by the code in smp_call_function()
that resends an IPI if a timeout expires.
Resending an IPI_CALL_FUNC IPI can cause a cpu to process the "call_func"
request twice and corrupt the "data.finished" count by incrementing the
count twice for one request.
Here is a patch that corrects the problem. I'm not sure what the
correct timeout should be - I increased it from HZ to 400000UL,
but more investigation needs to be done to determine the optimum
value. Since the "resendIPI" code is not needed with C0 stepping
cpus, I didn't worry too much about the timeout value.
I haven't seen any more hangs after applying the patch.
----------------------------------------------------------------------------------
diff -Naur linux_base/arch/ia64/kernel/smp.c linux/arch/ia64/kernel/smp.c
--- linux_base/arch/ia64/kernel/smp.c Thu Jun 7 14:44:07 2001
+++ linux/arch/ia64/kernel/smp.c Thu Jun 7 14:46:05 2001
@@ -244,6 +244,28 @@
send_IPI_single(smp_processor_id(), op);
}
+#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC) || defined(CONFIG_ITANIUM_PTCG))
+void
+resend_lost_IPI (void)
+{
+ /*
+ * Really need a null IPI but since this rarely should happen & since this code
+ * will go away, lets not add one.
+ */
+ send_IPI_allbutself(IPI_RESCHEDULE);
+}
+
+void
+resend_lost_IPI_single (int cpu)
+{
+ /*
+ * Really need a null IPI but since this rarely should happen & since this code
+ * will go away, lets not add one.
+ */
+ send_IPI_single(cpu, IPI_RESCHEDULE);
+}
+#endif /* CONFIG_ITANIUM_ASTEP_SPECIFIC || CONFIG_ITANIUM_BSTEP_SPECIFIC || CONFIG_ITANIUM_PTCG */
+
void
smp_send_reschedule (int cpu)
{
@@ -258,16 +280,6 @@
send_IPI_allbutself(IPI_FLUSH_TLB);
}
-void
-smp_resend_flush_tlb (void)
-{
- /*
- * Really need a null IPI but since this rarely should happen & since this code
- * will go away, lets not add one.
- */
- send_IPI_allbutself(IPI_RESCHEDULE);
-}
-
#endif /* !CONFIG_ITANIUM_PTCG */
void
@@ -314,16 +326,18 @@
spin_lock_bh(&call_lock);
call_data = &data;
- resend:
send_IPI_single(cpuid, IPI_CALL_FUNC);
#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC))
/* Wait for response */
- timeout = jiffies + HZ;
+ again:
+ timeout = jiffies + 400000UL;
while ((atomic_read(&data.started) != cpus) && time_before(jiffies, timeout))
barrier();
- if (atomic_read(&data.started) != cpus)
- goto resend;
+ if (atomic_read(&data.started) != cpus) {
+ resend_lost_IPI_single(cpuid);
+ goto again;
+ }
#else
/* Wait for response */
while (atomic_read(&data.started) != cpus)
@@ -379,17 +393,19 @@
spin_lock_bh(&call_lock);
call_data = &data;
- resend:
/* Send a message to all other CPUs and wait for them to respond */
send_IPI_allbutself(IPI_CALL_FUNC);
#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC))
/* Wait for response */
- timeout = jiffies + HZ;
+ again:
+ timeout = jiffies + 400000UL;
while ((atomic_read(&data.started) != cpus) && time_before(jiffies, timeout))
barrier();
- if (atomic_read(&data.started) != cpus)
- goto resend;
+ if (atomic_read(&data.started) != cpus) {
+ resend_lost_IPI();
+ goto again;
+ }
#else
/* Wait for response */
while (atomic_read(&data.started) != cpus)
diff -Naur linux_base/arch/ia64/mm/tlb.c linux/arch/ia64/mm/tlb.c
--- linux_base/arch/ia64/mm/tlb.c Thu Jun 7 14:44:07 2001
+++ linux/arch/ia64/mm/tlb.c Thu Jun 7 14:46:07 2001
@@ -99,12 +99,12 @@
*/
#if defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC)
{
- extern void smp_resend_flush_tlb (void);
+ extern void smp_resend_lost_IPI (void);
unsigned long start = ia64_get_itc();
while (atomic_read(&flush_cpu_count) > 0) {
if ((ia64_get_itc() - start) > 400000UL) {
- smp_resend_flush_tlb();
+ smp_resend_lost_IPI();
start = ia64_get_itc();
}
}
--
Thanks
Jack Steiner (651-683-5302) (vnet 233-5302) steiner@sgi.com
^ permalink raw reply [flat|nested] 4+ messages in thread
* RE: [Linux-ia64] 2.4.5 hangs in smp_call_function.
2001-06-07 22:00 [Linux-ia64] 2.4.5 hangs in smp_call_function Jack Steiner
@ 2001-06-08 18:49 ` Seth, Rohit
2001-06-08 20:03 ` David Mosberger
2001-06-12 6:37 ` root
2 siblings, 0 replies; 4+ messages in thread
From: Seth, Rohit @ 2001-06-08 18:49 UTC (permalink / raw)
To: linux-ia64
Jack,
The only change that is needed is to change the value of timeout from HZ to
400... The race condition where the source processor is leaving the while
(timeout) loop and the target processor is just about to send the
acknowledgement (by setting the started field) will most likely not happen
with such a big timeout. And as this code is only for earlier steppings, it
should be okay. I think that HZ itself is big enough, but it seems
like your system is under severe load and taking longer to respond to IPIs.
Besides that, the other changes are not needed. In fact, there are a couple of
things that you should not do: 1: don't include resend_lost_IPI even when
CONFIG_ITANIUM_PTCG is defined; 2: don't just continue to be in the while loop
without resending the IPI again for the cases when you really have a lost
IPI.
rohit
-----Original Message-----
From: Jack Steiner [mailto:steiner@sgi.com]
Sent: Thursday, June 07, 2001 3:00 PM
To: linux-ia64@linuxia64.org
Subject: [Linux-ia64] 2.4.5 hangs in smp_call_function.
Since upgrading to 2.4.5, we have seen several system hangs
where multiple cpus were spinning in smp_call_function.
The problem appears to be caused by the code in smp_call_function()
that resends an IPI if a timeout expires.
Resending a IPI_CALL_FUNC IPI can cause a cpu to process the "call_func"
request twice and corrupt the "data.finished" count by incrementing the
count twice for one request.
Here is a patch that corrects the problem. I'm not sure what the
correct timeout should be - I increase it from HZ to 400000UL
but more investigation need to be done to determine the optimum
value. Since the "resendIPI" code is not needed with C0 stepping
cpus, I didnt worry too much about the timeout value.
I havent seen any more hangs after applying the patch.
----------------------------------------------------------------------------
------
diff -Naur linux_base/arch/ia64/kernel/smp.c linux/arch/ia64/kernel/smp.c
--- linux_base/arch/ia64/kernel/smp.c Thu Jun 7 14:44:07 2001
+++ linux/arch/ia64/kernel/smp.c Thu Jun 7 14:46:05 2001
@@ -244,6 +244,28 @@
send_IPI_single(smp_processor_id(), op);
}
+#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) ||
defined(CONFIG_ITANIUM_BSTEP_SPECIFIC) || defined(CONFIG_ITANIUM_PTCG))
+void
+resend_lost_IPI (void)
+{
+ /*
+ * Really need a null IPI but since this rarely should happen &
since this code
+ * will go away, lets not add one.
+ */
+ send_IPI_allbutself(IPI_RESCHEDULE);
+}
+
+void
+resend_lost_IPI_single (int cpu)
+{
+ /*
+ * Really need a null IPI but since this rarely should happen &
since this code
+ * will go away, lets not add one.
+ */
+ send_IPI_single(cpu, IPI_RESCHEDULE);
+}
+#endif /* CONFIG_ITANIUM_ASTEP_SPECIFIC || CONFIG_ITANIUM_BSTEP_SPECIFIC ||
CONFIG_ITANIUM_PTCG */
+
void
smp_send_reschedule (int cpu)
{
@@ -258,16 +280,6 @@
send_IPI_allbutself(IPI_FLUSH_TLB);
}
-void
-smp_resend_flush_tlb (void)
-{
- /*
- * Really need a null IPI but since this rarely should happen &
since this code
- * will go away, lets not add one.
- */
- send_IPI_allbutself(IPI_RESCHEDULE);
-}
-
#endif /* !CONFIG_ITANIUM_PTCG */
void
@@ -314,16 +326,18 @@
spin_lock_bh(&call_lock);
call_data = &data;
- resend:
send_IPI_single(cpuid, IPI_CALL_FUNC);
#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) ||
defined(CONFIG_ITANIUM_BSTEP_SPECIFIC))
/* Wait for response */
- timeout = jiffies + HZ;
+ again:
+ timeout = jiffies + 400000UL;
while ((atomic_read(&data.started) != cpus) && time_before(jiffies,
timeout))
barrier();
- if (atomic_read(&data.started) != cpus)
- goto resend;
+ if (atomic_read(&data.started) != cpus) {
+ resend_lost_IPI_single(cpuid);
+ goto again;
+ }
#else
/* Wait for response */
while (atomic_read(&data.started) != cpus)
@@ -379,17 +393,19 @@
spin_lock_bh(&call_lock);
call_data = &data;
- resend:
/* Send a message to all other CPUs and wait for them to respond */
send_IPI_allbutself(IPI_CALL_FUNC);
#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) ||
defined(CONFIG_ITANIUM_BSTEP_SPECIFIC))
/* Wait for response */
- timeout = jiffies + HZ;
+ again:
+ timeout = jiffies + 400000UL;
while ((atomic_read(&data.started) != cpus) && time_before(jiffies,
timeout))
barrier();
- if (atomic_read(&data.started) != cpus)
- goto resend;
+ if (atomic_read(&data.started) != cpus) {
+ resend_lost_IPI();
+ goto again;
+ }
#else
/* Wait for response */
while (atomic_read(&data.started) != cpus)
diff -Naur linux_base/arch/ia64/mm/tlb.c linux/arch/ia64/mm/tlb.c
--- linux_base/arch/ia64/mm/tlb.c Thu Jun 7 14:44:07 2001
+++ linux/arch/ia64/mm/tlb.c Thu Jun 7 14:46:07 2001
@@ -99,12 +99,12 @@
*/
#if defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) ||
defined(CONFIG_ITANIUM_BSTEP_SPECIFIC)
{
- extern void smp_resend_flush_tlb (void);
+ extern void smp_resend_lost_IPI (void);
unsigned long start = ia64_get_itc();
while (atomic_read(&flush_cpu_count) > 0) {
if ((ia64_get_itc() - start) > 400000UL) {
- smp_resend_flush_tlb();
+ smp_resend_lost_IPI();
start = ia64_get_itc();
}
}
--
Thanks
Jack Steiner (651-683-5302) (vnet 233-5302) steiner@sgi.com
_______________________________________________
Linux-IA64 mailing list
Linux-IA64@linuxia64.org
http://lists.linuxia64.org/lists/listinfo/linux-ia64
^ permalink raw reply [flat|nested] 4+ messages in thread
* RE: [Linux-ia64] 2.4.5 hangs in smp_call_function.
2001-06-07 22:00 [Linux-ia64] 2.4.5 hangs in smp_call_function Jack Steiner
2001-06-08 18:49 ` Seth, Rohit
@ 2001-06-08 20:03 ` David Mosberger
2001-06-12 6:37 ` root
2 siblings, 0 replies; 4+ messages in thread
From: David Mosberger @ 2001-06-08 20:03 UTC (permalink / raw)
To: linux-ia64
Actually, I'd like to remove support for anything older than B3 in the
not too distant future. I can leave the code there for a while
longer, but I'm considering it unsupported. Also, note that this
means that if you find a bug that shows up only on pre-B3 systems, I
don't care about it and don't want to hear about it... ;-)
--david
>>>>> On Fri, 8 Jun 2001 11:49:55 -0700, "Seth, Rohit" <rohit.seth@intel.com> said:
Rohit> Jack, The only change that is needed is to change the value
Rohit> of timeout from HZ to 400... The race condition where source
Rohit> processor is getting out of while (timeout) loop and target
Rohit> processor is just about to send the acknowledgement (by
Rohit> setting the started field) will most likely not happen for
Rohit> such a big timeout. And as this code is only for earlier
Rohit> stepping so it should be okay. Though I think that HZ itself
Rohit> is big enough but then seems like your system is under severe
Rohit> load and taking longer to respond for IPIs.
Rohit> Besides that the other changes are not needed. In fact there
Rohit> are couple of things that you should not do 1: don't include
Rohit> resend_list_IPI even when CONFIG_ITANIUM_PTCG is defined 2:
Rohit> don't just continue to be in while loop without resending the
Rohit> IPI again for the cases when you really have a lost IPI.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [Linux-ia64] 2.4.5 hangs in smp_call_function.
2001-06-07 22:00 [Linux-ia64] 2.4.5 hangs in smp_call_function Jack Steiner
2001-06-08 18:49 ` Seth, Rohit
2001-06-08 20:03 ` David Mosberger
@ 2001-06-12 6:37 ` root
2 siblings, 0 replies; 4+ messages in thread
From: root @ 2001-06-12 6:37 UTC (permalink / raw)
To: linux-ia64
>>>>> On Thu, 7 Jun 2001 17:00:11 -0500 (CDT), Jack Steiner <steiner@sgi.com> said:
Jack> Since upgrading to 2.4.5, we have seen several system hangs
Jack> where multiple cpus were spinning in smp_call_function.
Jack> The problem appears to be caused by the code in
Jack> smp_call_function() that resends an IPI if a timeout expires.
It turns out that the real problem here was that the IPI timeout
mechanism was enabled even for B3 step CPUs. The patch below fixes
this. Thanks to Jack for tracking this down.
--david
--- arch/ia64/kernel/smp.c~ Tue Jun 5 10:18:34 2001
+++ arch/ia64/kernel/smp.c Mon Jun 11 15:29:40 2001
@@ -285,7 +285,8 @@
{
struct call_data_struct data;
int cpus = 1;
-#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC))
+#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_B0_SPECIFIC) \
+ || defined(CONFIG_ITANIUM_B1_SPECIFIC) || defined(CONFIG_ITANIUM_B2_SPECIFIC))
unsigned long timeout;
#endif
@@ -307,7 +308,8 @@
resend:
send_IPI_single(cpuid, IPI_CALL_FUNC);
-#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC))
+#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_B0_SPECIFIC) \
+ || defined(CONFIG_ITANIUM_B1_SPECIFIC) || defined(CONFIG_ITANIUM_B2_SPECIFIC))
/* Wait for response */
timeout = jiffies + HZ;
while ((atomic_read(&data.started) != cpus) && time_before(jiffies, timeout))
@@ -352,7 +354,8 @@
{
struct call_data_struct data;
int cpus = smp_num_cpus-1;
-#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC))
+#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_B0_SPECIFIC) \
+ || defined(CONFIG_ITANIUM_B1_SPECIFIC) || defined(CONFIG_ITANIUM_B2_SPECIFIC))
unsigned long timeout;
#endif
@@ -373,7 +376,8 @@
/* Send a message to all other CPUs and wait for them to respond */
send_IPI_allbutself(IPI_CALL_FUNC);
-#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_BSTEP_SPECIFIC))
+#if (defined(CONFIG_ITANIUM_ASTEP_SPECIFIC) || defined(CONFIG_ITANIUM_B0_SPECIFIC) \
+ || defined(CONFIG_ITANIUM_B1_SPECIFIC) || defined(CONFIG_ITANIUM_B2_SPECIFIC))
/* Wait for response */
timeout = jiffies + HZ;
while ((atomic_read(&data.started) != cpus) && time_before(jiffies, timeout))
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2001-06-12 6:37 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-06-07 22:00 [Linux-ia64] 2.4.5 hangs in smp_call_function Jack Steiner
2001-06-08 18:49 ` Seth, Rohit
2001-06-08 20:03 ` David Mosberger
2001-06-12 6:37 ` root
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox