dcache BUG()

linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed

* dcache BUG()
@ 2001-05-07 17:21 Brian Kuschak
  2001-05-07 20:58 ` Dan Malek
  0 siblings, 1 reply; 57+ messages in thread
From: Brian Kuschak @ 2001-05-07 17:21 UTC (permalink / raw)
  To: 'linuxppc-embedded@lists.linuxppc.org'

I posted this message to linux-kernel too, but maybe someone here has seen
this, or can duplicate this problem...

Running snmpd or httpd overnight causes this oops: (kernel BUG at
/home/brian/linux/include/linux/dcache.h:251! - in dget() called from
d_alloc()).  Occasionally I see: de_put: entry net already free! before the
oops.

I've been able to reliably reproduce the problem in 15 minutes by running
this instead:
while /bin/true; do cat /proc/net/* 2>/dev/null > /tmp/junk; done;

The dget() fails when trying to open /proc/net/tcp, for example, and finds
that net has a zero dentry->d_count.

Montavista 2.4.2, patched to 2.4.3 on a 405GP, with root fs on ramdisk. Any
ideas on why this is happening? The system is stable otherwise.

Thanks, Brian

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-07 19:04 Eli Chen
  2001-05-07 21:04 ` Dan Malek
  2001-05-07 21:17 ` Dan Malek
  0 siblings, 2 replies; 57+ messages in thread
From: Eli Chen @ 2001-05-07 19:04 UTC (permalink / raw)
  To: brian.kuschak; +Cc: linuxppc-embedded


Brian,

I have also seen the dcache BUG, as well as bugs and warnings from other
parts of the kernel in the MontaVista 2.4.0 kernel.  They all seem to be
related to an inconsistency with reference counters, which led me to suspect
a problem with atomic instructions in our kernel.  I have replaced the
lwarx/stwcx. pairs in include/asm-ppc/atomic.h with code that just turns off
and on interrupts, and that seemed to have made the error messages and BUGs
disappear.  Please try this patch and see if you still have the same
problems.  This is really just a work around for us until we find out what
is the real problem.

thanks,
Eli

diff -c -r1.1.1.2 atomic.h
*** atomic.h 2001/02/21 00:53:16 1.1.1.2
--- atomic.h 2001/05/07 18:35:10
***************
*** 5,10 ****
--- 5,25 ----
  #ifndef _ASM_PPC_ATOMIC_H_
  #define _ASM_PPC_ATOMIC_H_

+ struct int_control_struct
+ {
+         void (*int_cli)(void);
+         void (*int_sti)(void);
+         void (*int_restore_flags)(unsigned long);
+         void (*int_save_flags)(unsigned long *);
+         void (*int_set_lost)(unsigned long);
+ };
+
+ extern struct int_control_struct int_control;
+ #define __cli() int_control.int_cli()
+ #define __sti() int_control.int_sti()
+ #define __save_flags(flags) int_control.int_save_flags((unsigned long
*)&flags)
+ #define __restore_flags(flags) int_control.int_restore_flags((unsigned
long)flags)
+
  typedef struct { volatile int counter; } atomic_t;

  #define ATOMIC_INIT(i) { (i) }
***************
*** 17,80 ****

  static __inline__ int atomic_add_return(int a, atomic_t *v)
  {
!  int t;

!  __asm__ __volatile__("\n\
! 1: lwarx %0,0,%3\n\
!  add %0,%2,%0\n\
!  stwcx. %0,0,%3\n\
!  bne- 1b"
!  : "=&r" (t), "=m" (v->counter)
!  : "r" (a), "r" (v), "m" (v->counter)
!  : "cc");

   return t;
  }

  static __inline__ int atomic_sub_return(int a, atomic_t *v)
  {
!  int t;

!  __asm__ __volatile__("\n\
! 1: lwarx %0,0,%3\n\
!  subf %0,%2,%0\n\
!  stwcx. %0,0,%3\n\
!  bne- 1b"
!  : "=&r" (t), "=m" (v->counter)
!  : "r" (a), "r" (v), "m" (v->counter)
!  : "cc");

   return t;
  }

  static __inline__ int atomic_inc_return(atomic_t *v)
  {
!  int t;

!  __asm__ __volatile__("\n\
! 1: lwarx %0,0,%2\n\
!  addic %0,%0,1\n\
!  stwcx. %0,0,%2\n\
!  bne- 1b"
!  : "=&r" (t), "=m" (v->counter)
!  : "r" (v), "m" (v->counter)
!  : "cc");

   return t;
  }

  static __inline__ int atomic_dec_return(atomic_t *v)
  {
!  int t;

!  __asm__ __volatile__("\n\
! 1: lwarx %0,0,%2\n\
!  addic %0,%0,-1\n\
!  stwcx. %0,0,%2\n\
!  bne 1b"
!  : "=&r" (t), "=m" (v->counter)
!  : "r" (v), "m" (v->counter)
!  : "cc");

   return t;
  }
--- 32,91 ----

  static __inline__ int atomic_add_return(int a, atomic_t *v)
  {
!  int flags, t;
!
!  __save_flags(flags);__cli();

!  t = (v)->counter;
!  t = t + a;
!  (v)->counter = t;

+  __restore_flags(flags);
+
   return t;
  }

  static __inline__ int atomic_sub_return(int a, atomic_t *v)
  {
!  int flags, t;
!
!  __save_flags(flags);__cli();

!  t = (v)->counter;
!  t = t - a;
!  (v)->counter = t;

+  __restore_flags(flags);
+
   return t;
  }

  static __inline__ int atomic_inc_return(atomic_t *v)
  {
!  int flags, t;
!
!  __save_flags(flags);__cli();
!
!  t = (v)->counter;
!  t = t + 1;
!  (v)->counter = t;

!  __restore_flags(flags);

   return t;
  }

  static __inline__ int atomic_dec_return(atomic_t *v)
  {
!  int flags, t;
!
!  __save_flags(flags);__cli();
!
!  t = (v)->counter;
!  t = t - 1;
!  (v)->counter = t;

!  __restore_flags(flags);

   return t;
  }
diff -c -r1.1.1.2 hw_irq.h
*** hw_irq.h 2001/02/21 00:53:16 1.1.1.2
--- hw_irq.h 2001/05/07 18:35:27
***************
*** 7,12 ****
--- 7,13 ----
  #ifndef _PPC_HW_IRQ_H
  #define _PPC_HW_IRQ_H

+ #ifndef _ASM_PPC_ATOMIC_H_
  struct int_control_struct
  {
   void (*int_cli)(void);
***************
*** 15,20 ****
--- 16,23 ----
   void (*int_save_flags)(unsigned long *);
   void (*int_set_lost)(unsigned long);
  };
+ #endif
+
  extern struct int_control_struct int_control;
  extern unsigned long timer_interrupt_intercept;
  extern unsigned long do_IRQ_intercept;


** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 17:21 Brian Kuschak
@ 2001-05-07 20:58 ` Dan Malek
  0 siblings, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-07 20:58 UTC (permalink / raw)
  To: Brian Kuschak; +Cc: 'linuxppc-embedded@lists.linuxppc.org'

Brian Kuschak wrote:

> Montavista 2.4.2, patched to 2.4.3 on a 405GP,

Ummm...Just what is a "MontaVista 2.4.2" kernel, and what did
you do to "patched to 2.4.3"?

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 19:04 dcache BUG() Eli Chen
@ 2001-05-07 21:04 ` Dan Malek
  2001-05-07 21:17 ` Dan Malek
  1 sibling, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-07 21:04 UTC (permalink / raw)
  To: Eli Chen; +Cc: brian.kuschak, linuxppc-embedded

Eli Chen wrote:

> I have also seen the dcache BUG, as well as bugs and warnings from other
> parts of the kernel in the MontaVista 2.4.0 kernel.

Again, I don't know what a "MontaVista 2.4.0" kernel would be.
MontaVista clearly names our software distributions and releases
them on a CD after a QA process.  If you can duplicate this
problem with the software that is on the CD, and then use the
proper reference name, we could all use the same baseline.  Of
course, it would be nicer if you would have purchased the subscription
with the CD and called your dedicated technical representative,
but we'll provide the best free help available now :-).

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 19:04 dcache BUG() Eli Chen
  2001-05-07 21:04 ` Dan Malek
@ 2001-05-07 21:17 ` Dan Malek
  2001-05-07 21:30   ` Tom Rini
  2001-05-07 21:47   ` Eli Chen
  1 sibling, 2 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-07 21:17 UTC (permalink / raw)
  To: Eli Chen; +Cc: brian.kuschak, linuxppc-embedded

Heh....the best free help wasn't available when I wrote that last
message, so I guess I'll take a stab at an answer :-).

Eli Chen wrote:

> ..........  They all seem to be
> related to a inconsistency with reference counters, which led me to suspect
> a problem with atomic instructions in our kernel.

That's an interesting piece of information......

There have been problems with the directory entry counters in
older 2.3.99/2.4 kernels.  The 4xx development has kind of been on
a planet all by its lonesome for a long time, and I am now trying
to bring it back into the mainstream.  It could very well be there
were some generic kernel bug fixes that were missed in the 4xx
kernel.

> ..... I have replaced the
> lwarx/stcrx pairs in include/asm-ppc/atomic.h with code that just turns off
> and on interrupts,

What version of silicon do you have, and what platform are you using?

> ...  This is really just a work around for us until we find out what
> is the real problem.

Is there some simple test I can use to trigger this problem?  It
would be nice if you could try a "newer" kernel from FSM Labs.  This
was originally in the linuxppc_2_5 tree, and we are merging/changing
trees at the moment.  I don't remember the URL......TOM, can you
provide some insight?

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 21:17 ` Dan Malek
@ 2001-05-07 21:30   ` Tom Rini
  2001-05-07 23:03     ` Dan Malek
  2001-05-07 21:47   ` Eli Chen
  1 sibling, 1 reply; 57+ messages in thread
From: Tom Rini @ 2001-05-07 21:30 UTC (permalink / raw)
  To: Dan Malek; +Cc: Eli Chen, brian.kuschak, linuxppc-embedded

On Mon, May 07, 2001 at 05:17:16PM -0400, Dan Malek wrote:

> > ...  This is really just a work around for us until we find out what
> > is the real problem.
>
> Is there some simple test I can use to trigger this problem?  It
> would be nice if you could try a "newer" kernel from FSM Labs.  This
> was originally in the linuxppc_2_5 tree, and we are merging/changing
> trees at the moment.  I don't remember the URL......TOM, can you
> provide some insight?

Er, 2_5 isn't unavailable yet.  bk://source.mvista.com/linuxppc_2_5 (mirror)
or bk://bitkeeper.fsmlabs.com:5005 (master).  But 2_5 is 'dead' in the sense
that stuff is being moved to a new tree.  The new tree should be a lot more
populated (with new 6xx/7xx/74xx, 8xx, 82xx and some 4xx) stuff after my
finals (And I've got 2 tomorrow, so adios. :))

--
Tom Rini (TR1265)
http://gate.crashing.org/~trini/

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 21:17 ` Dan Malek
  2001-05-07 21:30   ` Tom Rini
@ 2001-05-07 21:47   ` Eli Chen
  2001-05-07 23:01     ` Dan Malek
  2001-05-07 23:06     ` Gabriel Paubert
  1 sibling, 2 replies; 57+ messages in thread
From: Eli Chen @ 2001-05-07 21:47 UTC (permalink / raw)
  To: Dan Malek; +Cc: brian.kuschak, linuxppc-embedded

> What version of silicon do you have, and what platform are you using?

I am using the 405GP core, rev D.  My tree is based off of the February 26th
source from MontaVista.

> Is there some simple test I can use to trigger this problem?

Besides Brian's one-liner test, you can try flood pinging your 405GP.  I
have been consistently receiving these error messages after letting it run a
while:

Freeing alive device (cxxxxxxx), ethx

and

Attempt to release alive inet socket cxxxxxxx

I have also occasionally received other messages, which I have yet to receive
after changing atomic.h.

Eli

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-07 22:19 Brian Kuschak
  2001-05-07 22:35 ` Cort Dougan
  2001-05-07 22:43 ` Eli Chen
  0 siblings, 2 replies; 57+ messages in thread
From: Brian Kuschak @ 2001-05-07 22:19 UTC (permalink / raw)
  To: 'Dan Malek', Eli Chen; +Cc: linuxppc-embedded

Dan,

Just to be clear, we do have several active subscriptions with MontaVista,
and have contacted them numerous times in the past month about this issue.
Unfortunately they were unable to provide a solution.  This kernel was the
latest that MontaVista had for the 4xx as of 3/7/2001.  This was the 2.4.2
kernel.  I applied the 2.4.3 patch to bring it up to date with the latest
bugfixes.  The version as reported in /proc/version:  Linux version
2.4.3-mvista_010303.

I looked at the atomic.h code and think I found one bug, however it doesn't
solve my problem.  See if this patch makes sense to you.  Looks like a typo
when the atomic_* functions were converted from assembler to inline.  The
bne should be 'bne-' for the atomic decrement operation.

Regards,
Brian

*** atomic.h    2001/02/26 19:07:19     1.1.1.2
--- atomic.h    2001/05/07 21:13:54
***************
*** 86,94 ****

        __asm__ __volatile__("\n\
  1:    lwarx   %0,0,%2\n\
        addic   %0,%0,-1\n\
        stwcx.  %0,0,%2\n\
!       bne     1b"
        : "=&r" (t), "=m" (v->counter)
        : "r" (v), "m" (v->counter)
        : "cc");
--- 88,96 ----

        __asm__ __volatile__("\n\
  1:    lwarx   %0,0,%2\n\
        addic   %0,%0,-1\n\
        stwcx.  %0,0,%2\n\
!       bne-    1b"
        : "=&r" (t), "=m" (v->counter)
        : "r" (v), "m" (v->counter)
        : "cc");

-----Original Message-----
From: Dan Malek [mailto:dan@mvista.com]
Sent: Monday, May 07, 2001 2:05 PM
To: Eli Chen
Cc: Brian Kuschak; linuxppc-embedded@lists.linuxppc.org
Subject: Re: dcache BUG()

Eli Chen wrote:

> I have also seen the dcache BUG, as well as bugs and warnings from other
> parts of the kernel in the MontaVista 2.4.0 kernel.

Again, I don't know what a "MontaVista 2.4.0" kernel would be.
MontaVista clearly names our software distributions and releases
them on a CD after a QA process.  If you can duplicate this
problem with the software that is on the CD, and then use the
proper reference name, we could all use the same baseline.  Of
course, it would be nicer if you would have purchased the subscription
with the CD and called your dedicated technical representative,
but we'll provide the best free help available now :-).

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 22:19 Brian Kuschak
@ 2001-05-07 22:35 ` Cort Dougan
  2001-05-07 22:43 ` Eli Chen
  1 sibling, 0 replies; 57+ messages in thread
From: Cort Dougan @ 2001-05-07 22:35 UTC (permalink / raw)
  To: Brian Kuschak; +Cc: 'Dan Malek', Eli Chen, linuxppc-embedded


Can I get a tar file of your kernel tree that exhibits this problem?

} Just to be clear, we do have several active subscriptions with MontaVista,
} and have contacted them numerous times in the past month about this issue.
} Unfortunately they were unable to provide a solution.  This kernel was the
} latest that MontaVista had for the 4xx as of 3/7/2001.  This was the 2.4.2
} kernel.  I applied the 2.4.3 patch to bring it up to date with the latest
} bugfixes.  The version as report in /proc/version:  Linux version
} 2.4.3-mvista_010303.
}
} I looked at the atomic.h code and think I found one bug, however it doesn't
} solve my problem.  See if this patch makes sense to you.  Looks like a typo
} when the atomic_* functions were converted from assembler to inline.  The
} bne should be 'bne-' for the atomic decrement operation.
}
} Regards,
} Brian
}
} *** atomic.h    2001/02/26 19:07:19     1.1.1.2
} --- atomic.h    2001/05/07 21:13:54
} ***************
} *** 86,94 ****
}
}         __asm__ __volatile__("\n\
}   1:    lwarx   %0,0,%2\n\
}         addic   %0,%0,-1\n\
}         stwcx.  %0,0,%2\n\
} !       bne     1b"
}         : "=&r" (t), "=m" (v->counter)
}         : "r" (v), "m" (v->counter)
}         : "cc");
} --- 88,96 ----
}
}         __asm__ __volatile__("\n\
}   1:    lwarx   %0,0,%2\n\
}         addic   %0,%0,-1\n\
}         stwcx.  %0,0,%2\n\
} !       bne-    1b"
}         : "=&r" (t), "=m" (v->counter)
}         : "r" (v), "m" (v->counter)
}         : "cc");
}
}
}
} -----Original Message-----
} From: Dan Malek [mailto:dan@mvista.com]
} Sent: Monday, May 07, 2001 2:05 PM
} To: Eli Chen
} Cc: Brian Kuschak; linuxppc-embedded@lists.linuxppc.org
} Subject: Re: dcache BUG()
}
}
} Eli Chen wrote:
}
} > I have also seen the dcache BUG, as well as bugs and warnings from other
} > parts of the kernel in the MontaVista 2.4.0 kernel.
}
} Again, I don't know what a "MontaVista 2.4.0" kernel would be.
} MontaVista clearly names our software distributions and releases
} them on a CD after a QA process.  If you can duplicate this
} problem with the software that is on the CD, and then use the
} proper reference name, we could all use the same baseline.  Of
} course, it would be nicer if you would have purchased the subscription
} with the CD and called your dedicated technical representative,
} but we'll provide the best free help available now :-).
}
}
}
} 	-- Dan
}

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 22:19 Brian Kuschak
  2001-05-07 22:35 ` Cort Dougan
@ 2001-05-07 22:43 ` Eli Chen
  1 sibling, 0 replies; 57+ messages in thread
From: Eli Chen @ 2001-05-07 22:43 UTC (permalink / raw)
  To: Brian Kuschak, 'Dan Malek'; +Cc: linuxppc-embedded

> I looked at the atomic.h code and think I found one bug, however it
doesn't
> solve my problem.  See if this patch makes sense to you.  Looks like a
typo
> when the atomic_* functions were converted from assembler to inline.  The
> bne should be 'bne-' for the atomic decrement operation.

The "-" after bne is just a hint for the branch prediction mechanism I
believe (the 5th bit in the BO operand).  I couldn't figure out why every
other atomic_* function besides atomic decrement has "bne-", but that
shouldn't affect the behavior.  I have also tried changing it to "bne-", but
it has no effect.

Eli

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 21:47   ` Eli Chen
@ 2001-05-07 23:01     ` Dan Malek
  2001-05-07 23:06     ` Gabriel Paubert
  1 sibling, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-07 23:01 UTC (permalink / raw)
  To: Eli Chen; +Cc: brian.kuschak, linuxppc-embedded

Eli Chen wrote:

> I have also occasionaly received other messages, which I have yet to receive
> after changing atomic.h.

I dunno..........The code can't be broken because it is used
everyday on bazillions of processors.....I can't believe these
instructions are "broken" on the 4xx because that would be too
obvious as well.  I suspect all you are doing is changing some
code timing and masking the real problem.  I'm not saying we
shouldn't fix it, just that it isn't likely the problem is here.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-07 23:01 Brian Kuschak
  0 siblings, 0 replies; 57+ messages in thread
From: Brian Kuschak @ 2001-05-07 23:01 UTC (permalink / raw)
  To: 'Dan Malek'; +Cc: linuxppc-embedded

If I use the 'old' atomic functions from arch/ppc/kernel/misc.S instead of
the inline functions in atomic.h, the problem doesn't seem to happen.
Normally it happens in about 15 minutes, but I've been running over an hour
now and haven't seen it.

Perhaps there is something else wrong with the __asm__ macros in atomic.h ??

-Brian

-----Original Message-----
From: Eli Chen [mailto:eli@routefree.com]
Sent: Monday, May 07, 2001 3:43 PM
To: Brian Kuschak; 'Dan Malek'
Cc: linuxppc-embedded@lists.linuxppc.org
Subject: Re: dcache BUG()

> I looked at the atomic.h code and think I found one bug, however it
doesn't
> solve my problem.  See if this patch makes sense to you.  Looks like a
typo
> when the atomic_* functions were converted from assembler to inline.  The
> bne should be 'bne-' for the atomic decrement operation.

The "-" after bne is just a hint for the branch prediction mechanism I
believe (the 5th bit in the BO operand).  I couldn't figure out why every
other atomic_* function besides atomic decrement has "bne-", but that
shouldn't affect the behavior.  I have also tried changing it to "bne-", but
it has no effect.

Eli

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 21:30   ` Tom Rini
@ 2001-05-07 23:03     ` Dan Malek
  0 siblings, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-07 23:03 UTC (permalink / raw)
  To: Tom Rini; +Cc: Eli Chen, brian.kuschak, linuxppc-embedded

Tom Rini wrote:

> Er, 2_5 isn't unavailable yet.

Cool, thanks....If you guys could just give that one a test drive,
it would be helpful.  I've lost my Rev. D for a few days on another
project, but will start testing again soon.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 21:47   ` Eli Chen
  2001-05-07 23:01     ` Dan Malek
@ 2001-05-07 23:06     ` Gabriel Paubert
  2001-05-07 23:15       ` Dan Malek
  1 sibling, 1 reply; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-07 23:06 UTC (permalink / raw)
  To: Eli Chen; +Cc: Dan Malek, brian.kuschak, linuxppc-embedded

On Mon, 7 May 2001, Eli Chen wrote:

>
> > What version of silicon do you have, and what platform are you using?
>
> I am using the 405GP core, rev D.  My tree is based off of the February 26th
> source from MontaVista.
>
> > Is there some simple test I can use to trigger this problem?
>
> Besides Brian's one-liner test, you can try flood pinging your 405GP.  I
> have been consistently receiving these error messages after letting it run a
> while:
>
> Freeing alive device (cxxxxxxx), ethx
>
> and
>
> Attempt to release alive inet socket cxxxxxxx
>
> I have also occasionaly received other messages, which I have yet to receive
> after changing atomic.h.

Hmm, consider what happens if a down_trylock in an interrupt handler
fails. Actually dec_if_positive will leave a dangling reservation, since
it will skip the stwcx. instruction.

I had not looked at the code for very long, so I may miss something or be
completely wrong but I see a stwcx. instruction in transfer_to_handler
which I think is useless since the handler will always execute a lwarx
before attempting a stwcx., thereby making the state of the reservation at
the interrupt entry irrelevant.

On the other hand, when an interrupt handler does a down_trylock and
then returns because it failed, it will leave the reservation active
until returning to the caller (once again if I did not miss anything in the
return path).

So the sequence of events which can cause corruption is the following:

1) lwarx atomic_var,
   reservation set
2) interrupt taken,
   reservation set
3) stwcx. in interrupt prologue (transfer_to_handler),
    reservation cleared
4) interrupt handler executes, talks to hardware
5) interrupt handler modifies atomic_var,
   reservation set and cleared (hence step 3 was not necessary)
6) down_trylock() fails,
   reservation set
8) interrupt handler returns,
   reservation still set
9) interrupt epilogue restores state and returns between lwarx and stwcx.,
   reservation is still set!
10) stwcx. atomic_var, succeeds, but the variable has been modified in the
   meantime, chaos ensues

In short, I think that step 3) should be moved to the epilogue(s),
ret_from_intercept, etc... Note that spin_trylock() could produce the same
effect in step 6), but it's SMP only.

What do you think, am I completely off base ?

I try to avoid looking at entry.S/head.S/misc.S and had not done it for a
long time since I think it's dangerous for my mental health, so, once
again, I might be completely wrong.

So I'd suggest the following one liner:

===== arch/ppc/kernel/entry.S 1.7 vs edited =====
--- 1.7/arch/ppc/kernel/entry.S	Fri Apr 13 20:44:42 2001
+++ edited/arch/ppc/kernel/entry.S	Tue May  8 01:02:47 2001
@@ -382,6 +382,7 @@
 	CLR_TOP32(r8)
 	mtspr	SPRG2,r8		/* phys exception stack pointer */
 1:
+	stwcx.	r0,0,r1			/* Clear reservation - Gabriel. */
 	lwz	r3,_CTR(r1)
 	lwz	r0,_LINK(r1)
 	mtctr	r3

You can also try to remove the

	li r22,RESULT
	stwcx. r22,r22,r21

or similar lines in head.S, head_4xx.S, and head_8xx.S to check that my
theory is correct.

	Regards,
	Gabriel.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 23:06     ` Gabriel Paubert
@ 2001-05-07 23:15       ` Dan Malek
  2001-05-07 23:28         ` Gabriel Paubert
  2001-05-07 23:35         ` Eli Chen
  0 siblings, 2 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-07 23:15 UTC (permalink / raw)
  To: Gabriel Paubert; +Cc: Eli Chen, brian.kuschak, linuxppc-embedded


Gabriel Paubert wrote:

> 10) swtcx. atomic_var, succeeds, but the variable has been modified in the
>    meantime, chaos ensues

How can this happen?  The reservation for the lwarx in 1) has
long been broken, so this stwcx. will fail.....


	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 23:15       ` Dan Malek
@ 2001-05-07 23:28         ` Gabriel Paubert
  2001-05-07 23:35         ` Eli Chen
  1 sibling, 0 replies; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-07 23:28 UTC (permalink / raw)
  To: Dan Malek; +Cc: Eli Chen, brian.kuschak, linuxppc-embedded

On Mon, 7 May 2001, Dan Malek wrote:

> Gabriel Paubert wrote:
>
> > 10) swtcx. atomic_var, succeeds, but the variable has been modified in the
> >    meantime, chaos ensues
>
> How can this happen?  The reservation for the lwarx in 1) has
> long been broken, so this swtcx. will fail.....

Because the reservation has been set by the interrupt handler. Read the
scenario again, it's not the reservation from step 1), it's been cleared
twice, it's the stale reservation from an unmatched lwarx from step 6) in
a failed down_trylock() in the interrupt handler.

Duh, I should go to bed, there is no step 7) in my mail :-)

What I claim is that the stwcx. in transfer_to_handler is useless (but
harmless) but we should always clear the reservation before an rfi in case
a dangling reservation was left by the handler and the return address of
the rfi is between a lwarx and its matching stwcx.

	Regards,
	Gabriel.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 23:15       ` Dan Malek
  2001-05-07 23:28         ` Gabriel Paubert
@ 2001-05-07 23:35         ` Eli Chen
  2001-05-07 23:36           ` Dan Malek
  2001-05-07 23:40           ` Gabriel Paubert
  1 sibling, 2 replies; 57+ messages in thread
From: Eli Chen @ 2001-05-07 23:35 UTC (permalink / raw)
  To: Dan Malek, Gabriel Paubert; +Cc: brian.kuschak, linuxppc-embedded


> How can this happen?  The reservation for the lwarx in 1) has
> long been broken, so this swtcx. will fail.....
>
> -- Dan

Because reservation is held per processor in the "Reservation bit", and it
doesn't seem like the 405GP checks the reservation address.

From the PPC manual:
"Because the hardware doesn't compare reservation address when executing the
stwcx. instruction, operating systems software MUST reset the reservation if
an exception or other types of interrupt occurs to insure atomic memory
references of lwarx and stwcx. pairs."

-eli


** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 23:35         ` Eli Chen
@ 2001-05-07 23:36           ` Dan Malek
  2001-05-08  0:16             ` Eli Chen
  2001-05-08  1:37             ` Gabriel Paubert
  2001-05-07 23:40           ` Gabriel Paubert
  1 sibling, 2 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-07 23:36 UTC (permalink / raw)
  To: Eli Chen; +Cc: Gabriel Paubert, brian.kuschak, linuxppc-embedded

Eli Chen wrote:

> >From the PPC manual:
> "Because the hardware doesn't compare reservation address when executing the
> stwcx.

F**K...that's what I was looking for.  What manual is that in?
Everything I have handy (older UISA books), state the granularity
is implementation dependent.  I couldn't find any 4xx manual that
stated the granularity of the reservation.  I thought 6xx/7xx at
least checked cache line granularity in addition to a single
reservation bit.

Gabriel is right................

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 23:35         ` Eli Chen
  2001-05-07 23:36           ` Dan Malek
@ 2001-05-07 23:40           ` Gabriel Paubert
  1 sibling, 0 replies; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-07 23:40 UTC (permalink / raw)
  To: Eli Chen; +Cc: Dan Malek, brian.kuschak, linuxppc-embedded

On Mon, 7 May 2001, Eli Chen wrote:

> > How can this happen?  The reservation for the lwarx in 1) has
> > long been broken, so this swtcx. will fail.....
> >
> > -- Dan
>
> Because reservation is held per processor in the "Reservation bit", and it
> doesn't seem like the 405GP checks the reservation address.

I don't think any PPC checks the reservation address on stwcx.; the
reservation address is checked on snoops to clear the reservation
bit, but having the reservation bit set is a necessary and
sufficient condition for stwcx. to actually perform the store.

>
> >From the PPC manual:
> "Because the hardware doesn't compare reservation address when executing the
> stwcx. instruction, operating systems software MUST reset the reservation if
> an exception or other types of interrupt occurs to insure atomic memory
> references of lwarx and stwcx. pairs."

Indeed, but as I said this means that you have to clear the reservation on
_return_ from an interrupt in case a dangling reservation is left.
Clearing the reservation on entry is not necessary since the interrupt
handler will never execute a stwcx. without an earlier lwarx.

	Gabriel.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 23:36           ` Dan Malek
@ 2001-05-08  0:16             ` Eli Chen
  2001-05-08  0:41               ` Dan Malek
  2001-05-08  1:37             ` Gabriel Paubert
  1 sibling, 1 reply; 57+ messages in thread
From: Eli Chen @ 2001-05-08  0:16 UTC (permalink / raw)
  To: Dan Malek; +Cc: Gabriel Paubert, brian.kuschak, linuxppc-embedded

> F**K...that's what I was looking for.  What manual is that in?
> Everything I have handy (older UISA books), state the granularity
> is implementation dependent.  I couldn't find any 4xx manual that
> stated the granularity of the reservation.  I thought 6xx/7xx at
> least checked cache line granularity in addition to a single
> reservation bit.

The book is titled "PowerPC Microprocessor Family: The Programming
Environments".  It's greenish-blue, dated 3/21/2000.  The quote is from the
stwcx. instruction description.  In section 5-4 however, it has this note:

"When a reservation is made to a word in memory by the lwarx instruction, an
address is saved and a reservation is set.  Both of these are necessary for
the memory coherence mechanism, however, some processors do not implement
the address compare for the stwcx. instruction.  Only the reservation need
be established in order of the stwcx. to be successful.  This requires that
exception handlers clear reservations if control is passed to another
program.  Programmers should read the specifications for each individual
processor."

I searched through the 405GP user manual, and it makes no mention of whether it
checks the reservation address or not, just like you said.

Eli

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-08  0:40 Brian Kuschak
  0 siblings, 0 replies; 57+ messages in thread
From: Brian Kuschak @ 2001-05-08  0:40 UTC (permalink / raw)
  To: 'Eli Chen', Dan Malek; +Cc: Gabriel Paubert, linuxppc-embedded

The PPC 405GP manual (Aug 2000) from IBM's website states this in Table A1 -
Instruction Syntax Summary:

stwcx.	RS, RA, RB
Store word (RS) in memory at EA=(RA|0) + (RB) only if reservation bit is
set.
if RESERVE = 1 then
	MS(EA,4) <- (RS)
	RESERVE <- 0
	(CR[CR0]) <- 0 || 1 || XERso
else
	(CR[CR0]) <- 0 || 1 || XERso

Looks like the reservation address is not used for the 405.
-Brian

-----Original Message-----
From: Eli Chen [mailto:eli@routefree.com]
Sent: Monday, May 07, 2001 5:16 PM
To: Dan Malek
Cc: Gabriel Paubert; Brian Kuschak; linuxppc-embedded@lists.linuxppc.org
Subject: Re: dcache BUG()

> F**K...that's what I was looking for.  What manual is that in?
> Everything I have handy (older UISA books), state the granularity
> is implementation dependent.  I couldn't find any 4xx manual that
> stated the granularity of the reservation.  I thought 6xx/7xx at
> least checked cache line granularity in addition to a single
> reservation bit.

The book is titled "PowerPC Microprocessor Family: The Programming
Environments".  It's greenish-blue, dated 3/21/2000.  The quote is from the
stwcx. instruction description.  In section 5-4 however, it has this note:

"When a reservation is made to a word in memory by the lwarx instruction, an
address is saved and a reservation is set.  Both of these are necessary for
the memory coherence mechanism, however, some processors do not implement
the address compare for the stwcx. instruction.  Only the reservation need
be established in order of the stwcx. to be successful.  This requires that
exception handlers clear reservations if control is passed to another
program.  Programmers should read the specifications for each individual
processor."

I searched through the 405GP user manual, and it makes no mention of if it
checks the reservation address or not, just like you said.

Eli

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08  0:16             ` Eli Chen
@ 2001-05-08  0:41               ` Dan Malek
  2001-05-08  1:14                 ` Eli Chen
  0 siblings, 1 reply; 57+ messages in thread
From: Dan Malek @ 2001-05-08  0:41 UTC (permalink / raw)
  To: Eli Chen; +Cc: Gabriel Paubert, brian.kuschak, linuxppc-embedded

Eli Chen wrote:

> The book is titled "PowerPC Microprocessor Family:....

I left that one at home today.

> I searched through the 405GP user manual, and it makes no mention of if it
> checks the reservation address or not, just like you said.

I grabbed my stack of Motorola manuals.  All of the 6xx/7xx/8xx manuals
have a UISA section that describes the behavior of all implementation
dependent instructions.  The lwarx/stwcx behavior is clearly defined,
along with either 16 or 32 word granularity.......I have Gabriel's
patch floating around on a couple of systems for testing.  If you
try it, let me know the results please.

Thanks.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08  1:14                 ` Eli Chen
@ 2001-05-08  1:11                   ` Dan Malek
  2001-05-08 18:01                     ` David Blythe
  0 siblings, 1 reply; 57+ messages in thread
From: Dan Malek @ 2001-05-08  1:11 UTC (permalink / raw)
  To: Eli Chen; +Cc: Gabriel Paubert, brian.kuschak, linuxppc-embedded

Eli Chen wrote:

> .... Perhaps there are other places that returns
> from an interrupt?

No, that's a common return path that should catch all cases.
There may be something else wrong with the Ethernet driver itself.
When I updated it to the 2.4_devel baseline, there were some weird
cache management calls that didn't make sense.  My updates were to
use the standard non-coherent cache management functions, and I
changed the logic to make sense (to me :-).  From this quick update,
I noticed it would be nice to make the transmit more efficient
and higher performance by handling multiple frames, but it should
function properly.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08  0:41               ` Dan Malek
@ 2001-05-08  1:14                 ` Eli Chen
  2001-05-08  1:11                   ` Dan Malek
  0 siblings, 1 reply; 57+ messages in thread
From: Eli Chen @ 2001-05-08  1:14 UTC (permalink / raw)
  To: Dan Malek; +Cc: Gabriel Paubert, brian.kuschak, linuxppc-embedded

I've been running Gabriel's patch for a while now, and I'm still seeing the
"de_put: entry net already free!" messages from running Brian's test
(although seemingly not as often).

I do believe Gabriel is right about clearing the reservation before the rfi,
not in transfer_to_handler.  Perhaps there are other places that returns
from an interrupt?

-eli

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-07 23:36           ` Dan Malek
  2001-05-08  0:16             ` Eli Chen
@ 2001-05-08  1:37             ` Gabriel Paubert
  2001-05-08  1:44               ` Dan Malek
  1 sibling, 1 reply; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-08  1:37 UTC (permalink / raw)
  To: Dan Malek; +Cc: Eli Chen, brian.kuschak, linuxppc-embedded

On Mon, 7 May 2001, Dan Malek wrote:

> Eli Chen wrote:
>
> > >From the PPC manual:
> > "Because the hardware doesn't compare reservation address when executing the
> > stwcx.
>
> F**K...that's what I was looking for.  What manual is that in?
> Everything I have handy (older UISA books), state the granularity
> is implementation dependent.  I couldn't find any 4xx manual that
> stated the granularity of the reservation.  I thought 6xx/7xx at
> least checked cache line granularity in addition to a single
> reservation bit.

Nope. The reservation address register sits on the bus side for snoops,
and is at least on 601 and 603/603e only used to clear the reservation bit
in case of snoop hit. The pem makes it clear that address checking is
implementation dependent.

Anyway on SMP the following scenario:

1) processor 1:  sem=1, down(sem) interrupted between lwarx and stwcx.
   reservation set, sem=1 in RAM, value to store 0
2) processor 2: down(sem), sem = 0, clears reservation on processor 1
3) processor 1: interrupt handler ends in down_trylock(sem) which fails
   but sets reservation, sem = -1
4) processor 1: down(sem) finishes successfully and stores 0 since it
   misses the modification of step 2.
5) Now two processors access the data protected by the same semaphore,
   causing interferences, generally of the destructive kind. The value of
   the semaphore when nobody has acquired it is 2, it has become useless as
   an interlock mechanism.

does screw up with only using a single variable. I've not been able to
find such a scenario on UP, this does not mean it can't be built, however.

Fix: stwcx. at the return from interrupt, again.

> Gabriel is right................

I'm more and more convinced myself that this is really a bug, and
a rather serious one. Now the question is whether the patch
I suggested is correct or not. It does not fix Eli's problems, so
either my patch is wrong or something else is going on.

	Gabriel.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08  1:37             ` Gabriel Paubert
@ 2001-05-08  1:44               ` Dan Malek
  0 siblings, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-08  1:44 UTC (permalink / raw)
  To: Gabriel Paubert; +Cc: Eli Chen, brian.kuschak, linuxppc-embedded

Gabriel Paubert wrote:

> I'm more and more convinced myself that this is really a bug,

I'm convinced, too, so thanks for the suggestion.  I think it is
much more likely to show up on the 4xx because of my (assumed)
implementation than on other processors that attempt to make some
use of the EA in the instructions.

> .... It does not fix Eli's problems, so
> either my patch is wrong or something else is going on.

We have many people testing on a variety of systems, so we will
know more shortly.

Thanks again.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-08  1:53 Brian Kuschak
  2001-05-08  2:03 ` Dan Malek
  2001-05-08 11:59 ` Gabriel Paubert
  0 siblings, 2 replies; 57+ messages in thread
From: Brian Kuschak @ 2001-05-08  1:53 UTC (permalink / raw)
  To: 'Dan Malek', Gabriel Paubert; +Cc: Eli Chen, linuxppc-embedded

I've been running with Gabriel's suggested patch for about an hour now, and
I still see the problem.  It hasn't generated an Oops in dcache.h yet, but I
do still see the 'de_put: entry net already free!' messages.

I'll let it run overnight to see if the oops occurs.

Brian

-----Original Message-----
From: Dan Malek [mailto:dan@mvista.com]
Sent: Monday, May 07, 2001 6:44 PM
To: Gabriel Paubert
Cc: Eli Chen; Brian Kuschak; linuxppc-embedded@lists.linuxppc.org
Subject: Re: dcache BUG()

Gabriel Paubert wrote:

> I'm more and more convinced myself that this is really a bug,

I'm convinced, too, so thanks for the suggestion.  I think it is
much more likely to show up on the 4xx because of my (assumed)
implementation than on other processors that attempt to make some
use of the EA in the instructions.

> .... It does not fix Eli's problems, so
> either my patch is wrong or something else is going on.

We have many people testing on a variety of systems, so we will
know more shortly.

Thanks again.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08  1:53 Brian Kuschak
@ 2001-05-08  2:03 ` Dan Malek
  2001-05-08 11:59 ` Gabriel Paubert
  1 sibling, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-08  2:03 UTC (permalink / raw)
  To: Brian Kuschak; +Cc: Gabriel Paubert, Eli Chen, linuxppc-embedded

Brian Kuschak wrote:

> ..... but I
> do still see the 'de_put: entry net already free!' messages.

Yuk....I thought that was a network driver problem, but it is really
in the /proc fs stuff.

OK....we'll keep looking.  I still think there are some generic
file system bugs that have been fixed in later kernels....

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-08  3:36 Brian Kuschak
  0 siblings, 0 replies; 57+ messages in thread
From: Brian Kuschak @ 2001-05-08  3:36 UTC (permalink / raw)
  To: 'Dan Malek '
  Cc: 'Gabriel Paubert ', 'Eli Chen ',
	'linuxppc-embedded@lists.linuxppc.org '

Perhaps it is a filesystem bug, however it's printed when an atomic_read()
of a reference count is unexpectedly zero.  That, plus the fact that I
didn't see it after applying Eli's original patch makes me think we might
still have a problem with atomic operations.

Brian

-----Original Message-----
From: Dan Malek
To: Brian Kuschak
Cc: Gabriel Paubert; Eli Chen; linuxppc-embedded@lists.linuxppc.org
Sent: 5/7/01 7:03 PM
Subject: Re: dcache BUG()

Brian Kuschak wrote:

> ..... but I
> do still see the 'de_put: entry net already free!' messages.

Yuk....I thought that was a network driver problem, but it is really
in the /proc fs stuff.

OK....we'll keep looking.  I still think there are some generic
file system bugs that have been fixed in later kernels....

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
  2001-05-08  1:53 Brian Kuschak
  2001-05-08  2:03 ` Dan Malek
@ 2001-05-08 11:59 ` Gabriel Paubert
  1 sibling, 0 replies; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-08 11:59 UTC (permalink / raw)
  To: Brian Kuschak; +Cc: 'Dan Malek', Eli Chen, linuxppc-embedded

On Mon, 7 May 2001, Brian Kuschak wrote:

> I've been running with Gabriel's suggested patch for about an hour now, and
> I still see the problem.  It hasn't generated an Oops in dcache.h yet, but I
> do still see the 'de_put: entry net already free!' messages.

I think that we agree that my patch fixes a real bug, but that, sadly, it
is _not_ the bug you are seeing. Given the "prevent interrupts in atomic
operation" patch that apparently fixes the bug, I looked for possibilities
of stale reservations. Now your bug is even more puzzling...

BTW, I have finally found a scenario on UP which would fail even if stwcx.
checked for the address. It is somewhat contrived:

1) sem=-1, down(sem) interrupted between lwarx and stwcx.,
   reservation set, would set sem to -2
2) interrupt 1 does up(sem), sem=0, another interrupt becomes pending,
   while this handler is executed
3) interrupt 2 does down_trylock(sem), fails and returns,
   dangling reservation.
4) continues with the stwcx. of step 1) stores -2, missing the up(sem),
   goes to sleep. Anybody trying down(sem) probably goes zombie...

I'm not sure that such a made-up example might actually happen in the
kernel. You need a heavily contended semaphore which interrupts or
bottom-halves also try to lock. But that's not the point, it just shows
that checking the address in stwcx. does not actually help much.

> I'll let it run overnight to see if the oops occurs.

Can we get an oops trace if you hit it ? For now I'm puzzled.

	Regards,
	Gabriel.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-08 15:43 Brian Kuschak
  0 siblings, 0 replies; 57+ messages in thread
From: Brian Kuschak @ 2001-05-08 15:43 UTC (permalink / raw)
  To: 'Dan Malek', Gabriel Paubert; +Cc: Eli Chen, linuxppc-embedded

Well, it ran for about 4 1/2 hours, much longer than before Gabriel's patch,
but eventually failed the same way.

Brian

sh-2.03# while /bin/true ; do cat /proc/net/* 2>/dev/null > /tmp/junk; done
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
de_put: entry net already free!
dentry.d_count = 0
dentry.d_reftime = 134236224 (current = 1634369)
dentry.d_inode = c12badc0
dentry.d_name = net
kernel BUG at /home/brian/linux/include/linux/dcache.h:251!

-----Original Message-----
From: Dan Malek [mailto:dan@mvista.com]
Sent: Monday, May 07, 2001 6:44 PM
To: Gabriel Paubert
Cc: Eli Chen; Brian Kuschak; linuxppc-embedded@lists.linuxppc.org
Subject: Re: dcache BUG()

Gabriel Paubert wrote:

> I'm more and more convinced myself that this is really a bug,

I'm convinced, too, so thanks for the suggestion.  I think it is
much more likely to show up on the 4xx because of my (assumed)
implementation than on other processors that attempt to make some
use of the EA in the instructions.

> .... It does not fix Eli's problems, so
> either my patch is wrong or something else is going on.

We have many people testing on a variety of systems, so we will
know more shortly.

Thanks again.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-08 17:43 Brian Kuschak
  2001-05-09 11:06 ` Gabriel Paubert
  0 siblings, 1 reply; 57+ messages in thread
From: Brian Kuschak @ 2001-05-08 17:43 UTC (permalink / raw)
  To: 'Gabriel Paubert'
  Cc: 'Dan Malek', Eli Chen, linuxppc-embedded

> I think that we agree that my patch fixes a real bug, but that, sadly, it
> is _not_ the bug you are seeing. Given the "prevent interrupts in atomic
> operation" patch that apparently fixes the bug, I looked for possibilities
> of stale reservations. Now your bug is even more puzzling...

Yes, I agree.  Like I said it definitely runs longer now than it did before,
but unfortunately shows the same symptom.  Here is a little more information
that I gleaned last night which may or may not be useful.

I noticed that arch/ppc/kernel/misc.S still had the (old?) assembly
functions for doing atomic operations.  They are similar but slightly
different to the inline functions in atomic.h.  I ran yesterday for about 4
hours with these "alternative" atomic functions, and I did not see the
failure.  I'm not sure if this was just a coincidence (the timing changed
slightly), or whether it indicates a problem with the inline atomic
functions.

> Can we get an oops trace if you hit it ? For now I'm puzzled.

Sure, I'll post it next time I get it.  (I already cleared the one I got
last night).

Regards,
Brian

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08  1:11                   ` Dan Malek
@ 2001-05-08 18:01                     ` David Blythe
  2001-05-08 20:27                       ` Dan Malek
  0 siblings, 1 reply; 57+ messages in thread
From: David Blythe @ 2001-05-08 18:01 UTC (permalink / raw)
  To: linuxppc-embedded

Dan Malek wrote:

> There may be something else wrong with the Ethernet driver itself.
> When I updated it to the 2.4_devel baseline, there were some weird
> cache management calls that didn't make sense.  My updates were to
> use the standard non-coherent cache management functions, and I
> changed the logic to make sense (to me :-).  From this quick update,
> I noticed it would be nice to make the transmit more efficient
> and higher performance by handling multiple frames, but it should
> function properly.

There are a large number of bugs in the 405 ethernet driver.  We (I)
were waiting until we had resolved this reference count problem, and had
some workable solution to the starvation under packet floods problem
discussed a few weeks ago with other embedded processors before posting
a patch (i.e., have the driver stand up to reasonable stress tests).
Among the bugs are:

leaks of all the receive buffers, plus other memory on every device
close,
not checking for failed allocations in skb allocations,
poor choice of cache operations when manipulating buffers,
race conditions in data structure access between the rxde and rxeob
interrupt handlers

In either event we had proven to ourselves that this "reference count"
bug happened with other nic cards when used with the 405GP processor, so
we are reasonably certain that it is not specific to the 405 ethernet
driver.  As Eli mentioned ping flooding demonstrates the problem too so
we still believe that it is a generic atomic op problem.  However, we
can only make it happen on our 405GP walnut board(s) and not our
prototype 405GP board(s) (they both have rev D processors).

Just to refresh everyone's memory, the other reference count problems we
were seeing were "Freeing alive device" messages indicating the dev
reference count had gone to zero, and another one in the skb code when
ping flooding with large packet sizes (causing lots of fragments to be
generated).

	david

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08 18:01                     ` David Blythe
@ 2001-05-08 20:27                       ` Dan Malek
  2001-05-08 21:34                         ` David Blythe
  0 siblings, 1 reply; 57+ messages in thread
From: Dan Malek @ 2001-05-08 20:27 UTC (permalink / raw)
  To: David Blythe; +Cc: linuxppc-embedded

David Blythe wrote:

> ....  However, we
> can only make it happen on our 405GP walnut board(s) and not our
> prototype 405GP board(s) (they both have rev D processors).

Oh, great :-).....Are there any obvious differences, like clock
speed, memory type or configuration, etc.?

If someone gets a chance, would you give the FSM labs linuxppc_2_5
sources a whirl before they disappear?  I don't expect it to be
perfect, but it would sure be nice debugging something else for
a change :-).

I modified the Ethernet driver cache stuff at least.....

Thanks.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08 20:27                       ` Dan Malek
@ 2001-05-08 21:34                         ` David Blythe
  2001-05-08 21:49                           ` Dan Malek
  0 siblings, 1 reply; 57+ messages in thread
From: David Blythe @ 2001-05-08 21:34 UTC (permalink / raw)
  To: linuxppc-embedded

Dan Malek wrote:
>
> David Blythe wrote:
>
> > ....  However, we
> > can only make it happen on our 405GP walnut board(s) and not our
> > prototype 405GP board(s) (they both have rev D processors).
>
> Oh, great :-).....Are there any obvious differences, like clock
> speed, memory type or configuration, etc.?

Yes, there are some differences.  Memory speed is slower at the moment
(board is still in bringup stage).  There are other subtle differences
as well.  I believe Brian also saw the bug on a non-walnut 405GP.  I was
kinda hoping it was specific to some hardware, but i don't see any
pattern.  It seems that the bug is very sensitive to timing, at least
from watching it seemingly disappear as Eli made small changes to the
atomic ops or related code, as well as Brian's comments about using the
non-inline form of the atomic ops.

>
> If someone gets a chance, would you give the FSM labs linuxppc_2_5
> sources a whirl before they disappear?  I don't expect it to be
> perfect, but it would sure be nice debugging something else for
> a change :-).

Should it run out of the box?  I can give it a try if you have some
confidence it should build and run a shell on a walnut board.

	thanks
	david

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08 21:34                         ` David Blythe
@ 2001-05-08 21:49                           ` Dan Malek
  2001-05-08 22:34                             ` Ira Weiny
  0 siblings, 1 reply; 57+ messages in thread
From: Dan Malek @ 2001-05-08 21:49 UTC (permalink / raw)
  To: David Blythe; +Cc: linuxppc-embedded

David Blythe wrote:

> Should it run out of the box?  I can give it a try if you have some
> confidence it should build and run a shell on a walnut board.

Oh, yes, it will get to a shell prompt.  I just don't know what
else you may need to run some of these tests, or what you may
discover later :-).  There have been lots of PCI updates that are
in the pipeline for the upcoming 2.4/2.4_devel that aren't part of
the linuxppc_2_5 tree.  It basically makes the PCI look like all
other PowerPC PCI systems.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08 21:49                           ` Dan Malek
@ 2001-05-08 22:34                             ` Ira Weiny
  2001-05-08 22:53                               ` Dan Malek
  0 siblings, 1 reply; 57+ messages in thread
From: Ira Weiny @ 2001-05-08 22:34 UTC (permalink / raw)
  To: Dan Malek, linxuppc-emb


Dan Malek wrote:

> Oh, yes, it will get to a shell prompt.  I just don't know what
> else you may need to run some of these tests, or what you may
> discover later :-).  There have been lots of PCI updates that are
> in the pipeline for the upcoming 2.4/2.4_devel that aren't part of
> the linuxppc_2_5 tree.  It basically makes the PCI look like all
> other PowerPC PCI systems.

When is this coming out?  I heard 2.0 has been pushed back to June?

Ira Weiny


** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-08 22:34                             ` Ira Weiny
@ 2001-05-08 22:53                               ` Dan Malek
  0 siblings, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-08 22:53 UTC (permalink / raw)
  To: Ira Weiny; +Cc: linxuppc-emb

Ira Weiny wrote:

> When is this coming out?  I heard 2.0 has been pushed back to June?

When is what coming out?  The changes for the 4xx/Walnut are in the
pipeline back to FSM Labs.  Once the 2.4/2.4_devel/linuxppc_2_5
stuff is straightened out it should pop up there.

You mean HHL 2.0 from MontaVista?  Creating a CD with supported
software, tools, and applications is a little different than
throwing a couple of 4xx PCI functions over the fence for you to
play with :-).  There are many people doing lots of software for that
release, but it doesn't change the way we work on software in the
public source trees.  There are some substantial changes among those
FSM Labs trees for the 4xx, and it takes a while to sort that out.
It doesn't do any good for me to put something there that doesn't
work, because then all I will do is answer those questions here :-).

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
  2001-05-08 17:43 Brian Kuschak
@ 2001-05-09 11:06 ` Gabriel Paubert
  0 siblings, 0 replies; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-09 11:06 UTC (permalink / raw)
  To: Brian Kuschak; +Cc: 'Dan Malek', Eli Chen, linuxppc-embedded


On Tue, 8 May 2001, Brian Kuschak wrote:

> Yes, I agree.  Like I said it definitely runs longer now than it did before,
> but unfortunately shows the same symptom.  Here is a little more information
> that I gleaned last night which may or may not be useful.

Ok, so you were hitting 2 bugs (or more, shudder).

> I noticed that arch/ppc/kernel/misc.S still had the (old?) assembly
> functions for doing atomic operations.  They are similar but slightly
> different to the inline functions in atomic.h.  I ran yesterday for about 4
> hours with these "alternative" atomic functions, and I did not see the
> failure.  I'm not sure if this was just a coincidence (the timing changed
> slightly), or whether it indicates a problem with the inline atomic
> functions.

Interesting, which compiler are you using? I've been experimenting with my
own modifications to gcc, in case I hit a problem I often do objdump
--disassemble vmlinux | less and search the instruction in which I'm
interested. In your case it would be lwarx, but there are probably too
many occurrences to make the hunt worthwhile.

Note that the semaphore code has been modified over and over again
recently to make it work (I'm still unable to convince myself that it is
correct, I need to study it more). But the semaphore code should be
independent of reference count handling.


> Sure, I'll post it next time I get it.  (I already cleared the one I got
> last night).

Thanks,

	Regards,
	Gabriel.


** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-09 16:40 Brian Kuschak
  2001-05-09 18:31 ` Dan Malek
  2001-05-09 19:18 ` Gabriel Paubert
  0 siblings, 2 replies; 57+ messages in thread
From: Brian Kuschak @ 2001-05-09 16:40 UTC (permalink / raw)
  To: 'Gabriel Paubert'
  Cc: 'Dan Malek', Eli Chen, linuxppc-embedded

> Interesting, which compiler are you using? I've been experimenting with my
> own modifications to gcc, in case I hit a problem I often do objdump
> --disassemble vmlinux | less and search the instruction in which I'm
> interested. In your case it would be lwarx, but there are probably too
> many occurrences to make the hunt worthwhile.

I'm using gcc version 2.95.2 19991030 (2.95.3 prerelease/franzo), from
MontaVista.
You're right, one of the first things I did was look at the generated assembly
code, that's how I noticed the one-bit difference between bne and bne-.
Here's a little snippet from one of the atomic ops.  It looks ok to me,
except for the fact that lwarx r11,0,r31 shows up as lwarx r11,r0,r31.
Objdump seems to do this everywhere, I'm not sure why.

static __inline__ void atomic_set(atomic_t *v, int a)
{
c004f9e8:       38 00 00 01     li      r0,1
        int t;

        __asm__ __volatile__("\n\
c004f9ec:       7d 60 f8 28     lwarx   r11,r0,r31
c004f9f0:       60 0b 00 00     ori     r11,r0,0
c004f9f4:       7d 60 f9 2d     stwcx.  r11,r0,r31
c004f9f8:       40 a2 ff f4     bne-    c004f9ec <d_alloc+0x90>

        atomic_set(&dentry->d_count, 1);

Regards,
Brian

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-09 16:40 Brian Kuschak
@ 2001-05-09 18:31 ` Dan Malek
  2001-05-09 19:18 ` Gabriel Paubert
  1 sibling, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-09 18:31 UTC (permalink / raw)
  To: Brian Kuschak; +Cc: 'Gabriel Paubert', Eli Chen, linuxppc-embedded

Brian Kuschak wrote:

> ... that's how I noticed the one-bit difference between bne and bne-.

That's the branch target hint due to the use of the '-' in the instruction.

> .... that lwarx r11,0,r31 shows up as lwarx r11,r0,r31.

That's because it really is a register field, with r0 handled as
a special case.

I've dumped out lots of your kernel image around atomic ops, and I
haven't seen anything wrong with the generated code.  That would be
too easy :-).

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
  2001-05-09 16:40 Brian Kuschak
  2001-05-09 18:31 ` Dan Malek
@ 2001-05-09 19:18 ` Gabriel Paubert
  2001-05-10 18:39   ` Frank Rowand
  1 sibling, 1 reply; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-09 19:18 UTC (permalink / raw)
  To: Brian Kuschak; +Cc: 'Dan Malek', Eli Chen, linuxppc-embedded

On Wed, 9 May 2001, Brian Kuschak wrote:

> I'm using gcc version 2.95.2 19991030 (2.95.3 prerelease/franzo), from
> MontaVista.
> Your right, one of the first things I did was look at the generated assembly
> code, that's how I noticed the one-bit difference between bne and bne-.
> Here's a little snippet from one of the atomic ops.  It looks ok to me,
> except for the fact that lwarx r11,0,r31 shows up as lwarx r11,r0,r31.

Yes, it's ok.

> Objdump seems to do this everywhere, I'm not sure why.

Just because objdump prints r0 instead of 0 systematically when
disassembling instructions, even in the case where the contents of the
register are ignored and replaced by 0. Not a problem, you can also write
the assembly source with r0 instead of 0 if you want. As long as it means
that the corresponding field in the instruction encoding is zero, it does
not matter.

>
> static __inline__ void atomic_set(atomic_t *v, int a)
> {
> c004f9e8:       38 00 00 01     li      r0,1
>         int t;
>
>         __asm__ __volatile__("\n\
> c004f9ec:       7d 60 f8 28     lwarx   r11,r0,r31
> c004f9f0:       60 0b 00 00     ori     r11,r0,0
> c004f9f4:       7d 60 f9 2d     stwcx.  r11,r0,r31
> c004f9f8:       40 a2 ff f4     bne-    c004f9ec <d_alloc+0x90>
>
>         atomic_set(&dentry->d_count, 1);

Is there any reason for atomic_set to use this sequence? I believe that a
simple store (stw in this case) would be ok. This looks like a very
convoluted and bloated way to set a variable. An aligned stw is guaranteed
to set the variable atomically wrt all other processors.

Hmmm, now thinking about the problem we are tracking. Is it possible that
we have the same problem when switching tasks ?  After all there is an rfi
there to switch contexts and I've been unable to spot a place where a
stale reservation would be cancelled in schedule on UP (it would be
cancelled on SMP because of spinlocks). Note that in many cases there are
atomic operations, but I believe that it is not guaranteed depending
on whether kernel threads are involved or not.

However, it should not affect the kernel since there should be no process
switching between lwarx and stwcx., unless you are using a preemptible
kernel. However, this seems to be a bug which could affect multithreaded
applications or applications with atomic operations on shared memory:

1) application interrupted between lwarx and stwcx.

2) switch_to another application

3) switch_to back to this application with reservation set

4) returns to the application with reservation set, stwcx. succeeds even
if the shared variable has been modified...

Patch follows, it includes my previous patch. I don't think that it will
solve your problem, unless you happen to use a preemptible kernel.
While I was at it,  I removed the useless stwcx. in transfer_to_handler.

	Regards,
	Gabriel.

===== head.S 1.13 vs edited =====
--- 1.13/arch/ppc/kernel/head.S	Fri Apr 13 20:44:42 2001
+++ edited/head.S	Wed May  9 20:53:20 2001
@@ -765,8 +765,6 @@
 	mflr	r23
 	andi.	r24,r23,0x3f00		/* get vector offset */
 	stw	r24,TRAP(r21)
-	li	r22,RESULT
-	stwcx.	r22,r22,r21		/* to clear the reservation */
 	li	r22,0
 	stw	r22,RESULT(r21)
 	mtspr	SPRG2,r22		/* r1 is now kernel sp */
===== entry.S 1.7 vs edited =====
--- 1.7/arch/ppc/kernel/entry.S	Fri Apr 13 20:44:42 2001
+++ edited/entry.S	Wed May  9 20:52:41 2001
@@ -266,6 +266,7 @@
 10:	lwz	r2,_CTR(r1)
 	lwz	r0,_LINK(r1)
 	mtctr	r2
+	stwcx.	r1,0,r1			/* Clear reservation - Gabriel. */
 	mtlr	r0
 	lwz	r2,_XER(r1)
 	lwz	r0,_CCR(r1)
@@ -382,6 +383,7 @@
 	CLR_TOP32(r8)
 	mtspr	SPRG2,r8		/* phys exception stack pointer */
 1:
+	stwcx.	r1,0,r1			/* Clear reservation - Gabriel. */
 	lwz	r3,_CTR(r1)
 	lwz	r0,_LINK(r1)
 	mtctr	r3
===== head_4xx.S 1.3 vs edited =====
--- 1.3/arch/ppc/kernel/head_4xx.S	Mon Apr  2 03:36:29 2001
+++ edited/head_4xx.S	Wed May  9 20:53:39 2001
@@ -387,8 +387,6 @@
 	mflr	r23
 	andi.	r24,r23,0x3f00		# Get vector offset
 	stw	r24,TRAP(r21)
-	li	r22,RESULT
-	stwcx.	r22,r22,r21		# Clear the reservation
 	li	r22,0
 	stw	r22,RESULT(r21)
 	mtspr	SPRN_SPRG2,r22		# r1 is now the kernel stack pointer
===== head_8xx.S 1.6 vs edited =====
--- 1.6/arch/ppc/kernel/head_8xx.S	Wed Apr 11 12:10:57 2001
+++ edited/head_8xx.S	Wed May  9 20:53:54 2001
@@ -678,8 +678,6 @@
 	mflr	r23
 	andi.	r24,r23,0x3f00		/* get vector offset */
 	stw	r24,TRAP(r21)
-	li	r22,RESULT
-	stwcx.	r22,r22,r21		/* to clear the reservation */
 	li	r22,0
 	stw	r22,RESULT(r21)
 	mtspr	SPRG2,r22		/* r1 is now kernel sp */

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-09 19:18 ` Gabriel Paubert
@ 2001-05-10 18:39   ` Frank Rowand
  2001-05-10 18:49     ` Gabriel Paubert
  0 siblings, 1 reply; 57+ messages in thread
From: Frank Rowand @ 2001-05-10 18:39 UTC (permalink / raw)
  To: Gabriel Paubert
  Cc: Brian Kuschak, 'Dan Malek', Eli Chen, linuxppc-embedded


Gabriel Paubert wrote:
>
> On Wed, 9 May 2001, Brian Kuschak wrote:


> >
> > static __inline__ void atomic_set(atomic_t *v, int a)
> > {
> > c004f9e8:       38 00 00 01     li      r0,1
> >         int t;
> >
> >         __asm__ __volatile__("\n\
> > c004f9ec:       7d 60 f8 28     lwarx   r11,r0,r31
> > c004f9f0:       60 0b 00 00     ori     r11,r0,0
> > c004f9f4:       7d 60 f9 2d     stwcx.  r11,r0,r31
> > c004f9f8:       40 a2 ff f4     bne-    c004f9ec <d_alloc+0x90>
> >
> >         atomic_set(&dentry->d_count, 1);
>
> Is there any reason for atomic_set to use this sequence. I believe that a
> simple store (stw in this case) would be ok. This looks like a very
> convoluted and bloated way to set a variable. An aligned stw is guaranteed
> to set the variable atomically wrt all other processors.

Sorry I wasn't around for the beginning of this discussion (I was off with
visiting family...), but I'll jump in now.

I put this version of atomic_set() into Brian's source.  It is one of the
things that helped reduce the severity of the dcache symptoms.  You can't
just use a stw in atomic_set(), because the other atomic operations depend
upon the stwcx.

-Frank
--
Frank Rowand <frank_rowand@mvista.com>
MontaVista Software, Inc

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-10 18:39   ` Frank Rowand
@ 2001-05-10 18:49     ` Gabriel Paubert
  2001-05-10 19:10       ` Frank Rowand
  2001-05-10 20:56       ` Dan Malek
  0 siblings, 2 replies; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-10 18:49 UTC (permalink / raw)
  To: frowand; +Cc: Brian Kuschak, 'Dan Malek', Eli Chen, linuxppc-embedded


On Thu, 10 May 2001, Frank Rowand wrote:

> Gabriel Paubert wrote:
> >
> > On Wed, 9 May 2001, Brian Kuschak wrote:
>
>
> > >
> > > static __inline__ void atomic_set(atomic_t *v, int a)
> > > {
> > > c004f9e8:       38 00 00 01     li      r0,1
> > >         int t;
> > >
> > >         __asm__ __volatile__("\n\
> > > c004f9ec:       7d 60 f8 28     lwarx   r11,r0,r31
> > > c004f9f0:       60 0b 00 00     ori     r11,r0,0
> > > c004f9f4:       7d 60 f9 2d     stwcx.  r11,r0,r31
> > > c004f9f8:       40 a2 ff f4     bne-    c004f9ec <d_alloc+0x90>
> > >
> > >         atomic_set(&dentry->d_count, 1);
> >
> > Is there any reason for atomic_set to use this sequence. I believe that a
> > simple store (stw in this case) would be ok. This looks like a very
> > convoluted and bloated way to set a variable. An aligned stw is guaranteed
> > to set the variable atomically wrt all other processors.
>
> Sorry I wasn't around for the beginning of this discussion (I was off with
> visiting family...), but I'll jump in now.
>
> I put this version of atomic_set() into Brian's source.  It is one of the
> things that helped reduce the severity of the dcache symptoms.  You can't
> just use a stw in atomic_set(), because the other atomic operations depend
> upon the stwcx.

Why not ? I'd like to find an explanation of a possible failure mode.
All PPC systems have always used a simple store for atomic_set. If it does
not work, there is something seriously wrong, perhaps even a hardware bug.

This is especially true on a UP system. Whatever value is stored by a stw
should be seen by any following lwarx/stwcx., on SMP you may need an
eieio. But on UP I can't see how it can affect anything.

Did it actually have any effect on Brian's system ?

	Gabriel.


** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-10 18:49     ` Gabriel Paubert
@ 2001-05-10 19:10       ` Frank Rowand
  2001-05-11  4:23         ` Paul Mielke
  2001-05-11 10:09         ` Gabriel Paubert
  2001-05-10 20:56       ` Dan Malek
  1 sibling, 2 replies; 57+ messages in thread
From: Frank Rowand @ 2001-05-10 19:10 UTC (permalink / raw)
  To: Gabriel Paubert
  Cc: frowand, Brian Kuschak, 'Dan Malek', Eli Chen,
	linuxppc-embedded


Gabriel Paubert wrote:
>
> On Thu, 10 May 2001, Frank Rowand wrote:
>
> > Gabriel Paubert wrote:
> > >
> > > On Wed, 9 May 2001, Brian Kuschak wrote:
> >
> >
> > > >
> > > > static __inline__ void atomic_set(atomic_t *v, int a)
> > > > {
> > > > c004f9e8:       38 00 00 01     li      r0,1
> > > >         int t;
> > > >
> > > >         __asm__ __volatile__("\n\
> > > > c004f9ec:       7d 60 f8 28     lwarx   r11,r0,r31
> > > > c004f9f0:       60 0b 00 00     ori     r11,r0,0
> > > > c004f9f4:       7d 60 f9 2d     stwcx.  r11,r0,r31
> > > > c004f9f8:       40 a2 ff f4     bne-    c004f9ec <d_alloc+0x90>
> > > >
> > > >         atomic_set(&dentry->d_count, 1);
> > >
> > > Is there any reason for atomic_set to use this sequence. I believe that a
> > > simple store (stw in this case) would be ok. This looks like a very
> > > convoluted and bloated way to set a variable. An aligned stw is guaranteed
> > > to set the variable atomically wrt all other processors.
> >
> > Sorry I wasn't around for the beginning of this discussion (I was off with
> > visiting family...), but I'll jump in now.
> >
> > I put this version of atomic_set() into Brian's source.  It is one of the
> > things that helped reduce the severity of the dcache symptoms.  You can't
> > just use a stw in atomic_set(), because the other atomic operations depend
> > upon the stwcx.
>
> Why not ? I'd like to find an explanation of a possible failure mode.
> All PPC systems have always used a simple store for atomic_set. If it does
> not work, there is something seriously wrong, perhaps even a hardware bug.
>
> This is especially true on a UP system. Whatever value is stored by a stw
> should be seen by any following lwarx/stwcx., on SMP you may need an
> eieio. But on UP I can't see how it can affect anything.

From the "PowerPC 405GP Embedded Processor User's Manual", in the "Instruction
Set" chapter (which describes each instruction), the Programming Note for lwarx
says:

  lwarx and the stwcx. instruction should be paired in a loop, as shown in the
  following example, to create the effect of an atomic operation to a memory
  area used as a semaphore between asynchronous processes.  Only lwarx can set
  the reservation bit to 1.  stwcx. sets the reservation bit to 0 upon its
  completion, whether or not stwcx. sent (RS) to memory.  CR[CR0]EQ must be
  examined to determine whether (RS) was sent to memory.

    loop: lwarx  # read the semaphore from memory; set reservation
    "alter"      # change the semaphore bits in register as required
    stwcx.       # attempt to store semaphore; reset reservation
    bne loop     # an asynchronous process has intervened; try again

  If the asynchronous process in the code example had paired lwarx with a
  store other than stwcx., the reservation bit would not have been cleared
  in the asynchronous process, and the code example would have overwritten
  the semaphore.



So if the lwarx occurs,

then an interrupt alters the flow of execution,
and the interrupt handler uses a stw to implement atomic_set(),

then the interrupt handler returns to the original flow of execution,

then the stwcx. succeeds, even though the value of the semaphore was
altered by the atomic_set().


> Did it actually have any effect on Brian's system ?

Changing atomic_set() to use lwarx / stwcx. instead of stw had an
effect on my 405GP systems here (including the Walnut and also
the same custom board that Brian is using).


-Frank
--
Frank Rowand <frank_rowand@mvista.com>
MontaVista Software, Inc

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-10 18:49     ` Gabriel Paubert
  2001-05-10 19:10       ` Frank Rowand
@ 2001-05-10 20:56       ` Dan Malek
  2001-05-10 23:14         ` Cort Dougan
  2001-05-11 10:57         ` Gabriel Paubert
  1 sibling, 2 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-10 20:56 UTC (permalink / raw)
  To: Gabriel Paubert; +Cc: frowand, Brian Kuschak, Eli Chen, linuxppc-embedded

Gabriel Paubert wrote:

> Why not ? I'd like to find an explanation of a possible failure mode.

Because the 4xx sucks........unlike other PowerPC processors, it
doesn't appear to use any of the reservation address to break or
match a lwarx.

> All PPC systems have always used a simple store for atomic_set. If it does
> not work, there is something seriously wrong, perhaps even a hardware bug.

Yeah, it does sound kind of broken, but then the 4xx isn't any
shining example of something that follows the PowerPC architecture.
Other PowerPCs have a reservation granularity, so _any_ store operation
within this will cause the reservation to be broken.  The 4xx seems
to have no granularity, and further has inverted the logic.  Without
granularity _any_ store operation anywhere should break the reservation,
but in this case no store operation will break it.....very bad.

What can happen is a lwarx to an address, in some other context a
simple store to that address (no reservation broken) then a subsequent
stwcx. to the address will appear successful.  Hmmmm.....several
ways to fix it, I wonder what will work best.....

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-10 21:20 Brian Kuschak
  2001-05-10 21:26 ` Dan Malek
  0 siblings, 1 reply; 57+ messages in thread
From: Brian Kuschak @ 2001-05-10 21:20 UTC (permalink / raw)
  To: 'Dan Malek', Gabriel Paubert; +Cc: frowand, Eli Chen, linuxppc-embedded

> What can happen is a lwarx to an address, in some other context a
> simple store to that address (no reservation broken) then a subsequent
> stwcx. to the address will appear successful.  Hmmmm.....several
> ways to fix it, I wonder what will work best.....

Yes, this sounds bad.  However, I wonder if it's really the cause of this
particular bug.  I cannot find any code which does a regular store to the
d_count variables.  All writes to these counters appear to use the atomic
ops.

Brian

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-10 21:20 Brian Kuschak
@ 2001-05-10 21:26 ` Dan Malek
  0 siblings, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-10 21:26 UTC (permalink / raw)
  To: Brian Kuschak; +Cc: Gabriel Paubert, frowand, Eli Chen, linuxppc-embedded

Brian Kuschak wrote:

> ....  I cannot find any code which does a regular store to the
> d_count variables.  All writes to these counters appear to use the atomic
> ops.

Yeah, but there are "atomic" ops/functions/macros that underneath
can do simple read/writes without using the PowerPC reservation
instructions (because they haven't been needed on other processors).

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-10 20:56       ` Dan Malek
@ 2001-05-10 23:14         ` Cort Dougan
  2001-05-11 11:01           ` Gabriel Paubert
  2001-05-11 10:57         ` Gabriel Paubert
  1 sibling, 1 reply; 57+ messages in thread
From: Cort Dougan @ 2001-05-10 23:14 UTC (permalink / raw)
  To: Dan Malek
  Cc: Gabriel Paubert, frowand, Brian Kuschak, Eli Chen,
	linuxppc-embedded

Dan... your arguments aside, the 4xx is a BookE processor and IBM says that
"BookE is good".  Aren't IBM assurances enough for you?

I'd claim that BookE isn't PowerPC, myself.  I'm sad that Linux/PPC
supports that pile of crap.

} Because the 4xx sucks........unlike other PowerPC processors, it
} doesn't appear to use any of the reservation address to break or
} match a lwarx.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-10 19:10       ` Frank Rowand
@ 2001-05-11  4:23         ` Paul Mielke
  2001-05-11 10:09         ` Gabriel Paubert
  1 sibling, 0 replies; 57+ messages in thread
From: Paul Mielke @ 2001-05-11  4:23 UTC (permalink / raw)
  To: frowand, Gabriel Paubert
  Cc: frowand, Brian Kuschak, 'Dan Malek', Eli Chen,
	linuxppc-embedded

At 12:10 PM 5/10/01 -0700, Frank Rowand wrote:

>So if the lwarx occurs,
>
>then an interrupt alters the flow of execution,
>and the interrupt handler uses a stw to implement atomic_set(),
>
>then the interrupt handler returns to the original flow of execution,
>
>then the stwcx. succeeds, even though the value of the semaphore was
>altered by the atomic_set().

Frank,

I don't see how the above example can actually cause a failure.  Isn't it
a fundamental assumption that any change of process context must
clear the reservation bit?  If you don't do that, then none of this works.

So just before the RFI at step 4) of your example, the reservation bit
should be cleared and then the stwcx in the interrupted context fails
as one would expect.

Paul Mielke                        paulm@routefree.com
RouteFree, Inc.                   (650) 739-5377

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-10 19:10       ` Frank Rowand
  2001-05-11  4:23         ` Paul Mielke
@ 2001-05-11 10:09         ` Gabriel Paubert
  1 sibling, 0 replies; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-11 10:09 UTC (permalink / raw)
  To: frowand; +Cc: Brian Kuschak, 'Dan Malek', Eli Chen, linuxppc-embedded


On Thu, 10 May 2001, Frank Rowand wrote:

> Gabriel Paubert wrote:
> >
> > Why not ? I'd like to find an explanation of a possible failure mode.
> > All PPC systems have always used a simple store for atomic_set. If it does
> > not work, there is something seriously wrong, perhaps even a hardware bug.
> >
> > This is especially true on a UP system. Whatever value is stored by a stw
> > should be seen by any following lwarx/stwcx., on SMP you may need an
> > eieio. But on UP I can't see how it can affect anything.
>
> >From the "PowerPC 405GP Embedded Processor User's Manual", in the "Instruction
> Set" chapter (which describes each instruction), the Programming Note for lwarx
> says:
>
>   lwarx and the stwcx. instruction should be paired in a loop, as shown in the
>   following example, to create the effect of an atomic operation to a memory
>   area used as a semaphore between asynchronous processes.  Only lwarx can set
>   the reservation bit to 1.  stwcx. sets the reservation bit to 0 upon its
>   completion, whether or not stwcx. sent (RS) to memory.  CR[CR0]EQ must be
>   examined to determine whether (RS) was sent to memory.
>
>     loop: lwarx  # read the semaphore from memory; set reservation
>     "alter"      # change the semaphore bits in register as required
>     stwcx.       # attempt to store semaphore; reset reservation
>     bne loop     # an asynchronous process has intervened; try again
>
>   If the asynchronous process in the code example had paired lwarx with a
>   store other than stwcx., the reservation bit would not have been cleared
>   in the asynchronous process, and the code example would have overwritten
>   the semaphore.
>
>
>
> So if the lwarx occurs,
>
> then an interrupt alters the flow of execution,
> and the interrupt handler uses a stw to implement atomic_set(),
>
> then the interrupt handler returns to the original flow of execution,
>
> then the stwcx. succeeds, even though the value of the semaphore was
> altered by the atomic_set().

The solution to this is to guarantee that the reservation is always lost
on return from interrupt. This is exactly what my patch does, but it does
it just before the rfi, guaranteeing that the reservation is lost in all
cases, even if you have a down_trylock on another path which could return
with a stale reservation.

> > Did it actually have any effect on Brian's system ?
>
> Changing atomic_set() to use lwarx / stwcx. instead of stw had an
> effect on my 405GP systems here (including the Walnut and also
> the same custom board that Brian is using).

Please try the last patch I sent to the list instead and report. It should
have the same effect and protect against other (actually all AFAICT) cases
of stale reservations.

I might still have missed some cases, but it won't bloat the atomic_set()
macros and handle correctly the case of an interrupt that ends with
a failing down_trylock or spin_trylock just before returning.

	Regards,
	Gabriel.


** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-10 20:56       ` Dan Malek
  2001-05-10 23:14         ` Cort Dougan
@ 2001-05-11 10:57         ` Gabriel Paubert
  2001-05-11 18:49           ` Dan Malek
  1 sibling, 1 reply; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-11 10:57 UTC (permalink / raw)
  To: Dan Malek; +Cc: frowand, Brian Kuschak, Eli Chen, linuxppc-embedded

On Thu, 10 May 2001, Dan Malek wrote:

> Gabriel Paubert wrote:
>
> > Why not ? I'd like to find an explanation of a possible failure mode.
>
> Because the 4xx sucks........unlike other PowerPC processors, it
> doesn't appear to use any of the reservation address to break or
> match a lwarx.

Neither do 601, 603, and a lot of others...

>
> > All PPC systems have always used a simple store for atomic_set. If it does
> > not work, there is something seriously wrong, perhaps even a hardware bug.
>
> Yeah, it does sound kind of broken, but then the 4xx isn't any
> shining example of something that follows the PowerPC architecture.
> Other PowerPCs have a reservation granularity, so _any_ store operation
> within this will cause the reservation to be broken.  The 4xx seems
> to have no granularity, and further has inverted the logic.  Without
> granularity _any_ store operation anywhere should break the reservation,
> but in this case no store operation will break it.....very bad.

I am confused, but stores from the local processor _never_ clear the
reservation (at least on all processors I have tested) and this is very
clearly documented. The only two operations which are guaranteed to clear
the reservation are:

a) stwcx.

b) snoops for writes to an address within the reservation granule

The reservation address is monitored on the external bus, not on the
internal side.

For a proof I just tested the sequence lwarx + stw + sync (just in
case) + stwcx. to the same address as well as stwcx. at a different
address from the lwarx on 7400, 750 and 603e. They are all successful.
I have appended the test code so you can check it.

> What can happen is a lwarx to an address, in some other context a
> simple store to that address (no reservation broken) then a subsequent
> stwcx. to the address will appear successful.  Hmmmm.....several
> ways to fix it, I wonder what will work best.....

Not even "in some other contexts". I think that moving the stwcx. from
transfer_to_handler to interrupt returns solves this, since it guarantees
that you never let a stale reservation mess up with an interrupted atomic
sequence.  Unless of course you have a hardware bug and stwcx. does not
clear the reservation in some cases, which would be serious enough to
say that this part is not supported.

If I miss something, I'd like a scenario like the ones I posted on this
thread to be enlightened. Especially a scenario which would fail with my
patch applied.

	Regards,
	Gabriel.

Code to verify:
a) that local stores do not clear the reservation.
b) that stwcx. does not check the address

#include <stdio.h>

static volatile int atom[1024];

/*
 * User-space experiment for the PowerPC lwarx/stwcx. reservation.
 *
 * Two cases are exercised, each printing the value left in memory and a
 * flag extracted from CR0 after the stwcx. (the conditional store reports
 * its outcome in CR0[EQ]; the mfcr/rlwinm pair isolates that bit into tmp):
 *
 *   1) lwarx, then a plain stw to the SAME word, sync, then stwcx. --
 *      shows whether a local ordinary store clears the reservation.
 *   2) lwarx on atom[0], then stwcx. to a DIFFERENT word (atom[512]) --
 *      shows whether stwcx. checks the reserved address.
 *
 * The asm bodies are written as adjacent string literals: a raw newline
 * inside a string literal is not valid C and is rejected by current
 * compilers.  __volatile__ keeps the compiler from moving or dropping the
 * sequences, since the whole point is to run them exactly as written.
 *
 * Returns 0 unconditionally; the result of the experiment is the printed
 * output, not the exit status.
 */
int main(int argc, char**argv)
{
	int tmp;

	(void)argc;
	(void)argv;

	atom[0] = 0;
	/* Check for stw between lwarx and stwcx. */
	/* "b" constraint: tmp is used as the base operand of addi, so it must not be r0. */
	__asm__ __volatile__(
		"lwarx %0,0,%2\n\t"
		"stw %3,0(%2)\n\t"
		"sync\n\t"
		"addi %0,%0,2\n\t"
		"stwcx. %0,0,%2\n\t"
		"mfcr %0\n\t"
		"rlwinm %0,%0,3,1"
		: "=&b" (tmp), "=m" (atom[0])
		: "r" (atom+0), "r" (4), "m" (atom[0])
		: "cr0");
	printf("Last stored value is %d, stwcx. flag=%d\n", atom[0], tmp);

	/* Check for stwcx. at a different address from lwarx */
	atom[0] = -1;
	atom[512] = 0;
	__asm__ __volatile__(
		"lwarx %0,0,%2\n\t"
		"stwcx. %0,0,%3\n\t"
		"mfcr %0\n\t"
		"rlwinm %0,%0,3,1"
		: "=&r" (tmp), "=m" (atom[512])
		: "r" (atom+0), "r" (atom+512), "m" (atom[0])
		: "cr0");
	printf("Last stored value is %d, stwcx. flag=%d\n", atom[512], tmp);

	return 0;
}

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-10 23:14         ` Cort Dougan
@ 2001-05-11 11:01           ` Gabriel Paubert
  0 siblings, 0 replies; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-11 11:01 UTC (permalink / raw)
  To: Cort Dougan
  Cc: Dan Malek, frowand, Brian Kuschak, Eli Chen, linuxppc-embedded


On Thu, 10 May 2001, Cort Dougan wrote:

> Dan... your arguments aside, the 4xx is a BookE processor and IBM says that
> "BookE is good".  Aren't IBM assurances enough for you?
>
> I'd claim that BookE isn't PowerPC, myself.  I'm sad that Linux/PPC
> supports that pile of crap.

I fully agree with both statements. I don't want to touch any BookE
processor, especially the 64 bit ones.

	Gabriel.


** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-11 10:57         ` Gabriel Paubert
@ 2001-05-11 18:49           ` Dan Malek
  0 siblings, 0 replies; 57+ messages in thread
From: Dan Malek @ 2001-05-11 18:49 UTC (permalink / raw)
  To: Gabriel Paubert; +Cc: frowand, Brian Kuschak, Eli Chen, linuxppc-embedded

Gabriel Paubert wrote:
>

> Nother do 601, 603, and a lot of others...

Oh, I believe they do..........but, I haven't been wrong lately,
so maybe it's my turn again :-).

> For a proof I just tested the sequence lwarx + stw + sync ....

OK, so I learned my one thing for the day, I'm going home now...

> Not even "in some other contexts".

That was a bad choice of words on my part.  I should have used
"function scope" or something like that.  I was responding to about
three different messages at the time, sorry.

	-- Dan

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: dcache BUG()
@ 2001-05-12  0:44 Brian Kuschak
  2001-05-12  0:57 ` Eli Chen
  0 siblings, 1 reply; 57+ messages in thread
From: Brian Kuschak @ 2001-05-12  0:44 UTC (permalink / raw)
  To: 'Gabriel Paubert'
  Cc: 'Dan Malek', Eli Chen, linuxppc-embedded

Hi Gabriel,

I tried your patch, but as you expected it didn't fix the problem.  I ran it
twice, once with my kernel as I got it from MontaVista, and once with the
atomic_set() function using regular stw (as it is in the official sources, I
believe).

In both cases I got the de_put messages in a few minutes, and once I got the
dentry BUG().

I think Eli might have had better luck, but I'm not sure how long he was
running the test.

Regards,
Brian

> Patch follows, it includes my previous patch. I don't think that it will
> solve your problem, unless you happen to use a preemptible kernel.
> While I was at it,  I removed the useless stwcx. in transfer_to_handler.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-12  0:44 Brian Kuschak
@ 2001-05-12  0:57 ` Eli Chen
  2001-05-14  9:28   ` Gabriel Paubert
  0 siblings, 1 reply; 57+ messages in thread
From: Eli Chen @ 2001-05-12  0:57 UTC (permalink / raw)
  To: Brian Kuschak, 'Gabriel Paubert'
  Cc: 'Dan Malek', linuxppc-embedded


No luck here.  I checked the patched kernel running Brian's test after an
hour and it had frozen up, with one de_put message.

Eli

----- Original Message -----
From: "Brian Kuschak" <brian.kuschak@skystream.com>
To: "'Gabriel Paubert'" <paubert@iram.es>
Cc: "'Dan Malek'" <dan@mvista.com>; "Eli Chen" <eli@routefree.com>;
<linuxppc-embedded@lists.linuxppc.org>
Sent: Friday, May 11, 2001 5:44 PM
Subject: RE: dcache BUG()


> Hi Gabriel,
>
> I tried your patch, but as you expected it didn't fix the problem.  I ran
it
> twice, once with my kernel as I got it from MontaVista, and once with the
> atomic_set() function using regular stw (as it is in the offical sources,
I
> believe).
>
> In both cases I got the de_put messages in a few minutes, and once I got
the
> dentry BUG().
>
> I think Eli might have had better luck, but I'm not sure how long he was
> running the test.
>
> Regards,
> Brian
>
>
>
> > Patch follows, it includes my previous patch. I don't think that it will
> > solve your problem, unless you happen to use a preemptible kernel.
> > While I was at it,  I removed the useless stwcx. in transfer_to_handler.


** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: dcache BUG()
  2001-05-12  0:57 ` Eli Chen
@ 2001-05-14  9:28   ` Gabriel Paubert
  0 siblings, 0 replies; 57+ messages in thread
From: Gabriel Paubert @ 2001-05-14  9:28 UTC (permalink / raw)
  To: Eli Chen; +Cc: Brian Kuschak, 'Dan Malek', linuxppc-embedded

On Fri, 11 May 2001, Eli Chen wrote:

> No luck here.  I checked the patched kernel running Brian's test after an
> hour and it had frozen up, with one de_put message.

Do you have a stack trace or something similar ?

Does my patch increase the time to failure ?

I've not yet got anything better than these messages without any context
(stack trace, etc). I'd prefer to have them with symbols, but even with
only register contents it would be better than nothing.

I'm currently thinking of ways to catch the occurrences of the problem.
The difficulty is that brute force debugging likely won't work on what
looks like a subtle timing problem. So I need a non-invasive check, and
that's far from trivial.

Along with the stack trace, please include as many details as possible on
your system.

	Regards,
	Gabriel.

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2001-05-14  9:28 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-05-07 19:04 dcache BUG() Eli Chen
2001-05-07 21:04 ` Dan Malek
2001-05-07 21:17 ` Dan Malek
2001-05-07 21:30   ` Tom Rini
2001-05-07 23:03     ` Dan Malek
2001-05-07 21:47   ` Eli Chen
2001-05-07 23:01     ` Dan Malek
2001-05-07 23:06     ` Gabriel Paubert
2001-05-07 23:15       ` Dan Malek
2001-05-07 23:28         ` Gabriel Paubert
2001-05-07 23:35         ` Eli Chen
2001-05-07 23:36           ` Dan Malek
2001-05-08  0:16             ` Eli Chen
2001-05-08  0:41               ` Dan Malek
2001-05-08  1:14                 ` Eli Chen
2001-05-08  1:11                   ` Dan Malek
2001-05-08 18:01                     ` David Blythe
2001-05-08 20:27                       ` Dan Malek
2001-05-08 21:34                         ` David Blythe
2001-05-08 21:49                           ` Dan Malek
2001-05-08 22:34                             ` Ira Weiny
2001-05-08 22:53                               ` Dan Malek
2001-05-08  1:37             ` Gabriel Paubert
2001-05-08  1:44               ` Dan Malek
2001-05-07 23:40           ` Gabriel Paubert
  -- strict thread matches above, loose matches on Subject: below --
2001-05-12  0:44 Brian Kuschak
2001-05-12  0:57 ` Eli Chen
2001-05-14  9:28   ` Gabriel Paubert
2001-05-10 21:20 Brian Kuschak
2001-05-10 21:26 ` Dan Malek
2001-05-09 16:40 Brian Kuschak
2001-05-09 18:31 ` Dan Malek
2001-05-09 19:18 ` Gabriel Paubert
2001-05-10 18:39   ` Frank Rowand
2001-05-10 18:49     ` Gabriel Paubert
2001-05-10 19:10       ` Frank Rowand
2001-05-11  4:23         ` Paul Mielke
2001-05-11 10:09         ` Gabriel Paubert
2001-05-10 20:56       ` Dan Malek
2001-05-10 23:14         ` Cort Dougan
2001-05-11 11:01           ` Gabriel Paubert
2001-05-11 10:57         ` Gabriel Paubert
2001-05-11 18:49           ` Dan Malek
2001-05-08 17:43 Brian Kuschak
2001-05-09 11:06 ` Gabriel Paubert
2001-05-08 15:43 Brian Kuschak
2001-05-08  3:36 Brian Kuschak
2001-05-08  1:53 Brian Kuschak
2001-05-08  2:03 ` Dan Malek
2001-05-08 11:59 ` Gabriel Paubert
2001-05-08  0:40 Brian Kuschak
2001-05-07 23:01 Brian Kuschak
2001-05-07 22:19 Brian Kuschak
2001-05-07 22:35 ` Cort Dougan
2001-05-07 22:43 ` Eli Chen
2001-05-07 17:21 Brian Kuschak
2001-05-07 20:58 ` Dan Malek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).