[PATCH] fix BUG in tg3

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] fix BUG in tg3_tx
@ 2004-05-24  7:26 Greg Banks
  2004-05-24  7:40 ` David S. Miller
  0 siblings, 1 reply; 24+ messages in thread
From: Greg Banks @ 2004-05-24  7:26 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Network Development List

G'day,

The tg3 transmit code assumes that tg3_tx() will never have to clean
up part of an skb queued for transmit.  This assumption is wrong;
sometimes tg3_tx() will be called partway through an skb, which trips
the first BUG() and drops the machine into kdb.  We've seen this happen
on 2.4 (rarely) and 2.6 (easy to reproduce, just run about 5 minutes'
worth of NFS or Samba load).

This patch against 2.6.6 fixes this problem by changing the shadow
transmit logic to put the non-NULL skb pointer in the *last* ring
entry instead of the first, so that it is freed only after all
parts have been DMAd onto the card, and can handle tg3_tx() being
called partway through.

SGI PV 903520 and PV 891640.



Index: linux/drivers/net/tg3.c
===================================================================
--- linux.orig/drivers/net/tg3.c	Mon May 10 12:32:38 2004
+++ linux/drivers/net/tg3.c	Mon May 24 17:23:12 2004
@@ -2103,10 +2103,6 @@
 	return err;
 }
 
-/* Tigon3 never reports partial packet sends.  So we do not
- * need special logic to handle SKBs that have not had all
- * of their frags sent yet, like SunGEM does.
- */
 static void tg3_tx(struct tg3 *tp)
 {
 	u32 hw_idx = tp->hw_status->idx[0].tx_consumer;
@@ -2115,37 +2111,26 @@
 	while (sw_idx != hw_idx) {
 		struct tx_ring_info *ri = &tp->tx_buffers[sw_idx];
 		struct sk_buff *skb = ri->skb;
-		int i;
-
-		if (unlikely(skb == NULL))
-			BUG();
-
-		pci_unmap_single(tp->pdev,
-				 pci_unmap_addr(ri, mapping),
-				 skb_headlen(skb),
-				 PCI_DMA_TODEVICE);
 
 		ri->skb = NULL;
-
-		sw_idx = NEXT_TX(sw_idx);
-
-		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-			if (unlikely(sw_idx == hw_idx))
-				BUG();
-
-			ri = &tp->tx_buffers[sw_idx];
-			if (unlikely(ri->skb != NULL))
-				BUG();
-
+		
+		if (ri->index == 0) {
+			pci_unmap_single(tp->pdev,
+					 pci_unmap_addr(ri, mapping),
+					 pci_unmap_len(ri, len),
+					 PCI_DMA_TODEVICE);
+		} else {
 			pci_unmap_page(tp->pdev,
 				       pci_unmap_addr(ri, mapping),
-				       skb_shinfo(skb)->frags[i].size,
+				       pci_unmap_len(ri, len),
 				       PCI_DMA_TODEVICE);
-
-			sw_idx = NEXT_TX(sw_idx);
 		}
-
-		dev_kfree_skb_irq(skb);
+		if (skb) {
+			if (unlikely(ri->index != skb_shinfo(skb)->nr_frags))
+				BUG();
+			dev_kfree_skb_irq(skb);
+		}
+		sw_idx = NEXT_TX(sw_idx);
 	}
 
 	tp->tx_cons = sw_idx;
@@ -2605,6 +2590,18 @@
 	schedule_work(&tp->reset_task);
 }
 
+static inline void tg3_set_txri(struct tg3 *tp, int entry, struct sk_buff *skb,
+				int index, dma_addr_t addr, int len)
+{
+	struct tx_ring_info *ri = &tp->tx_buffers[entry];
+
+	ri->skb = (index == skb_shinfo(skb)->nr_frags ? skb : NULL);
+	ri->index = index;
+	pci_unmap_addr_set(ri, mapping, addr);
+	pci_unmap_len_set(ri, len, len);
+}
+
+
 static void tg3_set_txd(struct tg3 *, int, dma_addr_t, int, u32, u32);
 
 static int tigon3_4gb_hwbug_workaround(struct tg3 *tp, struct sk_buff *skb,
@@ -2645,6 +2642,7 @@
 		if (i == 0) {
 			tp->tx_buffers[entry].skb = new_skb;
 			pci_unmap_addr_set(&tp->tx_buffers[entry], mapping, new_addr);
+			pci_unmap_len_set(&tp->tx_buffers[entry], len, new_skb->len);
 		} else {
 			tp->tx_buffers[entry].skb = NULL;
 		}
@@ -2804,8 +2802,7 @@
 	/* Queue skb data, a.k.a. the main skb fragment. */
 	mapping = pci_map_single(tp->pdev, skb->data, len, PCI_DMA_TODEVICE);
 
-	tp->tx_buffers[entry].skb = skb;
-	pci_unmap_addr_set(&tp->tx_buffers[entry], mapping, mapping);
+	tg3_set_txri(tp, entry, skb, 0, mapping, len);
 
 	would_hit_hwbug = 0;
 
@@ -2831,8 +2828,7 @@
 					       frag->page_offset,
 					       len, PCI_DMA_TODEVICE);
 
-			tp->tx_buffers[entry].skb = NULL;
-			pci_unmap_addr_set(&tp->tx_buffers[entry], mapping, mapping);
+			tg3_set_txri(tp, entry, skb, i+1, mapping, len);
 
 			if (tg3_4g_overflow_test(mapping, len)) {
 				/* Only one should match. */
@@ -3002,8 +2998,7 @@
 	/* Queue skb data, a.k.a. the main skb fragment. */
 	mapping = pci_map_single(tp->pdev, skb->data, len, PCI_DMA_TODEVICE);
 
-	tp->tx_buffers[entry].skb = skb;
-	pci_unmap_addr_set(&tp->tx_buffers[entry], mapping, mapping);
+	tg3_set_txri(tp, entry, skb, 0, mapping, len);
 
 	tg3_set_txd(tp, entry, mapping, len, base_flags,
 		    (skb_shinfo(skb)->nr_frags == 0) | (mss << 1));
@@ -3025,8 +3020,7 @@
 					       frag->page_offset,
 					       len, PCI_DMA_TODEVICE);
 
-			tp->tx_buffers[entry].skb = NULL;
-			pci_unmap_addr_set(&tp->tx_buffers[entry], mapping, mapping);
+			tg3_set_txri(tp, entry, skb, i+1, mapping, len);
 
 			tg3_set_txd(tp, entry, mapping, len,
 				    base_flags, (i == last));
Index: linux/drivers/net/tg3.h
===================================================================
--- linux.orig/drivers/net/tg3.h	Mon May 10 12:32:02 2004
+++ linux/drivers/net/tg3.h	Mon May 24 16:58:28 2004
@@ -1774,6 +1774,8 @@
 struct tx_ring_info {
 	struct sk_buff			*skb;
 	DECLARE_PCI_UNMAP_ADDR(mapping)
+	DECLARE_PCI_UNMAP_LEN(len)
+	u32				index;
 	u32				prev_vlan_tag;
 };
 




Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-24  7:26 [PATCH] fix BUG in tg3_tx Greg Banks
@ 2004-05-24  7:40 ` David S. Miller
  2004-05-24  8:04   ` Greg Banks
  0 siblings, 1 reply; 24+ messages in thread
From: David S. Miller @ 2004-05-24  7:40 UTC (permalink / raw)
  To: Greg Banks; +Cc: netdev

On Mon, 24 May 2004 17:26:58 +1000
Greg Banks <gnb@sgi.com> wrote:

> The tg3 transmit code assumes that tg3_tx() will never have to clean
> up part of an skb queued for transmit.  This assumption is wrong;

Greg, perhaps my reading of the tg3 chip docs is different
from yours.  The hardware is NEVER supposed to do this.

Or is there an errata in some chip versions?

I've never triggered that BUG() assertion on any of my
hardware, ever.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-24  7:40 ` David S. Miller
@ 2004-05-24  8:04   ` Greg Banks
  2004-05-24 17:06     ` David S. Miller
  0 siblings, 1 reply; 24+ messages in thread
From: Greg Banks @ 2004-05-24  8:04 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

On Mon, May 24, 2004 at 12:40:45AM -0700, David S. Miller wrote:
> On Mon, 24 May 2004 17:26:58 +1000
> Greg Banks <gnb@sgi.com> wrote:
> 
> > The tg3 transmit code assumes that tg3_tx() will never have to clean
> > up part of an skb queued for transmit.  This assumption is wrong;
> 
> Greg, perhaps my reading of the tg3 chip docs is different
> from yours.  The hardware is NEVER supposed to do this.

I'd like to know where you read that, because neither I nor any of
the other SGI engineers who have read the Broadcom docs can find any
such guarantee.  The consensus here is that there is no guarantee
about where the transmit ring consumer index is when the card DMAs
a status block update.

We've seen that BUG() trip many times both on 2.4 and 2.6.  SGI ProPack
for Linux has been shipping for a year now with a workaround for this
bug (the BUG() is changed to a break).

> Or is there an errata in some chip versions?

We've seen this on both 5701 and 5704 hardware.  The SGI cards ship
with slightly different firmware from stock Broadcom cards, I don't
know if that's a factor.  It might also be something to do with the
Altix IO architecture.  I don't know why the card chooses to do this,
but it most certainly does.

The IRIX driver guys tell me this same behaviour happens on the
Origin hardware, and the IRIX driver had an equivalent fix applied
about 18 months ago.

> I've never triggered that BUG() assertion on any of my
> hardware, ever.

Ok, here's one of several mostly identical stack traces reported by
various people inside SGI on 2.6 kernels.  If you want to wait for
a day or so I can probably make a 2.4 kernel do this also (I did
that during testing a couple of days ago but didn't save the stack
trace, doh!)

[root@budgie root]# swapper[0]: bugcheck! 0 [1]

Pid: 0, CPU 0, comm:              swapper
psr : 0000101009022018 ifs : 8000000000000d1e ip  : [<a000000200165980>]
Not tainted
ip is at tg3_tx+0x5a0/0x5c0 [tg3]
unat: 0000000000000000 pfs : 0000000000000d1e rsc : 0000000000000003
rnat: 800000025da66955 bsps: a0000001000fdec0 pr  : 80000000ff7669a5
ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c8a70033f
csd : 0000000000000000 ssd : 0000000000000000
b0  : a000000200165980 b6  : a000000100003320 b7  : a0000001000cc200
f6  : 1003e0fc0fc0fc0fc0fc1 f7  : 0ffdaa200000000000000
f8  : 1003e0000000000000240 f9  : 1003e0000000000002490
f10 : 1003e000000000ea00000 f11 : 1003e00000000367b7ad0
r1  : a0000001009ec7f0 r2  : 0000000000000000 r3  : 0000000000004000
r8  : 0000000000000026 r9  : 0000000000000002 r10 : 0000000000000001
r11 : 0000000000000004 r12 : e0000030146a3c50 r13 : e00000301469c000
r14 : 0000000000004000 r15 : a000000100734fb0 r16 : e00000b004a147a8
r17 : e00000b07ba88060 r18 : e000003007ba0000 r19 : 0000000000180000
r20 : 0000000000000014 r21 : 0000000000080000 r22 : 0000000000100000
r23 : e00000b004a150e4 r24 : e00000b004a150d8 r25 : e0000030146a3bf0
r26 : e00000b004a15810 r27 : 0000000000000074 r28 : 0000000000000074
r29 : e000003007ba002c r30 : e00000b07ba8802c r31 : 0000000000000002

Call Trace:
 [<a000000100015180>] show_stack+0x80/0xa0
                                sp=e0000030146a3820 bsp=e00000301469d2c8
 [<a0000001000384f0>] die+0x1b0/0x280
                                sp=e0000030146a39f0 bsp=e00000301469d2a0
 [<a000000100038960>] ia64_bad_break+0x340/0x480
                                sp=e0000030146a39f0 bsp=e00000301469d280
 [<a00000010000de20>] ia64_leave_kernel+0x0/0x260
                                sp=e0000030146a3a80 bsp=e00000301469d280
 [<a000000200165980>] tg3_tx+0x5a0/0x5c0 [tg3]
                                sp=e0000030146a3c50 bsp=e00000301469d188
 [<a000000200166cf0>] tg3_interrupt_main_work+0x150/0x280 [tg3]
                                sp=e0000030146a3c50 bsp=e00000301469d158
 [<a000000200166f20>] tg3_interrupt+0x100/0x1c0 [tg3]
                                sp=e0000030146a3c50 bsp=e00000301469d110
 [<a000000100011640>] handle_IRQ_event+0xa0/0x120
                                sp=e0000030146a3c50 bsp=e00000301469d0c8
 [<a000000100012170>] do_IRQ+0x390/0x4a0
                                sp=e0000030146a3c50 bsp=e00000301469d078
 [<a000000100014160>] ia64_handle_irq+0xc0/0x1a0
                                sp=e0000030146a3c50 bsp=e00000301469d040
 [<a00000010000de20>] ia64_leave_kernel+0x0/0x260
                                sp=e0000030146a3c50 bsp=e00000301469d040
 [<a000000100084010>] snidle+0xb0/0x180
                                sp=e0000030146a3e20 bsp=e00000301469d038
 [<a000000100015cb0>] cpu_idle+0x130/0x220
                                sp=e0000030146a3e20 bsp=e00000301469cfa8
 [<a000000100630fa0>] start_kernel+0x460/0x4e0
                                sp=e0000030146a3e20 bsp=e00000301469cf50
 [<a000000100008600>] _start+0x2c0/0x0
                                sp=e0000030146a3e30 bsp=e00000301469cf50
 3 out of 4 cpus in kdb, waiting for the rest
1 cpu are not in kdb, their state is unknown

Entering kdb (current=0xe00000301469c000, pid 0) on processor 0 Oops: <NULL>
due to oops @ 0xa000000200165980
 psr: 0x0000101009022018   ifs: 0x8000000000000d1e    ip: 0xa000000200165980  
unat: 0x0000000000000000   pfs: 0x0000000000000d1e   rsc: 0x0000000000000003  
rnat: 0x800000025da66955  bsps: 0xa0000001000fdec0    pr: 0x80000000ff7669a5  
ldrs: 0x0000000000000000   ccv: 0x0000000000000000  fpsr: 0x0009804c8a70033f  
  b0: 0xa000000200165980    b6: 0xa000000100003320    b7: 0xa0000001000cc200  
  r1: 0xa0000001009ec7f0    r2: 0x0000000000000000    r3: 0x0000000000004000  
  r8: 0x0000000000000026    r9: 0x0000000000000002   r10: 0x0000000000000001  
 r11: 0x0000000000000004   r12: 0xe0000030146a3c50   r13: 0xe00000301469c000  
 r14: 0x0000000000004000   r15: 0xa000000100734fb0   r16: 0xe00000b004a147a8  
 r17: 0xe00000b07ba88060   r18: 0xe000003007ba0000   r19: 0x0000000000180000  
 r20: 0x0000000000000014   r21: 0x0000000000080000   r22: 0x0000000000100000  
 r23: 0xe00000b004a150e4   r24: 0xe00000b004a150d8   r25: 0xe0000030146a3bf0  
 r26: 0xe00000b004a15810   r27: 0x0000000000000074   r28: 0x0000000000000074  
 r29: 0xe000003007ba002c   r30: 0xe00000b07ba8802c   r31: 0x0000000000000002  
&regs = e0000030146a3a90
[0]kdb>



Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-24  8:04   ` Greg Banks
@ 2004-05-24 17:06     ` David S. Miller
  2004-05-25  1:04       ` Greg Banks
  0 siblings, 1 reply; 24+ messages in thread
From: David S. Miller @ 2004-05-24 17:06 UTC (permalink / raw)
  To: Greg Banks; +Cc: netdev, mchan

[ Michael, the discussion here is about whether the tigon3 hardware
  ever partially ACK's completion of a multi-frag TX frame.  I
  believe it never does, but Greg claims he can trigger such a case
  and has proposed a patch to the tg3 driver which attempts to handle that. ]

On Mon, 24 May 2004 18:04:31 +1000
Greg Banks <gnb@sgi.com> wrote:

> On Mon, May 24, 2004 at 12:40:45AM -0700, David S. Miller wrote:
> > On Mon, 24 May 2004 17:26:58 +1000
> > Greg Banks <gnb@sgi.com> wrote:
> > 
> > > The tg3 transmit code assumes that tg3_tx() will never have to clean
> > > up part of an skb queued for transmit.  This assumption is wrong;
> > 
> > Greg, perhaps my reading of the tg3 chip docs is different
> > from yours.  The hardware is NEVER supposed to do this.
> 
> I'd like to know where you read that, because neither I nor any of
> the other SGI engineers who have read the Broadcom docs can find any
> such guarantee.

The most relevant (and accurate) piece of documentation for the chip
is Broadcom's own driver :-) And they do not account at all for such
a case of partial-packet TX completion indication.  If the first frag
is ACK'd they assume the whole packet has been taken.  Here is the
relevant code from the bcm5700 driver in LM_ServiceTxInterrupt():

    while(SwConIdx != HwConIdx)
    {
        pPacket = pDevice->SendRing[SwConIdx];
        pDevice->SendRing[SwConIdx] = 0;

        /* Set the return status. */
        pPacket->PacketStatus = LM_STATUS_SUCCESS;

        /* Put the packet in the TxPacketXmittedQ for indication later. */
        QQ_PushTail(&pDevice->TxPacketXmittedQ.Container, pPacket);

        /* Move to the next packet's BD. */
        SwConIdx = (SwConIdx + pPacket->u.Tx.FragCount) & 
            T3_SEND_RCB_ENTRY_COUNT_MASK;

        /* Update the number of unused BDs. */
        MM_ATOMIC_ADD(&pDevice->SendBdLeft, pPacket->u.Tx.FragCount);

        /* Get the new updated HwConIdx. */
        HwConIdx = pDevice->pStatusBlkVirt->Idx[0].SendConIdx;
    } /* while */

Imagine how badly this piece of code would fail if partial ACK'ing of
TX packets actually occurred.  It would loop past HwConIdx and thus
ACK really-not-completed packets, potentially colliding with what
the chip is transmitting and thus causing massive data corruption
and likely a crash.  Actually, it would most likely loop past all
valid TX packets and dereference a pPacket NULL pointer.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-24 17:06     ` David S. Miller
@ 2004-05-25  1:04       ` Greg Banks
  2004-05-25 17:51         ` David S. Miller
  2004-05-25 17:52         ` David S. Miller
  0 siblings, 2 replies; 24+ messages in thread
From: Greg Banks @ 2004-05-25  1:04 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, mchan

On Mon, May 24, 2004 at 10:06:34AM -0700, David S. Miller wrote:
> 
> The most relevant (and accurate) piece of documentation for the chip
> is Broadcom's own driver :-)

The Windows-flavoured driver you decided to replace in Linux?

> And they do not account at all for such
> a case of partial-packet TX completion indication.  If the first frag
> is ACK'd they assume the whole packet has been taken.  Here is the
> relevant code from the bcm5700 driver in LM_ServiceTxInterrupt():
> 
>     while(SwConIdx != HwConIdx)
>     {
>         pPacket = pDevice->SendRing[SwConIdx];
>         pDevice->SendRing[SwConIdx] = 0;
> 
>         /* Set the return status. */
>         pPacket->PacketStatus = LM_STATUS_SUCCESS;
> 
>         /* Put the packet in the TxPacketXmittedQ for indication later. */
>         QQ_PushTail(&pDevice->TxPacketXmittedQ.Container, pPacket);
> 
>         /* Move to the next packet's BD. */
>         SwConIdx = (SwConIdx + pPacket->u.Tx.FragCount) & 
>             T3_SEND_RCB_ENTRY_COUNT_MASK;
> 
>         /* Update the number of unused BDs. */
>         MM_ATOMIC_ADD(&pDevice->SendBdLeft, pPacket->u.Tx.FragCount);
> 
>         /* Get the new updated HwConIdx. */
>         HwConIdx = pDevice->pStatusBlkVirt->Idx[0].SendConIdx;
>     } /* while */
> 
> Imagine how badly this piece of code would fail if partial ACK'ing of
> TX packets actually occurred.  It would loop past HwConIdx and thus
> ACK really-not-completed packets, potentially colliding with what
> the chip is transmitting and thus causing massive data corruption
> and likely a crash.  Actually, it would most likely loop past all
> valid TX packets and dereference a pPacket NULL pointer.

I agree that this code appears to implicitly rely on always getting
complete send ring updates.

So has this driver been tested in 2.6?  I did notice that the bug
occurs far more frequently in 2.6 because better zero-copy code
means that the driver actually sees skbs with frags instead of just
the header.  The driver might be accidentally working because
pPacket->u.Tx.FragCount=1 during all its testing.

Also, I'm not familiar with this driver's source (I haven't looked
at it for a long time), but I can see that there are behaviour
differences between this driver and the tg3 driver which might affect
the visibility of the bug.

For one thing, this code fetches a new sample of the hardware consumer
index on every loop iteration.  This may result in accidentally
bumping up HwConIdx to avoid apparent partial completions.  Also note
that the queuing step (QQ_PushTail) might add enough delay to make
a second status block update more likely.

Without looking at the remainder of the driver, I can't say if this
code is called more or less frequently than tg3_tx().  If it's using
any interrupt coalescing at all it will be called much less frequently
and thus have a smaller window to see a partial complete.  On my
hardware I see interrupt rates of 10000-30000 per second per card,
and tracing shows that tg3_tx() is called on most of these interrupts.
It could be the case that the huge interrupt rate in the tg3 driver
is banging hard on a race condition which the bcom driver avoids.

Also, the NAPI code in the tg3 driver will presumably set up the
card's interrupt coalescing engine with different parameters, which
might have an effect on the timing of status block updates.

Finally, the tg3 driver departs from the recommended ISR flow control
diagram and handles a status block update during the ISR by using
the SETINT bit to tell the card to force a new interrupt (instead of
restarting the ISR).  This might also have an effect.

In short, even if the implicit assumptions in the bcom driver are
correct in that driver, I don't see how you can argue that they can
be carried across to the tg3 driver.

BTW, at least one other person has reported what appears to be
exactly this bug:

http://marc.theaimsgroup.com/?l=linux-kernel&m=102822850329939&w=2

He found the same bug in both bcom and tg3 drivers, and his hardware
has little in common with mine (apart from having >1 fast CPUs).

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-25  1:04       ` Greg Banks
@ 2004-05-25 17:51         ` David S. Miller
  2004-05-25 19:20           ` [PATCH] tg3 h/w flow control autoneg Arthur Kepner
  2004-05-26  0:12           ` [PATCH] fix BUG in tg3_tx Greg Banks
  2004-05-25 17:52         ` David S. Miller
  1 sibling, 2 replies; 24+ messages in thread
From: David S. Miller @ 2004-05-25 17:51 UTC (permalink / raw)
  To: Greg Banks; +Cc: netdev, mchan

On Tue, 25 May 2004 11:04:34 +1000
Greg Banks <gnb@sgi.com> wrote:

> I agree that this code appears to implictly rely on always getting
> complete send ring updates.

Greg, did you see Michael Chan's response?  A Broadcom engineer
is telling us "the hardware does not ACK partial TX packets."

I can't think of a more reliable source for this kind of information,
can you?  Given this, none of the differences you mention between
the tg3 and bcm5700 drivers matter; the hardware simply is
never supposed to do this, as stated by someone who has access to
the actual hardware engineers. :-)

I don't argue that you aren't seeing something strange, but perhaps
that is due to corruption occurring elsewhere, or perhaps something
peculiar about your system hardware (perhaps the PCI controller
mis-orders PCI transactions or something silly like that)?

Have you reproduced this on some system other than these huge SGI
ones?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH] tg3 h/w flow control autoneg
  2004-05-25 17:51         ` David S. Miller
@ 2004-05-25 19:20           ` Arthur Kepner
  2004-05-25 20:01             ` David S. Miller
  2004-05-26  0:12           ` [PATCH] fix BUG in tg3_tx Greg Banks
  1 sibling, 1 reply; 24+ messages in thread
From: Arthur Kepner @ 2004-05-25 19:20 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

[-- Attachment #1: Type: TEXT/PLAIN, Size: 348 bytes --]


We found that packets were mysteriously being discarded by
our Broadcom 570X hardware under heavy receive load. This
occurred especially when multiple NICs were in use.

The problem turned out to be that h/w flow control wasn't
being used, even after that should have been autonegotiated.
The attached patch (against 2.6.6) fixes that.

--

Arthur

[-- Attachment #2: tg3 h/w flow control autoneg patch --]
[-- Type: TEXT/PLAIN, Size: 924 bytes --]

--- linux.orig/drivers/net/tg3.c	2004-05-25 11:26:03.000000000 -0700
+++ linux-2.6.6/drivers/net/tg3.c	2004-05-25 11:38:13.000000000 -0700
@@ -1027,6 +1027,8 @@
 static void tg3_setup_flow_control(struct tg3 *tp, u32 local_adv, u32 remote_adv)
 {
 	u32 new_tg3_flags = 0;
+	u32 old_rx_mode = tp->rx_mode;
+	u32 old_tx_mode = tp->tx_mode;
 
 	if (local_adv & ADVERTISE_PAUSE_CAP) {
 		if (local_adv & ADVERTISE_PAUSE_ASYM) {
@@ -1057,10 +1059,18 @@
 	else
 		tp->rx_mode &= ~RX_MODE_FLOW_CTRL_ENABLE;
 
+	if (old_rx_mode != tp->rx_mode) {
+		tw32_f(MAC_RX_MODE, tp->rx_mode);
+	}
+	
 	if (new_tg3_flags & TG3_FLAG_TX_PAUSE)
 		tp->tx_mode |= TX_MODE_FLOW_CTRL_ENABLE;
 	else
 		tp->tx_mode &= ~TX_MODE_FLOW_CTRL_ENABLE;
+
+	if (old_tx_mode != tp->tx_mode) {
+		tw32_f(MAC_TX_MODE, tp->tx_mode);
+	}
 }
 
 static void tg3_aux_stat_to_speed_duplex(struct tg3 *tp, u32 val, u16 *speed, u8 *duplex)

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] tg3 h/w flow control autoneg
  2004-05-25 19:20           ` [PATCH] tg3 h/w flow control autoneg Arthur Kepner
@ 2004-05-25 20:01             ` David S. Miller
  0 siblings, 0 replies; 24+ messages in thread
From: David S. Miller @ 2004-05-25 20:01 UTC (permalink / raw)
  To: Arthur Kepner; +Cc: netdev

On Tue, 25 May 2004 12:20:05 -0700
Arthur Kepner <akepner@sgi.com> wrote:

> The problem turned out to be that h/w flow control wasn't
> being used, even after that should have been autonegotiated.
> The attached patch (against 2.6.6) fixes that.

What an embarrassing bug.  Patch applied, good spotting Arthur.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-25 17:51         ` David S. Miller
  2004-05-25 19:20           ` [PATCH] tg3 h/w flow control autoneg Arthur Kepner
@ 2004-05-26  0:12           ` Greg Banks
  1 sibling, 0 replies; 24+ messages in thread
From: Greg Banks @ 2004-05-26  0:12 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, mchan

On Tue, May 25, 2004 at 10:51:01AM -0700, David S. Miller wrote:
> On Tue, 25 May 2004 11:04:34 +1000
> Greg Banks <gnb@sgi.com> wrote:
> 
> > I agree that this code appears to implictly rely on always getting
> > complete send ring updates.
> 
> Greg, did you see Micahel Chan's response?  A Broadcom engineer
> is telling us "the hardware does not ACK partial TX packets."

Yes I did.  I've been working towards gathering data for a reply.

> I can't think of a more reliable source for this kind of information,
> can you?

I can think of one: actual observation of the card in action in the
field.  Experiment trumps theory.

To this end, I instrumented the driver + my patch to BUG() out if
the tx_ring_info.index is not a predicted value, i.e. if the tg3_tx()
ever starts partway through a packet.  It's been running overnight
under >200 MB/s of NFS read load, nothing yet.

> I don't argue that you aren't seeing something strange, but perhaps
> that is due to corruption occuring elsewhere, or perhaps something
> peculiar about your system hardware (perhaps the PCI controller
> mis-orders PCI transactions or something silly like that)?

There are many things peculiar about our hardware.  Otherwise we'd
be "the world stops at 4 processors" Dell.

> Have you reproduced this on some system other than these huge SGI
> ones?

I haven't tried; my job is first and foremost to make SGI hardware
work.  However I did point you to a report on lkml where someone on
non-SGI hardware has seen what appears to be the same problem.  I'm not
yet willing to consign this to the "wacky SGI PCI hardware" bucket.

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-25  1:04       ` Greg Banks
  2004-05-25 17:51         ` David S. Miller
@ 2004-05-25 17:52         ` David S. Miller
  1 sibling, 0 replies; 24+ messages in thread
From: David S. Miller @ 2004-05-25 17:52 UTC (permalink / raw)
  To: Greg Banks; +Cc: netdev, mchan


So, BTW, I'm not putting in a workaround for the partial-TX-packet
ACK in tg3_tx() until we figure out why this is happening for Greg.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] fix BUG in tg3_tx
@ 2004-05-24 17:26 Michael Chan
  0 siblings, 0 replies; 24+ messages in thread
From: Michael Chan @ 2004-05-24 17:26 UTC (permalink / raw)
  To: David S. Miller, Greg Banks; +Cc: netdev

David,

The producer index of completed tx packets in the status block will
always be at whole packet boundaries (1 + the index of the completed
packet's last fragment). Even if it is a TSO packet, it will be at the
boundaries of entire TSO packets.

Michael

> -----Original Message-----
> From: David S. Miller [mailto:davem@redhat.com] 
> Sent: Monday, May 24, 2004 10:07 AM
> To: Greg Banks
> Cc: netdev@oss.sgi.com; Michael Chan
> Subject: Re: [PATCH] fix BUG in tg3_tx
> 
> 
> 
> [ Michael, the discussion here is about whether the tigon3 hardware
>   ever partially ACK's completion of a multi-frag TX frame.  I
>   believe it never does, but Greg claims he can trigger such a case
>   and has proposed a patch to the tg3 driver which attempts 
> to handle that. ]
> 
> On Mon, 24 May 2004 18:04:31 +1000
> Greg Banks <gnb@sgi.com> wrote:
> 
> > On Mon, May 24, 2004 at 12:40:45AM -0700, David S. Miller wrote:
> > > On Mon, 24 May 2004 17:26:58 +1000
> > > Greg Banks <gnb@sgi.com> wrote:
> > > 
> > > > The tg3 transmit code assumes that tg3_tx() will never have to 
> > > > clean up part of an skb queued for transmit.  This 
> assumption is 
> > > > wrong;
> > > 
> > > Greg, perhaps my reading of the tg3 chip docs is different from 
> > > yours.  The hardware is NEVER supposed to do this.
> > 
> > I'd like to know where you read that, because neither I nor 
> any of the 
> > other SGI engineers who have read the Broadcom docs can 
> find any such 
> > guarantee.
> 
> The most relevant (and accurate) piece of documentation for 
> the chip is Broadcom's own driver :-) And they do not account 
> at all for such a case of partial-packet TX completion 
> indication.  If the first frag is ACK'd they assume the whole 
> packet has been taken.  Here is the relevant code from the 
> bcm5700 driver in LM_ServiceTxInterrupt():
> 
>     while(SwConIdx != HwConIdx)
>     {
>         pPacket = pDevice->SendRing[SwConIdx];
>         pDevice->SendRing[SwConIdx] = 0;
> 
>         /* Set the return status. */
>         pPacket->PacketStatus = LM_STATUS_SUCCESS;
> 
>         /* Put the packet in the TxPacketXmittedQ for 
> indication later. */
>         QQ_PushTail(&pDevice->TxPacketXmittedQ.Container, pPacket);
> 
>         /* Move to the next packet's BD. */
>         SwConIdx = (SwConIdx + pPacket->u.Tx.FragCount) & 
>             T3_SEND_RCB_ENTRY_COUNT_MASK;
> 
>         /* Update the number of unused BDs. */
>         MM_ATOMIC_ADD(&pDevice->SendBdLeft, pPacket->u.Tx.FragCount);
> 
>         /* Get the new updated HwConIdx. */
>         HwConIdx = pDevice->pStatusBlkVirt->Idx[0].SendConIdx;
>     } /* while */
> 
> Imagine how badly this piece of code would fail if partial 
> ACK'ing of TX packets actually occurred.  It would loop past 
> HwConIdx and thus ACK really-not-completed packets, 
> potentially colliding with what the chip is transmitting and 
> thus causing massive data corruption and likely a crash.  
> Actually, it would most likely loop past all valid TX packets 
> and dereference a pPacket NULL pointer.
> 
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] fix BUG in tg3_tx
@ 2004-05-24 17:33 Michael Chan
  0 siblings, 0 replies; 24+ messages in thread
From: Michael Chan @ 2004-05-24 17:33 UTC (permalink / raw)
  To: David S. Miller, Greg Banks; +Cc: netdev

> David,
> 
> The producer index of completed tx packets in the status 
> block will always be at whole packet boundaries (1 + the 
> index of the completed packet's last fragment). Even if it is 
> a TSO packet, it will be at the boundaries of entire TSO packets.
> 
> Michael
> 

Minor correction - I should have said the send BD consumer index instead
of the producer index.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] fix BUG in tg3_tx
@ 2004-05-25 20:04 Michael Chan
  2004-05-26  0:54 ` Greg Banks
  2004-05-26 16:04 ` Greg Banks
  0 siblings, 2 replies; 24+ messages in thread
From: Michael Chan @ 2004-05-25 20:04 UTC (permalink / raw)
  To: David S. Miller, Greg Banks; +Cc: netdev

> Greg, did you see Micahel Chan's response?  A Broadcom 
> engineer is telling us "the hardware does not ACK partial TX packets."
> 
That's right. The hw is designed to always complete tx packets on packet
boundaries and not BD boundaries. The send data completion state machine
will create 1 single dma descriptor and 1 host coalescing descriptor for
the entire packet. All of our drivers do not handle individual BD
completions and I'm not aware of any problems caused by this. Actually
we did see some partial packet completions during the early
implementations of TSO/LSO. But those were firmware issues and have been
fixed a long time ago. tg3 is not using that early TSO firmware.

> I don't argue that you aren't seeing something strange, but 
> perhaps that is due to corruption occuring elsewhere, or 
> perhaps something peculiar about your system hardware 
> (perhaps the PCI controller mis-orders PCI transactions or 
> something silly like that)?
Good point. A few years ago we saw cases where there were tx completions
on BDs that had not been sent. It turned out that on that machine, the
chipset was re-ordering the posted mmio writes to the send mailbox
register from 2 CPUs. For example, CPU 1 wrote index 1 and CPU 2 wrote
index 2 a little later. On the PCI bus, we saw memory write of 2
followed by 1. When the chip saw 2, it would send both packets. When it
later saw 1, it thought that there were 512 new tx BDs and went ahead to
send them. The only effective workaround for this chipset problem was a
read of the send mailbox after the write to flush it.

Michael

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-25 20:04 Michael Chan
@ 2004-05-26  0:54 ` Greg Banks
  2004-05-26 16:04 ` Greg Banks
  1 sibling, 0 replies; 24+ messages in thread
From: Greg Banks @ 2004-05-26  0:54 UTC (permalink / raw)
  To: Michael Chan; +Cc: David S. Miller, netdev

On Tue, May 25, 2004 at 01:04:24PM -0700, Michael Chan wrote:
> > Greg, did you see Michael Chan's response?  A Broadcom 
> > engineer is telling us "the hardware does not ACK partial TX packets."
> > 
> That's right. The hw is designed to always complete tx packets on packet
> boundaries and not BD boundaries. The send data completion state machine
> will create 1 single dma descriptor and 1 host coalescing descriptor for
> the entire packet. All of our drivers do not handle individual BD
> completions and I'm not aware of any problems caused by this. Actually
> we did see some partial packet completions during the early
> implementations of TSO/LSO. But those were firmware issues and have been
> fixed a long time ago. tg3 is not using that early TSO firmware.

I believe the SGI-branded cards ship with firmware fixes beyond simply
changing the PCI ids.  Also, AFAIK it dates from about the time of the
TSO experiments.  Can you check if that firmware has the issue you
describe?

> > I don't argue that you aren't seeing something strange, but 
> > perhaps that is due to corruption occurring elsewhere, or 
> > perhaps something peculiar about your system hardware 
> > (perhaps the PCI controller mis-orders PCI transactions or 
> > something silly like that)?
> Good point. A few years ago we saw cases where there were tx completions
> on BDs that had not been sent. It turned out that on that machine, the
> chipset was re-ordering the posted mmio writes to the send mailbox
> register from 2 CPUs. For example, CPU 1 wrote index 1 and CPU 2 wrote
> index 2 a little later. On the PCI bus, we saw memory write of 2
> followed by 1. When the chip saw 2, it would send both packets. When it
> later saw 1, it thought that there were 512 new tx BDs and went ahead to
> send them. The only effective workaround for this chipset problem was a
> read of the send mailbox after the write to flush it.

The tg3 driver already does this if the TG3_FLAG_MBOX_WRITE_REORDER
flag is set in tp->tg3_flags.  There's been some discussion inside
SGI about that behaviour.  In short, our PCI hardware is susceptible
to PIO write reordering, but experiment has shown that enabling that
flag results in an unacceptable throughput degradation (about 10%).

I have also noticed that under significant load the softirq portion
of the driver gets scheduled on other CPUs than the interrupt CPU,
including CPUs in other NUMA nodes.

This sounds like a theory I can test.

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-25 20:04 Michael Chan
  2004-05-26  0:54 ` Greg Banks
@ 2004-05-26 16:04 ` Greg Banks
  2004-05-26 18:01   ` David S. Miller
  1 sibling, 1 reply; 24+ messages in thread
From: Greg Banks @ 2004-05-26 16:04 UTC (permalink / raw)
  To: Michael Chan; +Cc: David S. Miller, netdev

On Tue, May 25, 2004 at 01:04:24PM -0700, Michael Chan wrote:
> [...] A few years ago we saw cases where there were tx completions
> on BDs that had not been sent. It turned out that on that machine, the
> chipset was re-ordering the posted mmio writes to the send mailbox
> register from 2 CPUs.[...]

On a related note, is there a good reason why the tg3 driver uses
the on-chip SRAM send ring by default instead of the host send ring?
This seems like it would dramatically increase the PIO load on the
chipset for some of the workloads I'm interested in.

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-26 16:04 ` Greg Banks
@ 2004-05-26 18:01   ` David S. Miller
  2004-05-26 23:47     ` Greg Banks
  0 siblings, 1 reply; 24+ messages in thread
From: David S. Miller @ 2004-05-26 18:01 UTC (permalink / raw)
  To: Greg Banks; +Cc: mchan, netdev

On Thu, 27 May 2004 02:04:43 +1000
Greg Banks <gnb@sgi.com> wrote:

> On a related note, is there a good reason why the tg3 driver uses
> the on-chip SRAM send ring by default instead of the host send ring?
> This seems like it would dramatically increase the PIO load on the
> chipset for some of the workloads I'm interested in.

Good question.

It actually results in better performance to use PIOs to the
chip to write the TXD descriptors.  You may be skeptical about
this but it cannot be denied that it does result in lower
latency as we don't have to wait for the chip to do its next
prefetch and _furthermore_ this means that no CPU cache lines
will bounce from cpu-->device in order to get the descriptors
to the chip.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-26 18:01   ` David S. Miller
@ 2004-05-26 23:47     ` Greg Banks
  2004-05-26 23:52       ` David S. Miller
  0 siblings, 1 reply; 24+ messages in thread
From: Greg Banks @ 2004-05-26 23:47 UTC (permalink / raw)
  To: David S. Miller; +Cc: mchan, netdev

On Wed, May 26, 2004 at 11:01:21AM -0700, David S. Miller wrote:
> On Thu, 27 May 2004 02:04:43 +1000
> Greg Banks <gnb@sgi.com> wrote:
> 
> > [...] is there a good reason why the tg3 driver uses
> > the on-chip SRAM send ring by default instead of the host send
> > ring?[...]
> 
> It actually results in better performance to use PIOs to the
> chip to write the TXD descriptors.  You may be skeptical about
> this but it cannot be denied that it does result in lower
> latency as we don't have to wait for the chip to do its next
> prefetch and _furthermore_ this means that no CPU cache lines
> will bounce from cpu-->device in order to get the descriptors
> to the chip.

Actually I am skeptical.  I suspect the performance difference
is dependent on chipset and load.

In the case I'm looking at (multiple NIC NFS read loads) there would be
7 to 10 32-bit PIOs emitted per call to tg3_start_xmit.  With 3 NICs'
worth of near line-rate traffic going through one chipset, that's a
lot of PIOs.  The scaling work we're doing will require 2 to 3 times
more traffic than this.  For this kind of load the latency cost may
be worth the efficiency gain for the chipset.

If we can show a performance improvement on our hardware, would you
accept a patch to enable host send rings on our hardware only?

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-26 23:47     ` Greg Banks
@ 2004-05-26 23:52       ` David S. Miller
  2004-05-27  0:12         ` Greg Banks
  0 siblings, 1 reply; 24+ messages in thread
From: David S. Miller @ 2004-05-26 23:52 UTC (permalink / raw)
  To: Greg Banks; +Cc: mchan, netdev

On Thu, 27 May 2004 09:47:33 +1000
Greg Banks <gnb@sgi.com> wrote:

> If we can show a performance improvement on our hardware, would you
> accept a patch to enable host send rings on our hardware only?

Did you read my other email?  I already made the change globally
in my sources.  tg3 will always use host send rings now.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-26 23:52       ` David S. Miller
@ 2004-05-27  0:12         ` Greg Banks
  0 siblings, 0 replies; 24+ messages in thread
From: Greg Banks @ 2004-05-27  0:12 UTC (permalink / raw)
  To: David S. Miller; +Cc: mchan, netdev

On Wed, May 26, 2004 at 04:52:38PM -0700, David S. Miller wrote:
> On Thu, 27 May 2004 09:47:33 +1000
> Greg Banks <gnb@sgi.com> wrote:
> 
> > If we can show a performance improvement on our hardware, would you
> > accept a patch to enable host send rings on our hardware only?
> 
> Did you read my other email?  I already made the change globally
> in my sources.  tg3 will always use host send rings now.

Sorry, I'm using a crappy text mail tool over dialup at the moment
which makes it hard to scan ahead for complete state before replying.

I'm very happy about the change, thanks!

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] fix BUG in tg3_tx
@ 2004-05-26  1:22 Michael Chan
  0 siblings, 0 replies; 24+ messages in thread
From: Michael Chan @ 2004-05-26  1:22 UTC (permalink / raw)
  To: Greg Banks; +Cc: David S. Miller, netdev


> 
> I believe the SGI-branded cards ship with firmware fixes 
> beyond simply changing the PCI ids.  Also, AFAIK it dates 
> from about the time of the TSO experiments.  Can you check if 
> that firmware has the issue you describe?
> 

TSO firmware is downloaded by the driver and not shipped with the card.
tg3 is using TSO firmware version 1.4.0 which is the latest for 5703 and
5704.

Michael

^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] fix BUG in tg3_tx
@ 2004-05-26 17:43 Michael Chan
  2004-05-26 18:47 ` David S. Miller
  2004-05-26 23:50 ` Greg Banks
  0 siblings, 2 replies; 24+ messages in thread
From: Michael Chan @ 2004-05-26 17:43 UTC (permalink / raw)
  To: Greg Banks; +Cc: David S. Miller, netdev

> On a related note, is there a good reason why the tg3 driver 
> uses the on-chip SRAM send ring by default instead of the 
> host send ring? This seems like it would dramatically 
> increase the PIO load on the chipset for some of the 
> workloads I'm interested in.
> 

I can only speak for the Broadcom bcm5700 driver. We used to use NIC
send BDs by default before zero copy transmit and TSO were implemented
in the kernel. Using only one BD per packet at that time, we found that
performance on some machines was sometimes slightly better. Especially
with logic to save some PIO when some of the fields in the BD have not
changed. The driver has now been changed to use host send BDs to perform
better with zero copy and especially TSO where you may need many BDs per
packet. I would recommend tg3 to make the switch also.

Michael

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-26 17:43 Michael Chan
@ 2004-05-26 18:47 ` David S. Miller
  2004-05-26 23:52   ` Greg Banks
  2004-05-26 23:50 ` Greg Banks
  1 sibling, 1 reply; 24+ messages in thread
From: David S. Miller @ 2004-05-26 18:47 UTC (permalink / raw)
  To: Michael Chan; +Cc: gnb, netdev

On Wed, 26 May 2004 10:43:10 -0700
"Michael Chan" <mchan@broadcom.com> wrote:

> I would recommend tg3 to make the switch also.

Ok, I'll make it use DMA tx descriptors by default.

The checking for cases where we can't do on-chip TX descriptors
I'll keep around (albeit commented out) just in case we allow
this again in the future.

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/05/26 11:46:50-07:00 davem@nuts.davemloft.net 
#   [TG3]: Use HOST TXDs always.
# 
# drivers/net/tg3.c
#   2004/05/26 11:46:42-07:00 davem@nuts.davemloft.net +9 -0
#   [TG3]: Use HOST TXDs always.
# 
diff -Nru a/drivers/net/tg3.c b/drivers/net/tg3.c
--- a/drivers/net/tg3.c	2004-05-26 11:46:57 -07:00
+++ b/drivers/net/tg3.c	2004-05-26 11:46:57 -07:00
@@ -7534,6 +7534,14 @@
 	udelay(50);
 	tg3_nvram_init(tp);
 
+	/* Always use host TXDs, it performs better in particular
+	 * with multi-frag packets.  The tests below are kept here
+	 * as documentation should we change this decision again
+	 * in the future.
+	 */
+	tp->tg3_flags |= TG3_FLAG_HOST_TXDS;
+
+#if 0
 	/* Determine if TX descriptors will reside in
 	 * main memory or in the chip SRAM.
 	 */
@@ -7541,6 +7549,7 @@
 	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5705 ||
 	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5750)
 		tp->tg3_flags |= TG3_FLAG_HOST_TXDS;
+#endif
 
 	grc_misc_cfg = tr32(GRC_MISC_CFG);
 	grc_misc_cfg &= GRC_MISC_CFG_BOARD_ID_MASK;

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-26 18:47 ` David S. Miller
@ 2004-05-26 23:52   ` Greg Banks
  0 siblings, 0 replies; 24+ messages in thread
From: Greg Banks @ 2004-05-26 23:52 UTC (permalink / raw)
  To: David S. Miller; +Cc: Michael Chan, netdev

On Wed, May 26, 2004 at 11:47:14AM -0700, David S. Miller wrote:
> On Wed, 26 May 2004 10:43:10 -0700
> "Michael Chan" <mchan@broadcom.com> wrote:
> 
> > I would recommend tg3 to make the switch also.
> 
> Ok, I'll make it use DMA tx descriptors by default.

Cool!

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] fix BUG in tg3_tx
  2004-05-26 17:43 Michael Chan
  2004-05-26 18:47 ` David S. Miller
@ 2004-05-26 23:50 ` Greg Banks
  1 sibling, 0 replies; 24+ messages in thread
From: Greg Banks @ 2004-05-26 23:50 UTC (permalink / raw)
  To: Michael Chan; +Cc: David S. Miller, netdev

On Wed, May 26, 2004 at 10:43:10AM -0700, Michael Chan wrote:
> 
> > [...] is there a good reason why the tg3 driver 
> > uses the on-chip SRAM send ring by default instead of the 
> > host send ring?[...]
> 
> I can only speak for the Broadcom bcm5700 driver. We used to use NIC
> send BDs by default before zero copy transmit and TSO were implemented
> in the kernel. Using only one BD per packet at that time, we found that
> performance on some machines was sometimes slightly better. Especially
> with logic to save some PIO when some of the fields in the BD have not
> changed. The driver has now been changed to use host send BDs to perform
> better with zero copy and especially TSO where you may need many BDs per
> packet. I would recommend tg3 to make the switch also.

Ah, it's precisely the zero copy case I'm interested in.  I've measured
2 to 3 BDs per packet under my load.

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2004-05-27  0:12 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-05-24  7:26 [PATCH] fix BUG in tg3_tx Greg Banks
2004-05-24  7:40 ` David S. Miller
2004-05-24  8:04   ` Greg Banks
2004-05-24 17:06     ` David S. Miller
2004-05-25  1:04       ` Greg Banks
2004-05-25 17:51         ` David S. Miller
2004-05-25 19:20           ` [PATCH] tg3 h/w flow control autoneg Arthur Kepner
2004-05-25 20:01             ` David S. Miller
2004-05-26  0:12           ` [PATCH] fix BUG in tg3_tx Greg Banks
2004-05-25 17:52         ` David S. Miller
  -- strict thread matches above, loose matches on Subject: below --
2004-05-24 17:26 Michael Chan
2004-05-24 17:33 Michael Chan
2004-05-25 20:04 Michael Chan
2004-05-26  0:54 ` Greg Banks
2004-05-26 16:04 ` Greg Banks
2004-05-26 18:01   ` David S. Miller
2004-05-26 23:47     ` Greg Banks
2004-05-26 23:52       ` David S. Miller
2004-05-27  0:12         ` Greg Banks
2004-05-26  1:22 Michael Chan
2004-05-26 17:43 Michael Chan
2004-05-26 18:47 ` David S. Miller
2004-05-26 23:52   ` Greg Banks
2004-05-26 23:50 ` Greg Banks

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).