netdev.vger.kernel.org archive mirror
* [PATCH] Fix e100 on systems that have cache incoherent DMA
@ 2007-11-08 18:17 Auke Kok
  2007-11-28 19:12 ` David Acker
  0 siblings, 1 reply; 28+ messages in thread
From: Auke Kok @ 2007-11-08 18:17 UTC (permalink / raw)
  To: jeff, akpm
  Cc: netdev, dacker, auke-jan.h.kok, jesse.brandeburg, miltonm,
	e1000-devel

From: David Acker <dacker@roinet.com>

On systems with cache-incoherent DMA, including ARM, there is a race
condition between software allocating a new receive buffer and
hardware writing into a buffer.  The two race on touching the last
Receive Frame Descriptor (RFD), which has its el-bit set and its next
link equal to 0.  When hardware encounters this buffer it attempts to
write data to it and then update the Status Word bits and Actual
Count in the RFD.  At the same time software may try to clear the
el-bit and set the link address to a new buffer.

Since the entire RFD fits in one cache line, the two write operations
can collide.  This can lead to the receive unit stalling or
interpreting random memory as its receive area.
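
For reference, the RFD is a small structure; a sketch of its layout,
mirroring the 8255x receive frame descriptor format that struct rfd in
e100.c follows (the driver source is the authoritative definition),
shows why the hardware-written and software-written fields share a
cache line:

	struct rfd {
		u16 status;		/* hardware writes Status Word bits here */
		u16 command;		/* software sets/clears the el-bit here */
		u32 link;		/* software writes the next RFD's address here */
		u32 rbd;		/* unused, left as 0xFFFFFFFF */
		u16 actual_size;	/* hardware writes the Actual Count here */
		u16 size;		/* software writes the buffer size here */
	};

At 16 bytes the whole descriptor always sits within a single cache
line, so a hardware write of status/actual_size and a software
write-back of the dirty line holding command/link/size can clobber
each other when DMA is not cache coherent.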

The fix is to set the el-bit and a size of 0 on the next-to-last
buffer in the chain.  When the hardware encounters this buffer it
stops and does not write to it at all.  The hardware issues an RNR
interrupt with the receive unit in the No Resources state.  Software
can write to the tail of the list because it knows hardware will stop
on the previous descriptor that was marked as the end of list.

Once software has a new next-to-last buffer prepared, it can clear
the el-bit and set the size on the previous one.  The race on this
buffer is safe since the link already points to a valid next buffer
and software can handle the race when setting the size (assuming
aligned 16-bit writes are atomic with respect to the DMA read).  If
the hardware sees the el-bit cleared without the size set, it will
skip this buffer and move on to the next one.  If it sees the size
set but the el-bit still set, it will complete that buffer and then
issue an RNR interrupt and wait.
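
The write ordering is what keeps this safe: the new stopping point is
armed before the old one is released, and the el-bit clear is flushed
to the device before the size is restored.  A condensed sketch of that
sequence, mirroring the e100_rx_clean() hunk in the patch below
(surrounding loop and bookkeeping omitted):

	/* Arm the new stopping point first: with size 0 and EL set,
	 * hardware stops on this RFD without writing to it at all. */
	new_before_last_rfd->size = 0;
	new_before_last_rfd->command |= cpu_to_le16(cb_el);
	pci_dma_sync_single_for_device(nic->pdev,
		new_before_last_rx->dma_addr, sizeof(struct rfd),
		PCI_DMA_TODEVICE);

	/* Only then release the old stopping point: clear EL and flush,
	 * then restore the size and flush again.  Either outcome of the
	 * race is safe: EL clear with size still 0 makes hardware skip
	 * this RFD; size restored while EL is still visible makes it
	 * complete the RFD and then RNR. */
	old_before_last_rfd->command &= ~cpu_to_le16(cb_el);
	pci_dma_sync_single_for_device(nic->pdev,
		old_before_last_rx->dma_addr, sizeof(struct rfd),
		PCI_DMA_TODEVICE);
	old_before_last_rfd->size = cpu_to_le16(VLAN_ETH_FRAME_LEN);
	pci_dma_sync_single_for_device(nic->pdev,
		old_before_last_rx->dma_addr, sizeof(struct rfd),
		PCI_DMA_TODEVICE);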

Signed-off-by: David Acker <dacker@roinet.com>
Signed-off-by: Auke Kok <auke-jan.h.kok@intel.com>
---

 drivers/net/e100.c |  128 ++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 99 insertions(+), 29 deletions(-)

diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 3dbaec6..2153058 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -106,6 +106,13 @@
  *	the RFD, the RFD must be dma_sync'ed to maintain a consistent
  *	view from software and hardware.
  *
+ *	In order to keep updates to the RFD link field from colliding with
+ *	hardware writes to mark packets complete, we use the feature that
+ *	hardware will not write to a size 0 descriptor and mark the previous
+ *	packet as end-of-list (EL).   After updating the link, we remove EL
+ *	and only then restore the size such that hardware may use the
+ *	previous-to-end RFD.
+ *
  *	Under typical operation, the  receive unit (RU) is start once,
  *	and the controller happily fills RFDs as frames arrive.  If
  *	replacement RFDs cannot be allocated, or the RU goes non-active,
@@ -281,6 +288,7 @@ struct csr {
 };
 
 enum scb_status {
+	rus_no_res       = 0x08,
 	rus_ready        = 0x10,
 	rus_mask         = 0x3C,
 };
@@ -952,7 +960,7 @@ static void e100_get_defaults(struct nic *nic)
 		((nic->mac >= mac_82558_D101_A4) ? cb_cid : cb_i));
 
 	/* Template for a freshly allocated RFD */
-	nic->blank_rfd.command = cpu_to_le16(cb_el);
+	nic->blank_rfd.command = 0;
 	nic->blank_rfd.rbd = 0xFFFFFFFF;
 	nic->blank_rfd.size = cpu_to_le16(VLAN_ETH_FRAME_LEN);
 
@@ -1791,15 +1799,12 @@ static int e100_rx_alloc_skb(struct nic *nic, struct rx *rx)
 	}
 
 	/* Link the RFD to end of RFA by linking previous RFD to
-	 * this one, and clearing EL bit of previous.  */
+	 * this one.  We are safe to touch the previous RFD because
+	 * it is protected by the before last buffer's el bit being set */
 	if(rx->prev->skb) {
 		struct rfd *prev_rfd = (struct rfd *)rx->prev->skb->data;
 		put_unaligned(cpu_to_le32(rx->dma_addr),
 			(u32 *)&prev_rfd->link);
-		wmb();
-		prev_rfd->command &= ~cpu_to_le16(cb_el);
-		pci_dma_sync_single_for_device(nic->pdev, rx->prev->dma_addr,
-			sizeof(struct rfd), PCI_DMA_TODEVICE);
 	}
 
 	return 0;
@@ -1824,8 +1829,19 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx,
 	DPRINTK(RX_STATUS, DEBUG, "status=0x%04X\n", rfd_status);
 
 	/* If data isn't ready, nothing to indicate */
-	if(unlikely(!(rfd_status & cb_complete)))
+	if (unlikely(!(rfd_status & cb_complete))) {
+		/* If the next buffer has the el bit, but we think the receiver
+		 * is still running, check to see if it really stopped while
+		 * we had interrupts off.
+		 * This allows for a fast restart without re-enabling
+		 * interrupts */
+		if ((le16_to_cpu(rfd->command) & cb_el) &&
+		    (RU_RUNNING == nic->ru_running))
+
+			if (readb(&nic->csr->scb.status) & rus_no_res)
+				nic->ru_running = RU_SUSPENDED;
 		return -ENODATA;
+	}
 
 	/* Get actual data size */
 	actual_size = le16_to_cpu(rfd->actual_size) & 0x3FFF;
@@ -1836,9 +1852,18 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx,
 	pci_unmap_single(nic->pdev, rx->dma_addr,
 		RFD_BUF_LEN, PCI_DMA_FROMDEVICE);
 
-	/* this allows for a fast restart without re-enabling interrupts */
-	if(le16_to_cpu(rfd->command) & cb_el)
+	/* If this buffer has the el bit, but we think the receiver
+	 * is still running, check to see if it really stopped while
+	 * we had interrupts off.
+	 * This allows for a fast restart without re-enabling interrupts.
+	 * This can happen when the RU sees the size change but also sees
+	 * the el bit set. */
+	if ((le16_to_cpu(rfd->command) & cb_el) &&
+	    (RU_RUNNING == nic->ru_running)) {
+
+	    if (readb(&nic->csr->scb.status) & rus_no_res)
 		nic->ru_running = RU_SUSPENDED;
+	}
 
 	/* Pull off the RFD and put the actual data (minus eth hdr) */
 	skb_reserve(skb, sizeof(struct rfd));
@@ -1870,31 +1895,30 @@ static void e100_rx_clean(struct nic *nic, unsigned int *work_done,
 	unsigned int work_to_do)
 {
 	struct rx *rx;
-	int restart_required = 0;
-	struct rx *rx_to_start = NULL;
-
-	/* are we already rnr? then pay attention!!! this ensures that
-	 * the state machine progression never allows a start with a
-	 * partially cleaned list, avoiding a race between hardware
-	 * and rx_to_clean when in NAPI mode */
-	if(RU_SUSPENDED == nic->ru_running)
-		restart_required = 1;
+	int restart_required = 0, err = 0;
+	struct rx *old_before_last_rx, *new_before_last_rx;
+	struct rfd *old_before_last_rfd, *new_before_last_rfd;
 
 	/* Indicate newly arrived packets */
 	for(rx = nic->rx_to_clean; rx->skb; rx = nic->rx_to_clean = rx->next) {
-		int err = e100_rx_indicate(nic, rx, work_done, work_to_do);
-		if(-EAGAIN == err) {
-			/* hit quota so have more work to do, restart once
-			 * cleanup is complete */
-			restart_required = 0;
+		err = e100_rx_indicate(nic, rx, work_done, work_to_do);
+		/* Hit quota or no more to clean */
+		if (-EAGAIN == err || -ENODATA == err)
 			break;
-		} else if(-ENODATA == err)
-			break; /* No more to clean */
 	}
 
-	/* save our starting point as the place we'll restart the receiver */
-	if(restart_required)
-		rx_to_start = nic->rx_to_clean;
+
+	/* On EAGAIN, hit quota so have more work to do, restart once
+	 * cleanup is complete.
+	 * Else, are we already rnr? then pay attention!!! this ensures that
+	 * the state machine progression never allows a start with a
+	 * partially cleaned list, avoiding a race between hardware
+	 * and rx_to_clean when in NAPI mode */
+	if (-EAGAIN != err && RU_SUSPENDED == nic->ru_running)
+		restart_required = 1;
+
+	old_before_last_rx = nic->rx_to_use->prev->prev;
+	old_before_last_rfd = (struct rfd *)old_before_last_rx->skb->data;
 
 	/* Alloc new skbs to refill list */
 	for(rx = nic->rx_to_use; !rx->skb; rx = nic->rx_to_use = rx->next) {
@@ -1902,10 +1926,42 @@ static void e100_rx_clean(struct nic *nic, unsigned int *work_done,
 			break; /* Better luck next time (see watchdog) */
 	}
 
+	new_before_last_rx = nic->rx_to_use->prev->prev;
+	if (new_before_last_rx != old_before_last_rx) {
+		/* Set the el-bit on the buffer that is before the last buffer.
+		 * This lets us update the next pointer on the last buffer
+		 * without worrying about hardware touching it.
+		 * We set the size to 0 to prevent hardware from touching this
+		 * buffer.
+		 * When the hardware hits the before last buffer with el-bit
+		 * and size of 0, it will RNR interrupt, the RUS will go into
+		 * the No Resources state.  It will not complete nor write to
+		 * this buffer. */
+		new_before_last_rfd =
+			(struct rfd *)new_before_last_rx->skb->data;
+		new_before_last_rfd->size = 0;
+		new_before_last_rfd->command |= cpu_to_le16(cb_el);
+		pci_dma_sync_single_for_device(nic->pdev,
+			new_before_last_rx->dma_addr, sizeof(struct rfd),
+			PCI_DMA_TODEVICE);
+
+		/* Now that we have a new stopping point, we can clear the old
+		 * stopping point.  We must sync twice to get the proper
+		 * ordering on the hardware side of things. */
+		old_before_last_rfd->command &= ~cpu_to_le16(cb_el);
+		pci_dma_sync_single_for_device(nic->pdev,
+			old_before_last_rx->dma_addr, sizeof(struct rfd),
+			PCI_DMA_TODEVICE);
+		old_before_last_rfd->size = cpu_to_le16(VLAN_ETH_FRAME_LEN);
+		pci_dma_sync_single_for_device(nic->pdev,
+			old_before_last_rx->dma_addr, sizeof(struct rfd),
+			PCI_DMA_TODEVICE);
+	}
+
 	if(restart_required) {
 		// ack the rnr?
 		writeb(stat_ack_rnr, &nic->csr->scb.stat_ack);
-		e100_start_receiver(nic, rx_to_start);
+		e100_start_receiver(nic, nic->rx_to_clean);
 		if(work_done)
 			(*work_done)++;
 	}
@@ -1937,6 +1993,7 @@ static int e100_rx_alloc_list(struct nic *nic)
 {
 	struct rx *rx;
 	unsigned int i, count = nic->params.rfds.count;
+	struct rfd *before_last;
 
 	nic->rx_to_use = nic->rx_to_clean = NULL;
 	nic->ru_running = RU_UNINITIALIZED;
@@ -1952,6 +2009,19 @@ static int e100_rx_alloc_list(struct nic *nic)
 			return -ENOMEM;
 		}
 	}
+	/* Set the el-bit on the buffer that is before the last buffer.
+	 * This lets us update the next pointer on the last buffer without
+	 * worrying about hardware touching it.
+	 * We set the size to 0 to prevent hardware from touching this buffer.
+	 * When the hardware hits the before last buffer with el-bit and size
+	 * of 0, it will RNR interrupt, the RU will go into the No Resources
+	 * state.  It will not complete nor write to this buffer. */
+	rx = nic->rxs->prev->prev;
+	before_last = (struct rfd *)rx->skb->data;
+	before_last->command |= cpu_to_le16(cb_el);
+	before_last->size = 0;
+	pci_dma_sync_single_for_device(nic->pdev, rx->dma_addr,
+		sizeof(struct rfd), PCI_DMA_TODEVICE);
 
 	nic->rx_to_use = nic->rx_to_clean = nic->rxs;
 	nic->ru_running = RU_SUSPENDED;

* [PATCH] Fix e100 on systems that have cache incoherent DMA
@ 2007-11-02 13:27 David Acker
  2007-11-02 16:05 ` Kok, Auke
  2007-11-06 17:01 ` Kok, Auke
  0 siblings, 2 replies; 28+ messages in thread
From: David Acker @ 2007-11-02 13:27 UTC (permalink / raw)
  To: Auke Kok
  Cc: e1000-devel, netdev, Jesse Brandeburg, Milton Miller,
	Scott Feldman, John Ronciak, Jeff Kirsher, Jeff Garzik

On systems with cache-incoherent DMA, including ARM, there is a race condition
between software allocating a new receive buffer and hardware writing into a
buffer.  The two race on touching the last Receive Frame Descriptor (RFD),
which has its el-bit set and its next link equal to 0.  When hardware
encounters this buffer it attempts to write data to it and then update the
Status Word bits and Actual Count in the RFD.  At the same time software may
try to clear the el-bit and set the link address to a new buffer.

Since the entire RFD fits in one cache line, the two write operations can
collide.  This can lead to the receive unit stalling or interpreting random
memory as its receive area.

The fix is to set the el-bit and a size of 0 on the next-to-last buffer in
the chain.  When the hardware encounters this buffer it stops and does not
write to it at all.  The hardware issues an RNR interrupt with the receive
unit in the No Resources state.  Software can write to the tail of the list
because it knows hardware will stop on the previous descriptor that was
marked as the end of list.

Once software has a new next-to-last buffer prepared, it can clear the el-bit
and set the size on the previous one.  The race on this buffer is safe since
the link already points to a valid next buffer and software can handle the
race when setting the size (assuming aligned 16-bit writes are atomic with
respect to the DMA read).  If the hardware sees the el-bit cleared without
the size set, it will skip this buffer and move on to the next one.  If it
sees the size set but the el-bit still set, it will complete that buffer and
then issue an RNR interrupt and wait.


This is a patch for 2.6.24-rc1.

Signed-off-by: David Acker <dacker@roinet.com>

---

This version is based on the simpler patch I did in May.  The algorithm I
tried after that never worked correctly under load: it would sometimes hang
the RU and the transmit unit, and if the card was restarted it would often
crash the system with memory corruption.  This patch was tested on my
embedded system using pktgen, with it sending while a PC sent at it.  I also
ran it as a wireless access point with a 12-hour bidirectional 20 Mbps UDP
stream going between an Ethernet host on the e100 and a wireless client.

--- linux-2.6.24-rc1/drivers/net/e100.c.orig	2007-11-01 11:42:35.000000000 -0400
+++ linux-2.6.24-rc1/drivers/net/e100.c	2007-11-02 09:09:47.000000000 -0400
@@ -106,6 +106,13 @@
  *	the RFD, the RFD must be dma_sync'ed to maintain a consistent
  *	view from software and hardware.
  *
+ *	In order to keep updates to the RFD link field from colliding with
+ *	hardware writes to mark packets complete, we use the feature that
+ *	hardware will not write to a size 0 descriptor and mark the previous
+ *	packet as end-of-list (EL).   After updating the link, we remove EL
+ *	and only then restore the size such that hardware may use the
+ *	previous-to-end RFD.
+ *
  *	Under typical operation, the  receive unit (RU) is start once,
  *	and the controller happily fills RFDs as frames arrive.  If
  *	replacement RFDs cannot be allocated, or the RU goes non-active,
@@ -281,14 +288,15 @@ struct csr {
 };
 
 enum scb_status {
+	rus_no_res       = 0x08,
 	rus_ready        = 0x10,
 	rus_mask         = 0x3C,
 };
 
 enum ru_state  {
-	RU_SUSPENDED = 0,
-	RU_RUNNING	 = 1,
-	RU_UNINITIALIZED = -1,
+	ru_stopped = 0,
+	ru_running	 = 1,
+	ru_uninitialized = -1,
 };
 
 enum scb_stat_ack {
@@ -952,7 +960,7 @@ static void e100_get_defaults(struct nic
 		((nic->mac >= mac_82558_D101_A4) ? cb_cid : cb_i));
 
 	/* Template for a freshly allocated RFD */
-	nic->blank_rfd.command = cpu_to_le16(cb_el);
+	nic->blank_rfd.command = 0;
 	nic->blank_rfd.rbd = 0xFFFFFFFF;
 	nic->blank_rfd.size = cpu_to_le16(VLAN_ETH_FRAME_LEN);
 
@@ -1759,7 +1767,7 @@ static int e100_alloc_cbs(struct nic *ni
 static inline void e100_start_receiver(struct nic *nic, struct rx *rx)
 {
 	if(!nic->rxs) return;
-	if(RU_SUSPENDED != nic->ru_running) return;
+	if (ru_stopped != nic->ru_running) return;
 
 	/* handle init time starts */
 	if(!rx) rx = nic->rxs;
@@ -1767,7 +1775,7 @@ static inline void e100_start_receiver(s
 	/* (Re)start RU if suspended or idle and RFA is non-NULL */
 	if(rx->skb) {
 		e100_exec_cmd(nic, ruc_start, rx->dma_addr);
-		nic->ru_running = RU_RUNNING;
+		nic->ru_running = ru_running;
 	}
 }
 
@@ -1791,15 +1799,12 @@ static int e100_rx_alloc_skb(struct nic 
 	}
 
 	/* Link the RFD to end of RFA by linking previous RFD to
-	 * this one, and clearing EL bit of previous.  */
+	 * this one.  We are safe to touch the previous RFD because
+	 * it is protected by the before last buffer's el bit being set */
 	if(rx->prev->skb) {
 		struct rfd *prev_rfd = (struct rfd *)rx->prev->skb->data;
 		put_unaligned(cpu_to_le32(rx->dma_addr),
 			(u32 *)&prev_rfd->link);
-		wmb();
-		prev_rfd->command &= ~cpu_to_le16(cb_el);
-		pci_dma_sync_single_for_device(nic->pdev, rx->prev->dma_addr,
-			sizeof(struct rfd), PCI_DMA_TODEVICE);
 	}
 
 	return 0;
@@ -1824,8 +1829,20 @@ static int e100_rx_indicate(struct nic *
 	DPRINTK(RX_STATUS, DEBUG, "status=0x%04X\n", rfd_status);
 
 	/* If data isn't ready, nothing to indicate */
-	if(unlikely(!(rfd_status & cb_complete)))
+	if (unlikely(!(rfd_status & cb_complete))) {
+		/* If the next buffer has the el bit, but we think the receiver
+		 * is still running, check to see if it really stopped while
+		 * we had interrupts off.
+		 * This allows for a fast restart without re-enabling
+		 * interrupts */
+		if ((le16_to_cpu(rfd->command) & cb_el) &&
+		    (ru_running == nic->ru_running)) {
+
+		    if (readb(&nic->csr->scb.status) & rus_no_res)
+			nic->ru_running = ru_stopped;
+		}
 		return -ENODATA;
+	}
 
 	/* Get actual data size */
 	actual_size = le16_to_cpu(rfd->actual_size) & 0x3FFF;
@@ -1836,9 +1853,18 @@ static int e100_rx_indicate(struct nic *
 	pci_unmap_single(nic->pdev, rx->dma_addr,
 		RFD_BUF_LEN, PCI_DMA_FROMDEVICE);
 
-	/* this allows for a fast restart without re-enabling interrupts */
-	if(le16_to_cpu(rfd->command) & cb_el)
-		nic->ru_running = RU_SUSPENDED;
+	/* If this buffer has the el bit, but we think the receiver
+	 * is still running, check to see if it really stopped while
+	 * we had interrupts off.
+	 * This allows for a fast restart without re-enabling interrupts.
+	 * This can happen when the RU sees the size change but also sees
+	 * the el bit set. */
+	if ((le16_to_cpu(rfd->command) & cb_el) &&
+	    (ru_running == nic->ru_running)) {
+
+	    if (readb(&nic->csr->scb.status) & rus_no_res)
+		nic->ru_running = ru_stopped;
+	}
 
 	/* Pull off the RFD and put the actual data (minus eth hdr) */
 	skb_reserve(skb, sizeof(struct rfd));
@@ -1870,31 +1896,30 @@ static void e100_rx_clean(struct nic *ni
 	unsigned int work_to_do)
 {
 	struct rx *rx;
-	int restart_required = 0;
-	struct rx *rx_to_start = NULL;
-
-	/* are we already rnr? then pay attention!!! this ensures that
-	 * the state machine progression never allows a start with a
-	 * partially cleaned list, avoiding a race between hardware
-	 * and rx_to_clean when in NAPI mode */
-	if(RU_SUSPENDED == nic->ru_running)
-		restart_required = 1;
+	int restart_required = 0, err = 0;
+	struct rx *old_before_last_rx, *new_before_last_rx;
+	struct rfd *old_before_last_rfd, *new_before_last_rfd;
 
 	/* Indicate newly arrived packets */
 	for(rx = nic->rx_to_clean; rx->skb; rx = nic->rx_to_clean = rx->next) {
-		int err = e100_rx_indicate(nic, rx, work_done, work_to_do);
-		if(-EAGAIN == err) {
-			/* hit quota so have more work to do, restart once
-			 * cleanup is complete */
-			restart_required = 0;
+		err = e100_rx_indicate(nic, rx, work_done, work_to_do);
+		/* Hit quota or no more to clean */
+		if (-EAGAIN == err || -ENODATA == err)
 			break;
-		} else if(-ENODATA == err)
-			break; /* No more to clean */
 	}
 
-	/* save our starting point as the place we'll restart the receiver */
-	if(restart_required)
-		rx_to_start = nic->rx_to_clean;
+
+	/* On EAGAIN, hit quota so have more work to do, restart once
+	 * cleanup is complete.
+	 * Else, are we already rnr? then pay attention!!! this ensures that
+	 * the state machine progression never allows a start with a
+	 * partially cleaned list, avoiding a race between hardware
+	 * and rx_to_clean when in NAPI mode */
+	if (-EAGAIN != err && ru_stopped == nic->ru_running)
+		restart_required = 1;
+
+	old_before_last_rx = nic->rx_to_use->prev->prev;
+	old_before_last_rfd = (struct rfd *)old_before_last_rx->skb->data;
 
 	/* Alloc new skbs to refill list */
 	for(rx = nic->rx_to_use; !rx->skb; rx = nic->rx_to_use = rx->next) {
@@ -1902,10 +1927,42 @@ static void e100_rx_clean(struct nic *ni
 			break; /* Better luck next time (see watchdog) */
 	}
 
+	new_before_last_rx = nic->rx_to_use->prev->prev;
+	if (new_before_last_rx != old_before_last_rx) {
+		/* Set the el-bit on the buffer that is before the last buffer.
+		 * This lets us update the next pointer on the last buffer
+		 * without worrying about hardware touching it.
+		 * We set the size to 0 to prevent hardware from touching this
+		 * buffer.
+		 * When the hardware hits the before last buffer with el-bit
+		 * and size of 0, it will RNR interrupt, the RUS will go into
+		 * the No Resources state.  It will not complete nor write to
+		 * this buffer. */
+		new_before_last_rfd =
+			(struct rfd *)new_before_last_rx->skb->data;
+		new_before_last_rfd->size = 0;
+		new_before_last_rfd->command |= cpu_to_le16(cb_el);
+		pci_dma_sync_single_for_device(nic->pdev,
+			new_before_last_rx->dma_addr, sizeof(struct rfd),
+			PCI_DMA_TODEVICE);
+
+		/* Now that we have a new stopping point, we can clear the old
+		 * stopping point.  We must sync twice to get the proper
+		 * ordering on the hardware side of things. */
+		old_before_last_rfd->command &= ~cpu_to_le16(cb_el);
+		pci_dma_sync_single_for_device(nic->pdev,
+			old_before_last_rx->dma_addr, sizeof(struct rfd),
+			PCI_DMA_TODEVICE);
+		old_before_last_rfd->size = cpu_to_le16(VLAN_ETH_FRAME_LEN);
+		pci_dma_sync_single_for_device(nic->pdev,
+			old_before_last_rx->dma_addr, sizeof(struct rfd),
+			PCI_DMA_TODEVICE);
+	}
+
 	if(restart_required) {
 		// ack the rnr?
 		writeb(stat_ack_rnr, &nic->csr->scb.stat_ack);
-		e100_start_receiver(nic, rx_to_start);
+		e100_start_receiver(nic, nic->rx_to_clean);
 		if(work_done)
 			(*work_done)++;
 	}
@@ -1916,7 +1973,7 @@ static void e100_rx_clean_list(struct ni
 	struct rx *rx;
 	unsigned int i, count = nic->params.rfds.count;
 
-	nic->ru_running = RU_UNINITIALIZED;
+	nic->ru_running = ru_uninitialized;
 
 	if(nic->rxs) {
 		for(rx = nic->rxs, i = 0; i < count; rx++, i++) {
@@ -1937,9 +1994,10 @@ static int e100_rx_alloc_list(struct nic
 {
 	struct rx *rx;
 	unsigned int i, count = nic->params.rfds.count;
+	struct rfd *before_last;
 
 	nic->rx_to_use = nic->rx_to_clean = NULL;
-	nic->ru_running = RU_UNINITIALIZED;
+	nic->ru_running = ru_uninitialized;
 
 	if(!(nic->rxs = kcalloc(count, sizeof(struct rx), GFP_ATOMIC)))
 		return -ENOMEM;
@@ -1952,9 +2010,22 @@ static int e100_rx_alloc_list(struct nic
 			return -ENOMEM;
 		}
 	}
+	/* Set the el-bit on the buffer that is before the last buffer.
+	 * This lets us update the next pointer on the last buffer without
+	 * worrying about hardware touching it.
+	 * We set the size to 0 to prevent hardware from touching this buffer.
+	 * When the hardware hits the before last buffer with el-bit and size
+	 * of 0, it will RNR interrupt, the RU will go into the No Resources
+	 * state.  It will not complete nor write to this buffer. */
+	rx = nic->rxs->prev->prev;
+	before_last = (struct rfd *)rx->skb->data;
+	before_last->command |= cpu_to_le16(cb_el);
+	before_last->size = 0;
+	pci_dma_sync_single_for_device(nic->pdev, rx->dma_addr,
+		sizeof(struct rfd), PCI_DMA_TODEVICE);
 
 	nic->rx_to_use = nic->rx_to_clean = nic->rxs;
-	nic->ru_running = RU_SUSPENDED;
+	nic->ru_running = ru_stopped;
 
 	return 0;
 }
@@ -1976,7 +2047,7 @@ static irqreturn_t e100_intr(int irq, vo
 
 	/* We hit Receive No Resource (RNR); restart RU after cleaning */
 	if(stat_ack & stat_ack_rnr)
-		nic->ru_running = RU_SUSPENDED;
+		nic->ru_running = ru_stopped;
 
 	if(likely(netif_rx_schedule_prep(netdev, &nic->napi))) {
 		e100_disable_irq(nic);

* [PATCH] Fix e100 on systems that have cache incoherent DMA
@ 2007-08-31 20:54 David Acker
  2007-09-04 17:02 ` Kok, Auke
  2007-09-07 16:31 ` Kok, Auke
  0 siblings, 2 replies; 28+ messages in thread
From: David Acker @ 2007-08-31 20:54 UTC (permalink / raw)
  To: Auke Kok
  Cc: John Ronciak, Jesse Brandeburg, Jeff Kirsher, Milton Miller,
	Jeff Garzik, netdev, e1000-devel, Scott Feldman

On systems with cache-incoherent DMA, including ARM, there is a race condition
between software allocating a new receive buffer and hardware writing into a
buffer.  The two race on touching the last Receive Frame Descriptor (RFD),
which has its el-bit set and its next link equal to 0.  When hardware
encounters this buffer it attempts to write data to it and then update the
Status Word bits and Actual Count in the RFD.  At the same time software may
try to clear the el-bit and set the link address to a new buffer.

Since the entire RFD fits in one cache line, the two write operations can
collide.  This can lead to the receive unit stalling or interpreting random
memory as its receive area.

The fix is to set the el-bit and a size of 0 on the next-to-last buffer in
the chain.  When the hardware encounters this buffer it stops and does not
write to it at all.  The hardware issues an RNR interrupt with the receive
unit in the No Resources state.  Software can write to the tail of the list
because it knows hardware will stop on the previous descriptor that was
marked as the end of list.

Once software has a new next-to-last buffer prepared, it can clear the el-bit
and set the size on the previous one.  The race on this buffer is safe since
the link already points to a valid next buffer and software can handle the
race when setting the size (assuming aligned 16-bit writes are atomic with
respect to the DMA read).  If the hardware sees the el-bit cleared without
the size set, it will skip this buffer and move on to the next one.  If it
sees the size set but the el-bit still set, it will complete that buffer and
then issue an RNR interrupt and wait.

Flags are kept in the software descriptor to note whether the el bit is set
and whether the size was 0.  When software clears the RFD's el bit and sets
its size, it also clears the el flag but leaves the size-was-0 flag set.
This way software can identify descriptors on which the race may have
occurred when cleaning the ring.  On these descriptors, it looks ahead: if
the next one is complete, then hardware must have skipped the current one.
Logic is added to prevent two packets in a row from being marked while the
receiver is running, to avoid running in lockstep with the hardware and
thereby limit the required lookahead.
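
Condensed, the lookahead added to e100_rx_indicate() for not-yet-complete
descriptors looks like this (mirroring the hunk in the patch below;
next_rfd_complete() is a stand-in for the inline dma_sync and cb_complete
check of rx->next done in the actual patch):

	if (unlikely(!(rfd_status & cb_complete))) {
		u8 masked = rx->flags & (rx_el | rx_s0);

		if (masked == (rx_el | rx_s0)) {
			/* The marked (EL, size 0) buffer: check whether the
			 * receive unit really stopped while interrupts were off. */
			if (readb(&nic->csr->scb.status) & rus_no_res)
				nic->ru_running = ru_stopped;
		} else if (masked == rx_s0) {
			/* EL was cleared and the size restored; if the next
			 * RFD is already complete, hardware skipped this one. */
			if (next_rfd_complete(rx)) {	/* stand-in helper */
				/* drop this skb and let the caller move on */
				return 0;
			}
			if (readb(&nic->csr->scb.status) & rus_no_res)
				nic->ru_running = ru_stopped;
		}
		return -ENODATA;
	}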

This is a patch for 2.6.23-rc4.

Signed-off-by: David Acker <dacker@roinet.com>

---

--- linux-2.6.23-rc4/drivers/net/e100.c.orig	2007-08-30 13:32:10.000000000 -0400
+++ linux-2.6.23-rc4/drivers/net/e100.c	2007-08-30 15:42:07.000000000 -0400
@@ -106,6 +106,13 @@
  *	the RFD, the RFD must be dma_sync'ed to maintain a consistent
  *	view from software and hardware.
  *
+ *	In order to keep updates to the RFD link field from colliding with
+ *	hardware writes to mark packets complete, we use the feature that
+ *	hardware will not write to a size 0 descriptor and mark the previous
+ *	packet as end-of-list (EL).   After updating the link, we remove EL
+ *	and only then restore the size such that hardware may use the
+ *	previous-to-end RFD. 
+ *
  *	Under typical operation, the  receive unit (RU) is start once,
  *	and the controller happily fills RFDs as frames arrive.  If
  *	replacement RFDs cannot be allocated, or the RU goes non-active,
@@ -281,14 +288,14 @@ struct csr {
 };
 
 enum scb_status {
+	rus_no_res       = 0x08,
 	rus_ready        = 0x10,
 	rus_mask         = 0x3C,
 };
 
 enum ru_state  {
-	RU_SUSPENDED = 0,
-	RU_RUNNING	 = 1,
-	RU_UNINITIALIZED = -1,
+	ru_stopped = 0,
+	ru_running = 1,
 };
 
 enum scb_stat_ack {
@@ -401,10 +408,16 @@ struct rfd {
 	u16 size;
 };
 
+enum rx_flags {
+	rx_el = 0x01,
+	rx_s0 = 0x02,
+};
+
 struct rx {
 	struct rx *next, *prev;
 	struct sk_buff *skb;
 	dma_addr_t dma_addr;
+	u8 flags;
 };
 
 #if defined(__BIG_ENDIAN_BITFIELD)
@@ -952,7 +965,7 @@ static void e100_get_defaults(struct nic
 		((nic->mac >= mac_82558_D101_A4) ? cb_cid : cb_i));
 
 	/* Template for a freshly allocated RFD */
-	nic->blank_rfd.command = cpu_to_le16(cb_el);
+	nic->blank_rfd.command = 0;
 	nic->blank_rfd.rbd = 0xFFFFFFFF;
 	nic->blank_rfd.size = cpu_to_le16(VLAN_ETH_FRAME_LEN);
 
@@ -1753,18 +1766,48 @@ static int e100_alloc_cbs(struct nic *ni
 	return 0;
 }
 
-static inline void e100_start_receiver(struct nic *nic, struct rx *rx)
+static void e100_find_mark_el(struct nic *nic, struct rx *marked_rx, int is_running)
 {
-	if(!nic->rxs) return;
-	if(RU_SUSPENDED != nic->ru_running) return;
+	struct rx *rx = nic->rx_to_use->prev->prev;
+	struct rfd *rfd;
+
+	if (marked_rx == rx)
+		return;
+
+	rfd = (struct rfd *) rx->skb->data;
+	rfd->command |= cpu_to_le16(cb_el);
+	rfd->size = 0;
+	pci_dma_sync_single_for_device(nic->pdev, rx->dma_addr,
+		sizeof(struct rfd), PCI_DMA_BIDIRECTIONAL);
+	rx->flags |= (rx_el | rx_s0);
+
+	if (!marked_rx)
+		return;
+
+	rfd = (struct rfd *) marked_rx->skb->data;
+	rfd->command &= ~cpu_to_le16(cb_el);
+	pci_dma_sync_single_for_device(nic->pdev, marked_rx->dma_addr,
+		sizeof(struct rfd), PCI_DMA_BIDIRECTIONAL);
+
+	rfd->size = cpu_to_le16(VLAN_ETH_FRAME_LEN);
+	pci_dma_sync_single_for_device(nic->pdev, marked_rx->dma_addr,
+		sizeof(struct rfd), PCI_DMA_BIDIRECTIONAL);
 
-	/* handle init time starts */
-	if(!rx) rx = nic->rxs;
+	if (is_running)
+		marked_rx->flags &= ~rx_el;
+	else
+		marked_rx->flags &= ~(rx_el | rx_s0);
+}
+
+static inline void e100_start_receiver(struct nic *nic)
+{
+	if(!nic->rxs) return;
+	if (ru_stopped != nic->ru_running) return;
 
 	/* (Re)start RU if suspended or idle and RFA is non-NULL */
-	if(rx->skb) {
-		e100_exec_cmd(nic, ruc_start, rx->dma_addr);
-		nic->ru_running = RU_RUNNING;
+	if (nic->rx_to_clean->skb) {
+		e100_exec_cmd(nic, ruc_start, nic->rx_to_clean->dma_addr);
+		nic->ru_running = ru_running;
 	}
 }
 
@@ -1793,8 +1836,6 @@ static int e100_rx_alloc_skb(struct nic 
 		struct rfd *prev_rfd = (struct rfd *)rx->prev->skb->data;
 		put_unaligned(cpu_to_le32(rx->dma_addr),
 			(u32 *)&prev_rfd->link);
-		wmb();
-		prev_rfd->command &= ~cpu_to_le16(cb_el);
 		pci_dma_sync_single_for_device(nic->pdev, rx->prev->dma_addr,
 			sizeof(struct rfd), PCI_DMA_TODEVICE);
 	}
@@ -1808,6 +1849,7 @@ static int e100_rx_indicate(struct nic *
 	struct sk_buff *skb = rx->skb;
 	struct rfd *rfd = (struct rfd *)skb->data;
 	u16 rfd_status, actual_size;
+	u8 status;
 
 	if(unlikely(work_done && *work_done >= work_to_do))
 		return -EAGAIN;
@@ -1819,9 +1861,47 @@ static int e100_rx_indicate(struct nic *
 
 	DPRINTK(RX_STATUS, DEBUG, "status=0x%04X\n", rfd_status);
 
-	/* If data isn't ready, nothing to indicate */
-	if(unlikely(!(rfd_status & cb_complete)))
+	/* 
+	 * If data isn't ready, nothing to indicate
+	 * If both the el and s0 rx flags are set, we have hit the marked
+	 * buffer but we don't know if hardware has seen it so we check
+	 * the status.
+	 * If only the s0 flag is set, we check the next buffer.
+	 * If it is complete, we know that hardware saw the rfd el bit
+	 * get cleared but did not see the rfd size get set so it
+	 * skipped this buffer.  We just return 0 and look at the
+	 * next buffer.
+	 * If only the s0 flag is set but the next buffer is
+	 * not complete, we cleared the el flag as hardware
+	 * hit this buffer.
+	 */
+	if (unlikely(!(rfd_status & cb_complete))) {
+		u8 maskedFlags = rx->flags & (rx_el | rx_s0);
+		if (maskedFlags == (rx_el | rx_s0)) {
+			status = readb(&nic->csr->scb.status);
+			if (status & rus_no_res)
+				nic->ru_running = ru_stopped;
+		} else if (maskedFlags == rx_s0) {
+			struct rx *next_rx = rx->next;
+			struct rfd *next_rfd = (struct rfd *)next_rx->skb->data;
+			pci_dma_sync_single_for_cpu(nic->pdev,
+				next_rx->dma_addr, sizeof(struct rfd),
+				PCI_DMA_FROMDEVICE);
+			if (next_rfd->status & cpu_to_le16(cb_complete)) {
+				pci_unmap_single(nic->pdev, rx->dma_addr,
+					RFD_BUF_LEN, PCI_DMA_FROMDEVICE);
+				dev_kfree_skb_any(skb);
+				rx->skb = NULL;
+				rx->flags &= ~rx_s0;
+				return 0;
+			} else {
+				status = readb(&nic->csr->scb.status);
+				if (status & rus_no_res)
+					nic->ru_running = ru_stopped;
+			}
+		}
 		return -ENODATA;
+	}
 
 	/* Get actual data size */
 	actual_size = le16_to_cpu(rfd->actual_size) & 0x3FFF;
@@ -1832,9 +1912,15 @@ static int e100_rx_indicate(struct nic *
 	pci_unmap_single(nic->pdev, rx->dma_addr,
 		RFD_BUF_LEN, PCI_DMA_FROMDEVICE);
 
-	/* this allows for a fast restart without re-enabling interrupts */
-	if(le16_to_cpu(rfd->command) & cb_el)
-		nic->ru_running = RU_SUSPENDED;
+	/*
+	 * This happens when hardware sees the rfd el flag set
+	 * but then sees the rfd size set as well
+	 */
+	if (le16_to_cpu(rfd->command) & cb_el) {
+		status = readb(&nic->csr->scb.status);
+		if (status & rus_no_res)
+			nic->ru_running = ru_stopped;
+	}
 
 	/* Pull off the RFD and put the actual data (minus eth hdr) */
 	skb_reserve(skb, sizeof(struct rfd));
@@ -1865,32 +1951,34 @@ static int e100_rx_indicate(struct nic *
 static void e100_rx_clean(struct nic *nic, unsigned int *work_done,
 	unsigned int work_to_do)
 {
-	struct rx *rx;
+	struct rx *rx, *marked_rx;
 	int restart_required = 0;
-	struct rx *rx_to_start = NULL;
-
-	/* are we already rnr? then pay attention!!! this ensures that
-	 * the state machine progression never allows a start with a
-	 * partially cleaned list, avoiding a race between hardware
-	 * and rx_to_clean when in NAPI mode */
-	if(RU_SUSPENDED == nic->ru_running)
-		restart_required = 1;
+	int err = 0;
 
 	/* Indicate newly arrived packets */
 	for(rx = nic->rx_to_clean; rx->skb; rx = nic->rx_to_clean = rx->next) {
-		int err = e100_rx_indicate(nic, rx, work_done, work_to_do);
-		if(-EAGAIN == err) {
-			/* hit quota so have more work to do, restart once
-			 * cleanup is complete */
-			restart_required = 0;
+		err = e100_rx_indicate(nic, rx, work_done, work_to_do);
+		/* Hit quota or no more to clean */
+		if(-EAGAIN == err || -ENODATA == err)
 			break;
-		} else if(-ENODATA == err)
-			break; /* No more to clean */
 	}
 
-	/* save our starting point as the place we'll restart the receiver */
-	if(restart_required)
-		rx_to_start = nic->rx_to_clean;
+	/*
+	 * On EAGAIN, hit quota so have more work to do, restart once
+	 * cleanup is complete.
+	 * Else, are we already rnr? then pay attention!!! this ensures that
+	 * the state machine progression never allows a start with a
+	 * partially cleaned list, avoiding a race between hardware
+	 * and rx_to_clean when in NAPI mode
+	 */
+	if(-EAGAIN != err && ru_stopped == nic->ru_running)
+		restart_required = 1;
+
+	marked_rx = nic->rx_to_use->prev->prev;
+	if (!(marked_rx->flags & rx_el)) {
+		marked_rx = marked_rx->prev;
+		BUG_ON(!(marked_rx->flags & rx_el));
+	}
 
 	/* Alloc new skbs to refill list */
 	for(rx = nic->rx_to_use; !rx->skb; rx = nic->rx_to_use = rx->next) {
@@ -1898,10 +1986,12 @@ static void e100_rx_clean(struct nic *ni
 			break; /* Better luck next time (see watchdog) */
 	}
 
+	e100_find_mark_el(nic, marked_rx, !restart_required);
+
 	if(restart_required) {
 		// ack the rnr?
 		writeb(stat_ack_rnr, &nic->csr->scb.stat_ack);
-		e100_start_receiver(nic, rx_to_start);
+		e100_start_receiver(nic);
 		if(work_done)
 			(*work_done)++;
 	}
@@ -1912,8 +2002,6 @@ static void e100_rx_clean_list(struct ni
 	struct rx *rx;
 	unsigned int i, count = nic->params.rfds.count;
 
-	nic->ru_running = RU_UNINITIALIZED;
-
 	if(nic->rxs) {
 		for(rx = nic->rxs, i = 0; i < count; rx++, i++) {
 			if(rx->skb) {
@@ -1935,7 +2023,6 @@ static int e100_rx_alloc_list(struct nic
 	unsigned int i, count = nic->params.rfds.count;
 
 	nic->rx_to_use = nic->rx_to_clean = NULL;
-	nic->ru_running = RU_UNINITIALIZED;
 
 	if(!(nic->rxs = kcalloc(count, sizeof(struct rx), GFP_ATOMIC)))
 		return -ENOMEM;
@@ -1950,7 +2037,9 @@ static int e100_rx_alloc_list(struct nic
 	}
 
 	nic->rx_to_use = nic->rx_to_clean = nic->rxs;
-	nic->ru_running = RU_SUSPENDED;
+	nic->ru_running = ru_stopped;
+
+	e100_find_mark_el(nic, NULL, 0);
 
 	return 0;
 }
@@ -1971,8 +2060,8 @@ static irqreturn_t e100_intr(int irq, vo
 	iowrite8(stat_ack, &nic->csr->scb.stat_ack);
 
 	/* We hit Receive No Resource (RNR); restart RU after cleaning */
-	if(stat_ack & stat_ack_rnr)
-		nic->ru_running = RU_SUSPENDED;
+	if (stat_ack & stat_ack_rnr)
+		nic->ru_running = ru_stopped;
 
 	if(likely(netif_rx_schedule_prep(netdev))) {
 		e100_disable_irq(nic);
@@ -2065,7 +2154,7 @@ static int e100_up(struct nic *nic)
 	if((err = e100_hw_init(nic)))
 		goto err_clean_cbs;
 	e100_set_multicast_list(nic->netdev);
-	e100_start_receiver(nic, NULL);
+	e100_start_receiver(nic);
 	mod_timer(&nic->watchdog, jiffies);
 	if((err = request_irq(nic->pdev->irq, e100_intr, IRQF_SHARED,
 		nic->netdev->name, nic->netdev)))
@@ -2146,7 +2235,7 @@ static int e100_loopback_test(struct nic
 		mdio_write(nic->netdev, nic->mii.phy_id, MII_BMCR,
 			BMCR_LOOPBACK);
 
-	e100_start_receiver(nic, NULL);
+	e100_start_receiver(nic);
 
 	if(!(skb = netdev_alloc_skb(nic->netdev, ETH_DATA_LEN))) {
 		err = -ENOMEM;

Thread overview: 28+ messages
2007-11-08 18:17 [PATCH] Fix e100 on systems that have cache incoherent DMA Auke Kok
2007-11-28 19:12 ` David Acker
2007-11-28 19:21   ` Kok, Auke
2007-11-28 19:26     ` Jeff Garzik
2007-11-28 19:50       ` David Acker
2008-06-18 18:54         ` Anders Grafström
2008-06-18 19:16           ` David Acker
2008-06-19 12:38             ` Anders Grafström
2008-07-01  8:26               ` Andrew Morton
2008-07-01  9:49                 ` Jeff Garzik
2008-07-01 18:07                   ` Andrew Morton
2008-07-02 17:36                     ` Anders Grafström
2008-07-02 17:45                       ` Andrew Morton
2008-07-01 21:35           ` David Acker
  -- strict thread matches above, loose matches on Subject: below --
2007-11-02 13:27 David Acker
2007-11-02 16:05 ` Kok, Auke
2007-11-02 16:11   ` Jeff Garzik
2007-11-06 17:01 ` Kok, Auke
2007-08-31 20:54 David Acker
2007-09-04 17:02 ` Kok, Auke
2007-09-07 16:31 ` Kok, Auke
2007-09-07 20:41   ` David Acker
2007-09-07 21:03     ` Kok, Auke
2007-09-07 21:18       ` Kok, Auke
2007-09-07 23:24     ` Jeff Garzik
2007-09-11 20:54       ` David Acker
2007-09-12 11:30         ` James Chapman
2007-09-12 20:11           ` David Acker
