Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 10/46] net: jme: convert to generic DMA API
From: Michał Mirosław @ 2011-07-11  0:52 UTC (permalink / raw)
  To: netdev; +Cc: Guo-Fu Tseng
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

This also fixes bad pci_map_page() usage and missing RX unmaps.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/jme.c |   37 ++++++++++++-------------------------
 1 files changed, 12 insertions(+), 25 deletions(-)

diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index 6b2a5e7..ad69dae 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -759,11 +759,8 @@ jme_make_new_rx_buf(struct jme_adapter *jme, int i)
 
 	rxbi->skb = skb;
 	rxbi->len = skb_tailroom(skb);
-	rxbi->mapping = pci_map_page(jme->pdev,
-					virt_to_page(skb->data),
-					offset_in_page(skb->data),
-					rxbi->len,
-					PCI_DMA_FROMDEVICE);
+	rxbi->mapping = dma_map_single(&jme->pdev->dev, skb->data,
+					rxbi->len, DMA_FROM_DEVICE);
 
 	return 0;
 }
@@ -776,10 +773,10 @@ jme_free_rx_buf(struct jme_adapter *jme, int i)
 	rxbi += i;
 
 	if (rxbi->skb) {
-		pci_unmap_page(jme->pdev,
+		dma_unmap_single(&jme->pdev->dev,
 				 rxbi->mapping,
 				 rxbi->len,
-				 PCI_DMA_FROMDEVICE);
+				 DMA_FROM_DEVICE);
 		dev_kfree_skb(rxbi->skb);
 		rxbi->skb = NULL;
 		rxbi->mapping = 0;
@@ -1022,17 +1019,12 @@ jme_alloc_and_feed_skb(struct jme_adapter *jme, int idx)
 	rxbi += idx;
 
 	skb = rxbi->skb;
-	pci_dma_sync_single_for_cpu(jme->pdev,
-					rxbi->mapping,
-					rxbi->len,
-					PCI_DMA_FROMDEVICE);
+	dma_unmap_single(&jme->pdev->dev, rxbi->mapping, rxbi->len,
+			 DMA_FROM_DEVICE);
 
 	if (unlikely(jme_make_new_rx_buf(jme, idx))) {
-		pci_dma_sync_single_for_device(jme->pdev,
-						rxbi->mapping,
-						rxbi->len,
-						PCI_DMA_FROMDEVICE);
-
+		rxbi->mapping = dma_map_single(&jme->pdev->dev, skb->data,
+						rxbi->len, DMA_FROM_DEVICE);
 		++(NET_STAT(jme).rx_dropped);
 	} else {
 		framesize = le16_to_cpu(rxdesc->descwb.framesize)
@@ -1476,10 +1468,10 @@ jme_tx_clean_tasklet(unsigned long arg)
 				ttxbi = txbi + ((i + j) & (mask));
 				txdesc[(i + j) & (mask)].dw[0] = 0;
 
-				pci_unmap_page(jme->pdev,
+				dma_unmap_page(&jme->pdev->dev,
 						 ttxbi->mapping,
 						 ttxbi->len,
-						 PCI_DMA_TODEVICE);
+						 DMA_TO_DEVICE);
 
 				ttxbi->mapping = 0;
 				ttxbi->len = 0;
@@ -1883,16 +1875,11 @@ jme_fill_tx_map(struct pci_dev *pdev,
 {
 	dma_addr_t dmaaddr;
 
-	dmaaddr = pci_map_page(pdev,
+	dmaaddr = dma_map_page(&pdev->dev,
 				page,
 				page_offset,
 				len,
-				PCI_DMA_TODEVICE);
-
-	pci_dma_sync_single_for_device(pdev,
-				       dmaaddr,
-				       len,
-				       PCI_DMA_TODEVICE);
+				DMA_TO_DEVICE);
 
 	txdesc->dw[0] = 0;
 	txdesc->dw[1] = 0;
-- 
1.7.5.4


^ permalink raw reply related

* [PATCH v2 09/46] net: octeon_mgmt: fix DMA unmap size
From: Michał Mirosław @ 2011-07-11  0:52 UTC (permalink / raw)
  To: netdev
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

Also: use netdev_alloc_skb_ip_align() for readability.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/octeon/octeon_mgmt.c |    9 ++++-----
 1 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/net/octeon/octeon_mgmt.c b/drivers/net/octeon/octeon_mgmt.c
index 429e08c..dd4a57a 100644
--- a/drivers/net/octeon/octeon_mgmt.c
+++ b/drivers/net/octeon/octeon_mgmt.c
@@ -152,16 +152,15 @@ static void octeon_mgmt_rx_fill_ring(struct net_device *netdev)
 		struct sk_buff *skb;
 
 		/* CN56XX pass 1 needs 8 bytes of padding.  */
-		size = netdev->mtu + OCTEON_MGMT_RX_HEADROOM + 8 + NET_IP_ALIGN;
+		size = netdev->mtu + OCTEON_MGMT_RX_HEADROOM + 8;
 
-		skb = netdev_alloc_skb(netdev, size);
+		skb = netdev_alloc_skb_ip_align(netdev, size);
 		if (!skb)
 			break;
-		skb_reserve(skb, NET_IP_ALIGN);
 		__skb_queue_tail(&p->rx_list, skb);
 
 		re.d64 = 0;
-		re.s.len = size;
+		re.s.len = size = skb_tailroom(skb);
 		re.s.addr = dma_map_single(p->dev, skb->data,
 					   size,
 					   DMA_FROM_DEVICE);
@@ -297,7 +296,7 @@ static u64 octeon_mgmt_dequeue_rx_buffer(struct octeon_mgmt *p,
 	*pskb = __skb_dequeue(&p->rx_list);
 
 	dma_unmap_single(p->dev, re.s.addr,
-			 ETH_FRAME_LEN + OCTEON_MGMT_RX_HEADROOM,
+			 skb_tailroom(*pskb),
 			 DMA_FROM_DEVICE);
 
 	return re.d64;
-- 
1.7.5.4


^ permalink raw reply related

* [PATCH v2 16/46] net: cxgb3: don't drop packets on memory pressure in driver
From: Michał Mirosław @ 2011-07-11  0:52 UTC (permalink / raw)
  To: netdev; +Cc: Divy Le Ray
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

Dropping received packets should be left to upper layers.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/cxgb3/sge.c |    8 ++------
 1 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 3196fdd..d322d3e 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -66,7 +66,6 @@
 #define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1)
 #define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER)
 
-#define SGE_RX_DROP_THRES 16
 #define RX_RECLAIM_PERIOD (HZ/4)
 
 /*
@@ -2338,13 +2337,10 @@ no_mem:
 				}
 
 				skb = get_packet_pg(adap, fl, q,
-						    G_RSPD_LEN(len),
-						    eth ?
-						    SGE_RX_DROP_THRES : 0);
+						    G_RSPD_LEN(len), 0);
 				q->pg_skb = skb;
 			} else
-				skb = get_packet(adap, fl, G_RSPD_LEN(len),
-						 eth ? SGE_RX_DROP_THRES : 0);
+				skb = get_packet(adap, fl, G_RSPD_LEN(len), 0);
 			if (unlikely(!skb)) {
 				if (!eth)
 					goto no_mem;
-- 
1.7.5.4


^ permalink raw reply related

* [PATCH v2 13/46] net: sunbmac: cleanup RX skb allocation
From: Michał Mirosław @ 2011-07-11  0:52 UTC (permalink / raw)
  To: netdev
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/sunbmac.c |   22 +++++++++++-----------
 drivers/net/sunbmac.h |   18 +-----------------
 2 files changed, 12 insertions(+), 28 deletions(-)

diff --git a/drivers/net/sunbmac.c b/drivers/net/sunbmac.c
index 297a424..e28d3ea 100644
--- a/drivers/net/sunbmac.c
+++ b/drivers/net/sunbmac.c
@@ -228,15 +228,15 @@ static void bigmac_init_rings(struct bigmac *bp, int from_irq)
 	for (i = 0; i < RX_RING_SIZE; i++) {
 		struct sk_buff *skb;
 
-		skb = big_mac_alloc_skb(RX_BUF_ALLOC_SIZE, gfp_flags);
+		skb = __netdev_alloc_skb_aligned(dev,
+						 RX_BUF_ALLOC_SIZE,
+						 SUNBMAC_RX_ALIGNMENT,
+						 gfp_flags);
 		if (!skb)
 			continue;
 
 		bp->rx_skbs[i] = skb;
-		skb->dev = dev;
 
-		/* Because we reserve afterwards. */
-		skb_put(skb, ETH_FRAME_LEN);
 		skb_reserve(skb, 34);
 
 		bb->be_rxd[i].rx_addr =
@@ -828,7 +828,10 @@ static void bigmac_rx(struct bigmac *bp)
 			struct sk_buff *new_skb;
 
 			/* Now refill the entry, if we can. */
-			new_skb = big_mac_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
+			new_skb = __netdev_alloc_skb_aligned(dev,
+							     RX_BUF_ALLOC_SIZE,
+							     SUNBMAC_RX_ALIGNMENT,
+							     GFP_ATOMIC);
 			if (new_skb == NULL) {
 				drops++;
 				goto drop_it;
@@ -838,8 +841,6 @@ static void bigmac_rx(struct bigmac *bp)
 					 RX_BUF_ALLOC_SIZE - 34,
 					 DMA_FROM_DEVICE);
 			bp->rx_skbs[elem] = new_skb;
-			new_skb->dev = bp->dev;
-			skb_put(new_skb, ETH_FRAME_LEN);
 			skb_reserve(new_skb, 34);
 			this->rx_addr =
 				dma_map_single(&bp->bigmac_op->dev,
@@ -849,16 +850,15 @@ static void bigmac_rx(struct bigmac *bp)
 			this->rx_flags =
 				(RXD_OWN | ((RX_BUF_ALLOC_SIZE - 34) & RXD_LENGTH));
 
-			/* Trim the original skb for the netif. */
-			skb_trim(skb, len);
+			skb_put(skb, len);
 		} else {
-			struct sk_buff *copy_skb = dev_alloc_skb(len + 2);
+			struct sk_buff *copy_skb =
+				netdev_alloc_skb_ip_align(dev, len);
 
 			if (copy_skb == NULL) {
 				drops++;
 				goto drop_it;
 			}
-			skb_reserve(copy_skb, 2);
 			skb_put(copy_skb, len);
 			dma_sync_single_for_cpu(&bp->bigmac_op->dev,
 						this->rx_addr, len,
diff --git a/drivers/net/sunbmac.h b/drivers/net/sunbmac.h
index 4943e97..63dab2f 100644
--- a/drivers/net/sunbmac.h
+++ b/drivers/net/sunbmac.h
@@ -334,22 +334,6 @@ struct bigmac {
 	struct net_device	*dev;
 };
 
-/* We use this to acquire receive skb's that we can DMA directly into. */
-#define ALIGNED_RX_SKB_ADDR(addr) \
-        ((((unsigned long)(addr) + (64 - 1)) & ~(64 - 1)) - (unsigned long)(addr))
-
-static inline struct sk_buff *big_mac_alloc_skb(unsigned int length, gfp_t gfp_flags)
-{
-	struct sk_buff *skb;
-
-	skb = alloc_skb(length + 64, gfp_flags);
-	if(skb) {
-		int offset = ALIGNED_RX_SKB_ADDR(skb->data);
-
-		if(offset)
-			skb_reserve(skb, offset);
-	}
-	return skb;
-}
+#define SUNBMAC_RX_ALIGNMENT 64
 
 #endif /* !(_SUNBMAC_H) */
-- 
1.7.5.4


^ permalink raw reply related

* [PATCH v2 14/46] net: sunbmac: cleanup magic '34'
From: Michał Mirosław @ 2011-07-11  0:52 UTC (permalink / raw)
  To: netdev
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

Offset of 34 bytes (32+2) after aligning the skb->data to 64 looks
suspicious. Remove the alignment, and use NET_IP_ALIGN instead of the magic.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/sunbmac.c |   32 ++++++++++++++------------------
 1 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/drivers/net/sunbmac.c b/drivers/net/sunbmac.c
index e28d3ea..efc5389 100644
--- a/drivers/net/sunbmac.c
+++ b/drivers/net/sunbmac.c
@@ -228,24 +228,21 @@ static void bigmac_init_rings(struct bigmac *bp, int from_irq)
 	for (i = 0; i < RX_RING_SIZE; i++) {
 		struct sk_buff *skb;
 
-		skb = __netdev_alloc_skb_aligned(dev,
-						 RX_BUF_ALLOC_SIZE,
-						 SUNBMAC_RX_ALIGNMENT,
-						 gfp_flags);
+		skb = __netdev_alloc_skb(dev, RX_BUF_ALLOC_SIZE,
+					 gfp_flags);
 		if (!skb)
 			continue;
 
 		bp->rx_skbs[i] = skb;
-
-		skb_reserve(skb, 34);
+		skb_reserve(skb, NET_IP_ALIGN);
 
 		bb->be_rxd[i].rx_addr =
 			dma_map_single(&bp->bigmac_op->dev,
 				       skb->data,
-				       RX_BUF_ALLOC_SIZE - 34,
+				       RX_BUF_ALLOC_SIZE - NET_IP_ALIGN,
 				       DMA_FROM_DEVICE);
 		bb->be_rxd[i].rx_flags =
-			(RXD_OWN | ((RX_BUF_ALLOC_SIZE - 34) & RXD_LENGTH));
+			(RXD_OWN | ((RX_BUF_ALLOC_SIZE - NET_IP_ALIGN) & RXD_LENGTH));
 	}
 
 	for (i = 0; i < TX_RING_SIZE; i++)
@@ -820,7 +817,7 @@ static void bigmac_rx(struct bigmac *bp)
 			/* Return it to the BigMAC. */
 			bp->enet_stats.rx_dropped++;
 			this->rx_flags =
-				(RXD_OWN | ((RX_BUF_ALLOC_SIZE - 34) & RXD_LENGTH));
+				(RXD_OWN | ((RX_BUF_ALLOC_SIZE - NET_IP_ALIGN) & RXD_LENGTH));
 			goto next;
 		}
 		skb = bp->rx_skbs[elem];
@@ -828,27 +825,26 @@ static void bigmac_rx(struct bigmac *bp)
 			struct sk_buff *new_skb;
 
 			/* Now refill the entry, if we can. */
-			new_skb = __netdev_alloc_skb_aligned(dev,
-							     RX_BUF_ALLOC_SIZE,
-							     SUNBMAC_RX_ALIGNMENT,
-							     GFP_ATOMIC);
+			new_skb = __netdev_alloc_skb(dev,
+						     RX_BUF_ALLOC_SIZE,
+						     GFP_ATOMIC);
 			if (new_skb == NULL) {
 				drops++;
 				goto drop_it;
 			}
 			dma_unmap_single(&bp->bigmac_op->dev,
 					 this->rx_addr,
-					 RX_BUF_ALLOC_SIZE - 34,
+					 RX_BUF_ALLOC_SIZE - NET_IP_ALIGN,
 					 DMA_FROM_DEVICE);
 			bp->rx_skbs[elem] = new_skb;
-			skb_reserve(new_skb, 34);
+			skb_reserve(new_skb, NET_IP_ALIGN);
 			this->rx_addr =
 				dma_map_single(&bp->bigmac_op->dev,
 					       new_skb->data,
-					       RX_BUF_ALLOC_SIZE - 34,
+					       RX_BUF_ALLOC_SIZE - NET_IP_ALIGN,
 					       DMA_FROM_DEVICE);
 			this->rx_flags =
-				(RXD_OWN | ((RX_BUF_ALLOC_SIZE - 34) & RXD_LENGTH));
+				(RXD_OWN | ((RX_BUF_ALLOC_SIZE - NET_IP_ALIGN) & RXD_LENGTH));
 
 			skb_put(skb, len);
 		} else {
@@ -870,7 +866,7 @@ static void bigmac_rx(struct bigmac *bp)
 
 			/* Reuse original ring buffer. */
 			this->rx_flags =
-				(RXD_OWN | ((RX_BUF_ALLOC_SIZE - 34) & RXD_LENGTH));
+				(RXD_OWN | ((RX_BUF_ALLOC_SIZE - NET_IP_ALIGN) & RXD_LENGTH));
 
 			skb = copy_skb;
 		}
-- 
1.7.5.4


^ permalink raw reply related

* [PATCH v2 11/46] net: sungem: cleanup RX skb allocation
From: Michał Mirosław @ 2011-07-11  0:52 UTC (permalink / raw)
  To: netdev
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/sungem.c |   34 +++++++++++-----------------------
 drivers/net/sungem.h |    4 +++-
 2 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c
index ade35dd..e82617f 100644
--- a/drivers/net/sungem.c
+++ b/drivers/net/sungem.c
@@ -743,21 +743,6 @@ static __inline__ void gem_post_rxds(struct gem *gp, int limit)
 	}
 }
 
-#define ALIGNED_RX_SKB_ADDR(addr) \
-        ((((unsigned long)(addr) + (64UL - 1UL)) & ~(64UL - 1UL)) - (unsigned long)(addr))
-static __inline__ struct sk_buff *gem_alloc_skb(struct net_device *dev, int size,
-						gfp_t gfp_flags)
-{
-	struct sk_buff *skb = alloc_skb(size + 64, gfp_flags);
-
-	if (likely(skb)) {
-		unsigned long offset = ALIGNED_RX_SKB_ADDR(skb->data);
-		skb_reserve(skb, offset);
-		skb->dev = dev;
-	}
-	return skb;
-}
-
 static int gem_rx(struct gem *gp, int work_to_do)
 {
 	struct net_device *dev = gp->dev;
@@ -821,7 +806,10 @@ static int gem_rx(struct gem *gp, int work_to_do)
 		if (len > RX_COPY_THRESHOLD) {
 			struct sk_buff *new_skb;
 
-			new_skb = gem_alloc_skb(dev, RX_BUF_ALLOC_SIZE(gp), GFP_ATOMIC);
+			new_skb = __netdev_alloc_skb_aligned(dev,
+							     RX_BUF_ALLOC_SIZE(gp),
+							     SUNGEM_RX_ALIGNMENT,
+							     GFP_ATOMIC);
 			if (new_skb == NULL) {
 				drops++;
 				goto drop_it;
@@ -830,7 +818,6 @@ static int gem_rx(struct gem *gp, int work_to_do)
 				       RX_BUF_ALLOC_SIZE(gp),
 				       PCI_DMA_FROMDEVICE);
 			gp->rx_skbs[entry] = new_skb;
-			skb_put(new_skb, (gp->rx_buf_sz + RX_OFFSET));
 			rxd->buffer = cpu_to_le64(pci_map_page(gp->pdev,
 							       virt_to_page(new_skb->data),
 							       offset_in_page(new_skb->data),
@@ -838,17 +825,16 @@ static int gem_rx(struct gem *gp, int work_to_do)
 							       PCI_DMA_FROMDEVICE));
 			skb_reserve(new_skb, RX_OFFSET);
 
-			/* Trim the original skb for the netif. */
-			skb_trim(skb, len);
+			skb_put(skb, len);
 		} else {
-			struct sk_buff *copy_skb = netdev_alloc_skb(dev, len + 2);
+			struct sk_buff *copy_skb =
+				netdev_alloc_skb_ip_align(dev, len);
 
 			if (copy_skb == NULL) {
 				drops++;
 				goto drop_it;
 			}
 
-			skb_reserve(copy_skb, 2);
 			skb_put(copy_skb, len);
 			pci_dma_sync_single_for_cpu(gp->pdev, dma_addr, len, PCI_DMA_FROMDEVICE);
 			skb_copy_from_linear_data(skb, copy_skb->data, len);
@@ -1637,7 +1623,10 @@ static void gem_init_rings(struct gem *gp)
 		struct sk_buff *skb;
 		struct gem_rxd *rxd = &gb->rxd[i];
 
-		skb = gem_alloc_skb(dev, RX_BUF_ALLOC_SIZE(gp), GFP_KERNEL);
+		skb = __netdev_alloc_skb_aligned(dev,
+						 RX_BUF_ALLOC_SIZE(gp),
+						 SUNGEM_RX_ALIGNMENT,
+						 GFP_KERNEL);
 		if (!skb) {
 			rxd->buffer = 0;
 			rxd->status_word = 0;
@@ -1645,7 +1634,6 @@ static void gem_init_rings(struct gem *gp)
 		}
 
 		gp->rx_skbs[i] = skb;
-		skb_put(skb, (gp->rx_buf_sz + RX_OFFSET));
 		dma_addr = pci_map_page(gp->pdev,
 					virt_to_page(skb->data),
 					offset_in_page(skb->data),
diff --git a/drivers/net/sungem.h b/drivers/net/sungem.h
index 835ce1b..0d486ce 100644
--- a/drivers/net/sungem.h
+++ b/drivers/net/sungem.h
@@ -935,7 +935,9 @@ struct gem_rxd {
 	  (GP)->tx_old - (GP)->tx_new - 1)
 
 #define RX_OFFSET          2
-#define RX_BUF_ALLOC_SIZE(gp)	((gp)->rx_buf_sz + 28 + RX_OFFSET + 64)
+#define SUNGEM_RX_ALIGNMENT 64		/* min: cache line size, see comment above */
+#define RX_BUF_ALLOC_SIZE(gp)	\
+	ALIGN((gp)->rx_buf_sz + RX_OFFSET, SUNGEM_RX_ALIGNMENT)
 
 #define RX_COPY_THRESHOLD  256
 
-- 
1.7.5.4


^ permalink raw reply related

* [PATCH v2 12/46] net: sunhme: cleanup RX skb allocation
From: Michał Mirosław @ 2011-07-11  0:52 UTC (permalink / raw)
  To: netdev
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/sunhme.c |   18 ++++++++----------
 drivers/net/sunhme.h |   14 +-------------
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c
index 856e05b..c73fdad 100644
--- a/drivers/net/sunhme.c
+++ b/drivers/net/sunhme.c
@@ -1265,7 +1265,8 @@ static void happy_meal_init_rings(struct happy_meal *hp)
 	for (i = 0; i < RX_RING_SIZE; i++) {
 		struct sk_buff *skb;
 
-		skb = happy_meal_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
+		skb = __netdev_alloc_skb_aligned(dev, RX_BUF_ALLOC_SIZE,
+			SUNHME_RX_ALIGNMENT, GFP_ATOMIC);
 		if (!skb) {
 			hme_write_rxd(hp, &hb->happy_meal_rxd[i], 0, 0);
 			continue;
@@ -1273,8 +1274,6 @@ static void happy_meal_init_rings(struct happy_meal *hp)
 		hp->rx_skbs[i] = skb;
 		skb->dev = dev;
 
-		/* Because we reserve afterwards. */
-		skb_put(skb, (ETH_FRAME_LEN + RX_OFFSET + 4));
 		hme_write_rxd(hp, &hb->happy_meal_rxd[i],
 			      (RXFLAG_OWN | ((RX_BUF_ALLOC_SIZE - RX_OFFSET) << 16)),
 			      dma_map_single(hp->dma_dev, skb->data, RX_BUF_ALLOC_SIZE,
@@ -2025,32 +2024,31 @@ static void happy_meal_rx(struct happy_meal *hp, struct net_device *dev)
 			struct sk_buff *new_skb;
 
 			/* Now refill the entry, if we can. */
-			new_skb = happy_meal_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
+			new_skb = __netdev_alloc_skb_aligned(dev,
+							     RX_BUF_ALLOC_SIZE,
+							     SUNHME_RX_ALIGNMENT,
+							     GFP_ATOMIC);
 			if (new_skb == NULL) {
 				drops++;
 				goto drop_it;
 			}
 			dma_unmap_single(hp->dma_dev, dma_addr, RX_BUF_ALLOC_SIZE, DMA_FROM_DEVICE);
 			hp->rx_skbs[elem] = new_skb;
-			new_skb->dev = dev;
-			skb_put(new_skb, (ETH_FRAME_LEN + RX_OFFSET + 4));
 			hme_write_rxd(hp, this,
 				      (RXFLAG_OWN|((RX_BUF_ALLOC_SIZE-RX_OFFSET)<<16)),
 				      dma_map_single(hp->dma_dev, new_skb->data, RX_BUF_ALLOC_SIZE,
 						     DMA_FROM_DEVICE));
 			skb_reserve(new_skb, RX_OFFSET);
 
-			/* Trim the original skb for the netif. */
-			skb_trim(skb, len);
+			skb_put(skb, len);
 		} else {
-			struct sk_buff *copy_skb = dev_alloc_skb(len + 2);
+			struct sk_buff *copy_skb = netdev_alloc_skb_ip_align(dev, len);
 
 			if (copy_skb == NULL) {
 				drops++;
 				goto drop_it;
 			}
 
-			skb_reserve(copy_skb, 2);
 			skb_put(copy_skb, len);
 			dma_sync_single_for_cpu(hp->dma_dev, dma_addr, len, DMA_FROM_DEVICE);
 			skb_copy_from_linear_data(skb, copy_skb->data, len);
diff --git a/drivers/net/sunhme.h b/drivers/net/sunhme.h
index 64f2783..f584eb0 100644
--- a/drivers/net/sunhme.h
+++ b/drivers/net/sunhme.h
@@ -495,18 +495,6 @@ struct quattro {
 	int			  nranges;
 };
 
-/* We use this to acquire receive skb's that we can DMA directly into. */
-#define ALIGNED_RX_SKB_ADDR(addr) \
-        ((((unsigned long)(addr) + (64UL - 1UL)) & ~(64UL - 1UL)) - (unsigned long)(addr))
-#define happy_meal_alloc_skb(__length, __gfp_flags) \
-({	struct sk_buff *__skb; \
-	__skb = alloc_skb((__length) + 64, (__gfp_flags)); \
-	if(__skb) { \
-		int __offset = (int) ALIGNED_RX_SKB_ADDR(__skb->data); \
-		if(__offset) \
-			skb_reserve(__skb, __offset); \
-	} \
-	__skb; \
-})
+#define SUNHME_RX_ALIGNMENT 64
 
 #endif /* !(_SUNHME_H) */
-- 
1.7.5.4


^ permalink raw reply related

* [PATCH v2 15/46] net/wireless: b43: use kfree_skb() for untouched skbs
From: Michał Mirosław @ 2011-07-11  0:52 UTC (permalink / raw)
  To: netdev; +Cc: Stefano Brivio, John W. Linville, linux-wireless
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/wireless/b43/dma.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/b43/dma.c b/drivers/net/wireless/b43/dma.c
index 15b11f0..ed78f14 100644
--- a/drivers/net/wireless/b43/dma.c
+++ b/drivers/net/wireless/b43/dma.c
@@ -592,7 +592,7 @@ static int setup_rx_descbuffer(struct b43_dmaring *ring,
 		/* ugh. try to realloc in zone_dma */
 		gfp_flags |= GFP_DMA;
 
-		dev_kfree_skb_any(skb);
+		kfree_skb(skb);
 
 		skb = __dev_alloc_skb(ring->rx_buffersize, gfp_flags);
 		if (unlikely(!skb))
@@ -602,7 +602,7 @@ static int setup_rx_descbuffer(struct b43_dmaring *ring,
 					 ring->rx_buffersize, 0);
 		if (b43_dma_mapping_error(ring, dmaaddr, ring->rx_buffersize, 0)) {
 			b43err(ring->dev->wl, "RX DMA buffer allocation failed\n");
-			dev_kfree_skb_any(skb);
+			kfree_skb(skb);
 			return -EIO;
 		}
 	}
@@ -645,7 +645,7 @@ static int alloc_initial_descbuffers(struct b43_dmaring *ring)
 		desc = ring->ops->idx2desc(ring, i, &meta);
 
 		unmap_descbuffer(ring, meta->dmaaddr, ring->rx_buffersize, 0);
-		dev_kfree_skb(meta->skb);
+		kfree_skb(meta->skb);
 	}
 	goto out;
 }
-- 
1.7.5.4


^ permalink raw reply related

* [PATCH v2 46/46] net: mark drivers that drop packets from rx queue head under memory pressure
From: Michał Mirosław @ 2011-07-11  0:52 UTC (permalink / raw)
  To: netdev
  Cc: Hartley Sweeten, Michael Chan, Eilon Greenstein, Guo-Fu Tseng,
	Realtek linux nic maintainers, Francois Romieu, Stephen Hemminger,
	Matt Carlson, Jon Mason
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/arm/ep93xx_eth.c    |    3 +++
 drivers/net/bnx2.c              |    3 +++
 drivers/net/bnx2x/bnx2x_cmn.c   |    3 +++
 drivers/net/cassini.c           |    3 +++
 drivers/net/jme.c               |    3 +++
 drivers/net/mlx4/en_rx.c        |    6 ++++++
 drivers/net/r8169.c             |    3 +++
 drivers/net/skge.c              |    3 +++
 drivers/net/sky2.c              |    2 ++
 drivers/net/tg3.c               |    2 ++
 drivers/net/tokenring/olympic.c |    2 ++
 drivers/net/vxge/vxge-main.c    |    3 +++
 12 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index ba3bf43..55a42c0 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -282,6 +282,9 @@ static int ep93xx_rx(struct net_device *dev, int processed, int budget)
 		if (rstat0 & RSTAT0_CRCI)
 			length -= 4;
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 		skb = dev_alloc_skb(length + 2);
 		if (likely(skb != NULL)) {
 			struct ep93xx_rdesc *rxd = &ep->descs->rdesc[entry];
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index d627886..14f9a5f 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -2952,6 +2952,9 @@ bnx2_rx_skb(struct bnx2 *bp, struct bnx2_rx_ring_info *rxr, struct sk_buff *skb,
 	int err;
 	u16 prod = ring_idx & 0xffff;
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 	err = bnx2_alloc_rx_skb(bp, rxr, prod, GFP_ATOMIC);
 	if (unlikely(err)) {
 		bnx2_reuse_rx_skb(bp, rxr, skb, (u16) (ring_idx >> 16), prod);
diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
index 4f9164c..a6da01a 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/bnx2x/bnx2x_cmn.c
@@ -673,6 +673,9 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 				goto reuse_rx;
 			}
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 			/* Since we don't have a jumbo ring
 			 * copy small packets if mtu > 1500
 			 */
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index 788ab13..a05a490 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -1975,6 +1975,9 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc,
 	else
 		alloclen = max(hlen, RX_COPY_MIN);
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 	skb = dev_alloc_skb(alloclen + swivel + cp->crc_size);
 	if (skb == NULL)
 		return -1;
diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index ad69dae..e9ac9bd 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -1022,6 +1022,9 @@ jme_alloc_and_feed_skb(struct jme_adapter *jme, int idx)
 	dma_unmap_single(&jme->pdev->dev, rxbi->mapping, rxbi->len,
 			 DMA_FROM_DEVICE);
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 	if (unlikely(jme_make_new_rx_buf(jme, idx))) {
 		rxbi->mapping = dma_map_single(&jme->pdev->dev, skb->data,
 						rxbi->len, DMA_FROM_DEVICE);
diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c
index ee15295..e2baa3f 100644
--- a/drivers/net/mlx4/en_rx.c
+++ b/drivers/net/mlx4/en_rx.c
@@ -413,6 +413,9 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 	int nr;
 	dma_addr_t dma;
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 	/* Collect used fragments while replacing them in the HW descirptors */
 	for (nr = 0; nr < priv->num_frags; nr++) {
 		frag_info = &priv->frag_info[nr];
@@ -462,6 +465,9 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
 	int used_frags;
 	dma_addr_t dma;
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 	skb = dev_alloc_skb(SMALL_PACKET_SIZE + NET_IP_ALIGN);
 	if (!skb) {
 		en_dbg(RX_ERR, priv, "Failed allocating skb\n");
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index e2c2884..ce4bdaf 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -4999,6 +4999,9 @@ static struct sk_buff *rtl8169_try_rx_copy(void *data,
 	struct sk_buff *skb;
 	struct device *d = &tp->pci_dev->dev;
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 	data = rtl8169_align(data);
 	dma_sync_single_for_cpu(d, addr, pkt_size, DMA_FROM_DEVICE);
 	prefetch(data);
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 11e5229..79bf015 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -3022,6 +3022,9 @@ static struct sk_buff *skge_rx_get(struct net_device *dev,
 	if (phy_length(skge->hw, status) != len)
 		goto error;
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 	if (len < RX_COPY_THRESHOLD) {
 		skb = netdev_alloc_skb_ip_align(dev, len);
 		if (!skb)
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 5f720b9..aaf56e4 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -2540,6 +2540,8 @@ okay:
 		skb = receive_copy(sky2, re, length);
 	else
 		skb = receive_new(sky2, re, length);
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
 
 	dev->stats.rx_dropped += (skb == NULL);
 
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index b43d473..42e0d31 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -4973,6 +4973,8 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
 
 			skb_size = tg3_alloc_rx_skb(tp, tpr, opaque_key,
 						    *post_ptr);
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
 			if (skb_size < 0)
 				goto drop_it;
 
diff --git a/drivers/net/tokenring/olympic.c b/drivers/net/tokenring/olympic.c
index 30fb6e8..e0c3cca 100644
--- a/drivers/net/tokenring/olympic.c
+++ b/drivers/net/tokenring/olympic.c
@@ -797,6 +797,8 @@ static void olympic_rx(struct net_device *dev)
 				}
 
 				if (skb == NULL) {
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
 					printk(KERN_WARNING "%s: Not enough memory to copy packet to upper layers.\n",dev->name) ;
 					dev->stats.rx_dropped++;
 					/* Update counters even though we don't transfer the frame */
diff --git a/drivers/net/vxge/vxge-main.c b/drivers/net/vxge/vxge-main.c
index 00d435d..00a9003 100644
--- a/drivers/net/vxge/vxge-main.c
+++ b/drivers/net/vxge/vxge-main.c
@@ -414,6 +414,9 @@ vxge_rx_1b_compl(struct __vxge_hw_ring *ringh, void *dtr,
 			}
 		}
 
+#warning drops packets from rx queue head on memory pressure
+#warning (like dev_skb_finish_rx_dma_refill() users)
+
 		if (pkt_length > VXGE_LL_RX_COPY_THRESHOLD) {
 			if (vxge_rx_alloc(dtr, ring, data_size) != NULL) {
 				if (!vxge_rx_map(dtr, ring)) {
-- 
1.7.5.4


^ permalink raw reply related

* [PATCH net-next-2.6] net: introduce build_skb()
From: Eric Dumazet @ 2011-07-11  5:46 UTC (permalink / raw)
  To: Michał Mirosław; +Cc: netdev
In-Reply-To: <ae7b531c89a01a21e4374907b69f4d997c9d5d1b.1310339688.git.mirq-linux@rere.qmqm.pl>

Le lundi 11 juillet 2011 à 02:52 +0200, Michał Mirosław a écrit :
> Introduce __netdev_alloc_skb_aligned() to return skb with skb->data
> aligned at specified 2^n multiple.
> 
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> ---

Hi Michal


Could we synchronize our work to not introduce things that might
disappear shortly ?

Here is the RFC patch about build_skb() :

[PATCH] net: introduce build_skb()

One of the things we discussed during the netdev 2011 conference was the idea
to change network drivers to allocate/populate their skb at RX
completion time, right before feeding the skb to network stack.

Right now, we allocate skbs when populating the RX ring, and that's a
waste of CPU cache, since allocating an skb means a full memset() to clear
the skb and its skb_shared_info portion. By the time the NIC fills a frame
in the data buffer and the host can get it, the CPU has probably thrown away
the cache lines from its caches, because of huge RX ring sizes.

So the deal would be to allocate only the data buffer for the NIC to
populate its RX ring buffer. And use build_skb() at RX completion to
attach a data buffer (now filled with an ethernet frame) to a new skb,
initialize the skb_shared_info portion, and give the hot skb to network
stack.

build_skb() is the function that allocates an skb, with the caller providing
the data buffer that should be attached to it. Drivers are expected to call
skb_reserve() right after build_skb() to let skb->data point to the
Ethernet frame (usually skipping NET_SKB_PAD and NET_IP_ALIGN).


Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/skbuff.h |    1 
 net/core/skbuff.c      |   48 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 32ada53..5e903e7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -507,6 +507,7 @@ static inline struct rtable *skb_rtable(const struct sk_buff *skb)
 extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
+extern struct sk_buff *build_skb(void *data, unsigned int size);
 extern struct sk_buff *__alloc_skb(unsigned int size,
 				   gfp_t priority, int fclone, int node);
 static inline struct sk_buff *alloc_skb(unsigned int size,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d220119..9193d7e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -234,6 +234,54 @@ nodata:
 EXPORT_SYMBOL(__alloc_skb);
 
 /**
+ * build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @size: size of data buffer, not including skb_shared_info
+ *
+ * Allocate a new &sk_buff. Caller provides space holding head and
+ * skb_shared_info. Mostly used in driver RX path.
+ * The return is the buffer. On a failure the return is %NULL.
+ * Notes :
+ *  Before IO, driver allocates only data buffer where NIC puts incoming frame
+ *  Driver SHOULD add room at head (NET_SKB_PAD) and
+ *  MUST add room at tail (to hold skb_shared_info)
+ *  After IO, driver calls build_skb() to get a hot skb instead of a cold one
+ *  before giving packet to stack. RX rings only contain data buffers, not
+ *  full skbs.
+ */
+struct sk_buff *build_skb(void *data, unsigned int size)
+{
+	struct skb_shared_info *shinfo;
+	struct sk_buff *skb;
+
+	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	size = SKB_DATA_ALIGN(size);
+
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	skb->truesize = size + sizeof(struct sk_buff);
+	atomic_set(&skb->users, 1);
+	skb->head = data;
+	skb->data = data;
+	skb_reset_tail_pointer(skb);
+	skb->end = skb->tail + size;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->mac_header = ~0U;
+#endif
+
+	/* make sure we initialize shinfo sequentially */
+	shinfo = skb_shinfo(skb);
+	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+	atomic_set(&shinfo->dataref, 1);
+	kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+	return skb;
+}
+EXPORT_SYMBOL(build_skb);
+
+/**
  *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
  *	@dev: network device to receive on
  *	@length: length to allocate



^ permalink raw reply related

* Re: [PATCH v2 46/46] net: mark drivers that drop packets from rx queue head under memory pressure
From: Francois Romieu @ 2011-07-11  5:40 UTC (permalink / raw)
  To: Michał Mirosław; +Cc: netdev, Realtek linux nic maintainers
In-Reply-To: <c84a04a957128ae664f4a80da23fad4b9f71a85f.1310339689.git.mirq-linux@rere.qmqm.pl>

Michał Mirosław <mirq-linux@rere.qmqm.pl> :
[...]
> diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
> index e2c2884..ce4bdaf 100644
> --- a/drivers/net/r8169.c
> +++ b/drivers/net/r8169.c
> @@ -4999,6 +4999,9 @@ static struct sk_buff *rtl8169_try_rx_copy(void *data,
>  	struct sk_buff *skb;
>  	struct device *d = &tp->pci_dev->dev;
>  
> +#warning drops packets from rx queue head on memory pressure
> +#warning (like dev_skb_finish_rx_dma_refill() users)
> +
>  	data = rtl8169_align(data);
>  	dma_sync_single_for_cpu(d, addr, pkt_size, DMA_FROM_DEVICE);
>  	prefetch(data);

The commit messages explain why the driver works this way. I'd rather avoid the
noise and - especially - the dubious patches it may generate.

-- 
Ueimor

^ permalink raw reply

* Re: [RFC PATCH 1/1] BPF JIT for PPC64
From: Matt Evans @ 2011-07-11  6:21 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, linuxppc-dev
In-Reply-To: <1308967114.3093.1379.camel@localhost>

On 25/06/11 11:58, Ben Hutchings wrote:
> On Fri, 2011-06-24 at 16:02 +1000, Matt Evans wrote:
> [...]
>> +		case BPF_S_ALU_ADD_K: /* A += K; */
>> +			if (!K)
>> +				break;
>> +			if (K < 32768)
>> +				PPC_ADDI(r_A, r_A, K);
>> +			else
>> +				PPC_ADDI(r_A, r_A, IMM_L(K));
>> +				PPC_ADDIS(r_A, r_A, IMM_HA(K));
>> +			break;
> 
> Missing braces.
> 
>> +		case BPF_S_ALU_SUB_X: /* A -= X; */
>> +			ctx->seen |= SEEN_XREG;
>> +			PPC_SUB(r_A, r_A, r_X);
>> +			break;
>> +		case BPF_S_ALU_SUB_K: /* A -= K */
>> +			if (!K)
>> +				break;
>> +			if (K < 32768)
>> +				PPC_ADDI(r_A, r_A, -K);
>> +			else
>> +				PPC_ADDI(r_A, r_A, IMM_L(-K));
>> +				PPC_ADDIS(r_A, r_A, IMM_HA(-K));
>> +			break;
> [...]
> 
> Here as well.

Thanks, Ben -- oops! :)  Really, just the ADDISes need to be conditional, too.


Cheers,


Matt

^ permalink raw reply

* Re: [RFC PATCH 1/1] BPF JIT for PPC64
From: Matt Evans @ 2011-07-11  6:27 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: netdev, linuxppc-dev
In-Reply-To: <m2boxmxt45.fsf@igel.home>

On 25/06/11 17:33, Andreas Schwab wrote:
> Matt Evans <matt@ozlabs.org> writes:
> 
>> +	stdu	r1, -128(r1);					\
> 
>> +	addi	r5, r1, 128+BPF_PPC_STACK_BASIC+(2*8);		\
> 
>> +	addi	r1, r1, 128;					\
> 
>> +					PPC_STD(r_M + i, 1, -128 + (8*i));
> 
>> +					PPC_LD(r_M + i, 1, -128 + (8*i));
> 
> s/128/BPF_PPC_STACK_SAVE/?

Actually, that's a different 128, but that nicely illustrates that I should've
#defined something more recognisable :-) The second set, with -128, is actually
in the save area for non-volatile regs, whereas the first is just a stackframe
size.

Cheers,

Matt

^ permalink raw reply

* Re: [RFC 43/72] a2065/ariadne: Move the a2065/ariadne drivers
From: Geert Uytterhoeven @ 2011-07-11  6:33 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: davem@davemloft.net, netdev@vger.kernel.org
In-Reply-To: <1310345295.26989.76.camel@jtkirshe-mobl>

On Mon, Jul 11, 2011 at 02:48, Jeff Kirsher <jeffrey.t.kirsher@intel.com> wrote:
> On Sun, 2011-07-10 at 12:34 -0700, Geert Uytterhoeven wrote:
>> On Sat, Jul 9, 2011 at 16:30, Jeff Kirsher
>> <jeffrey.t.kirsher@intel.com> wrote:
>> > On Tue, 2011-06-28 at 13:33 -0700, Geert Uytterhoeven wrote:
>> >> And (in some other patch) 82596.c is an Intel driver, not a
>> Motorola driver.
>> >
>> > 82596.c is not an Intel driver, it is an Intel part.  The driver was
>>
>> Sorry, I meant "driver for an Intel part".
>>
>> > written and support by someone other than Intel.  I am looking at
>> how to
>>
>> Sure. But I'm strongly against classifying drivers based on who wrote
>> them ;-)
>
> I agree to some extent, because if that were the case, we would have a
> donald_becker/ directory for several of the drivers. :)
>
> Here is one of the problem's I keep running into and there is no simple
> answer.  While most of the drivers can be grouped together by the
> hardware they use, that does not work "logically" for every driver.
>
> In addition, if vendor 'A' makes a part and vendor 'B' uses same part in
> a device/system/NIC and vendor 'B' creates the driver, supports the
> driver and maintains the driver.  Should the part be categorized under
> vendor 'A'?  IMHO, I think it should be categorized as a vendor 'B'
> driver.

Several of the Ethernet drivers are of a third type: vendor A chip, vendor B
card, entity C software.

> I started this work with the idea of trying to organize the drivers in
> the same way that the drivers were to be in the Kconfig, which tended to
> be drivers/net/ethernet/<manufacturer>.
>
> One of the problems that arise in this organization is what do we do
> when vendor A is bought by vendor B, and vendor B takes on the support
> of all the old vendor A parts/drivers?

We don't care. We don't sort drivers by who supports them. Eventually, vendors
lose interest and they all end up under "Linux kernel community" anyway.

> So I am open to suggestions.  The process I have using to organize the
> drivers has been to group drivers that use common libraries and/or code
> first, then group by either manufacturer, maintainer, or common
> platform.
>
> I would like to keep the lasi_82506.c, sni_82596.c, 82506.c and similar
> drivers out of the intel/ directory because we would not be supporting
> the drivers and they are not similar to our drivers that we do support
> that would be in the intel/ directory.
>
> Again, I open to suggestions on how to best organize these types of
> drivers.  Maybe create a misc/ or <bus_type>/ for these types of
> drivers?

"Similar" drivers should be together and consolidated (if someone has time
to do it). They can even be of different brands.
I.e. not all Tulip-compatibles were manufactured by Digital or Intel.

>> > better organize the 82596.c, lasi_82596.c, lib82596.c, and
>> sni_82596.c
>> > which all use an Intel Ethernet chip but were written and supported
>> by
>> > someone other than Intel.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH v2 46/46] net: mark drivers that drop packets from rx queue head under memory pressure
From: Eilon Greenstein @ 2011-07-11  6:47 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: netdev@vger.kernel.org, Hartley Sweeten, Michael Chan,
	Guo-Fu Tseng, Realtek linux nic maintainers, Francois Romieu,
	Stephen Hemminger, Matthew Carlson, Jon Mason
In-Reply-To: <c84a04a957128ae664f4a80da23fad4b9f71a85f.1310339689.git.mirq-linux@rere.qmqm.pl>

On Sun, 2011-07-10 at 17:52 -0700, Michał Mirosław wrote:
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> ---

> diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
> index 4f9164c..a6da01a 100644
> --- a/drivers/net/bnx2x/bnx2x_cmn.c
> +++ b/drivers/net/bnx2x/bnx2x_cmn.c
> @@ -673,6 +673,9 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
>  				goto reuse_rx;
>  			}
>  
> +#warning drops packets from rx queue head on memory pressure
> +#warning (like dev_skb_finish_rx_dma_refill() users)
> +

We have the dropless_fc module parameter that can be configured if the
user prefers pausing on host memory pressure - the problem with that
feature is that it is enough that one of the rings runs out of memory and
the entire port is stopped. When running with 16 rings, this can lead to
serious throughput degradation - this is why it is kept as a user
configurable option.

>  			/* Since we don't have a jumbo ring
>  			 * copy small packets if mtu > 1500
>  			 */




^ permalink raw reply

* Re: [PATCH v2 00/46] Clean up RX copybreak and DMA handling
From: David Miller @ 2011-07-11  6:54 UTC (permalink / raw)
  To: mirq-linux; +Cc: netdev
In-Reply-To: <cover.1310339688.git.mirq-linux@rere.qmqm.pl>

From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Mon, 11 Jul 2011 02:52:46 +0200 (CEST)

>   1. under packet storm and memory pressure NIC keeps generating interrupts
>      (if non-NAPI) and indicating new buffers because it always has free
>      RX buffers --- this only wastes CPU and bus bandwidth transferring
>      data that is going to be immediately discarded;

Actually, this is exactly how I, and others advise people to implement
drivers.  It is the right thing to do.

The worst thing that can happen is to let the RX ring empty of
buffers.  Some cards hang as a result of this, and also it causes head
of line blocking on multiqueue cards, etc.

So the first thing the driver should do is try to allocate a
replacement buffer.

And if that fails, it should give the RX packet right back to the
card, and not pass it up the stack.

^ permalink raw reply

* Re: [RFC PATCH 1/1] BPF JIT for PPC64
From: Matt Evans @ 2011-07-11  7:04 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: linuxppc-dev, netdev
In-Reply-To: <1308988180.2532.27.camel@edumazet-laptop>

Hi Eric,

On 25/06/11 17:49, Eric Dumazet wrote:
> Le samedi 25 juin 2011 à 09:33 +0200, Andreas Schwab a écrit :
>> Matt Evans <matt@ozlabs.org> writes:
>>
>>> +	stdu	r1, -128(r1);					\
>>
>>> +	addi	r5, r1, 128+BPF_PPC_STACK_BASIC+(2*8);		\
>>
>>> +	addi	r1, r1, 128;					\
>>
>>> +					PPC_STD(r_M + i, 1, -128 + (8*i));
>>
>>> +					PPC_LD(r_M + i, 1, -128 + (8*i));
>>
>> s/128/BPF_PPC_STACK_SAVE/?
>>
> 
> I am not sure using registers to hold MEM[] is a win if MEM[idx] is used
> once in the filter
> 
> # tcpdump "tcp[20]+tcp[21]=0" -d
> (000) ldh      [12]
> (001) jeq      #0x800           jt 2	jf 15
> (002) ldb      [23]
> (003) jeq      #0x6             jt 4	jf 15
> (004) ldh      [20]
> (005) jset     #0x1fff          jt 15	jf 6
> (006) ldxb     4*([14]&0xf)
> (007) ldb      [x + 34]
> (008) st       M[1]
> (009) ldb      [x + 35]
> (010) tax      
> (011) ld       M[1]
> (012) add      x
> (013) jeq      #0x0             jt 14	jf 15
> (014) ret      #65535
> (015) ret      #0
> 
> In this sample, we use M[1] once ( one store, one load)
> 
> So saving previous register content on stack in prologue, and restoring
> it in epilogue actually slow down the code, and adds two instructions in
> filter asm code.

The x86 version generates all accesses of M[x] as a load or store to the
stackframe, so your example would result in one store + one load to/from the
stack.  More than one access would result in N stores/loads.  By having M[] live
in r16-31 on PPC, there is a one-off cost of saving/restoring the (non-volatile)
register to the stack plus a per-use cost of a register-register move (MR, which
is pretty cheap compared to loads/stores!).

You are correct in that, for a single store/load of M[1] above, I'll generate
a STD, MR, MR, LD, but this extra cost of the two MRs is pretty small.  With the
current implementation the gains seen when accessing M[x] /more/ than once will
IMHO more than justify this.  (For several M[x] accesses, x86 would have several
mem ops, PPC would have several reg-reg moves and one load+store.)

An obvious alternative would be to use some of the three free volatile registers
first (in the hope that "most" filters won't use >3 M[] locations), with a
simple register allocator.  This would save the (non-volatile reg) spill/fill to
stack, too.  In the interest of simplicity I didn't want to do that on a first
pass.

> This also makes epilogue code not easy (not possible as a matter of fact)
> to unwind in helper function
> 
> In x86_64 implementation, I chose bpf_error be able to force
> an exception, not returning to JIT code but directly to bpf_func() caller
> 
> bpf_error:
> # force a return 0 from jit handler
>         xor             %eax,%eax
>         mov             -8(%rbp),%rbx
>         leaveq
>         ret

Yep, if I use non-volatile regs a return isn't just a simple stack "pop".
Currently, I've an extra branch in the return path to hit the common epilogue.
This could be optimised such that the out of line error path jumps directly to
the common epilogue to return (rather than back to the callsite, checking a flag
and /then/ to the epilogue) to speed up the non-error case.  However, it's just
a question of getting to the (existing) epilogue code to clean up; it doesn't
need to be unwound in the helper function.  I don't think this issue is a
strong argument against having M[] exist in registers, though.

From the current position, I think going in the direction of using volatile regs
(without backup/restore cost) is better than going in the direction of making
all M[] references stack accesses.  Do you think it's bearable to continue as it
is and then perform that optimisation later?

Also, thanks for reading/commenting on the patch!

Cheers,

Matt

^ permalink raw reply

* Re: [PATCH v2 03/46] net drivers: remove unnecessary dma_sync_to_device(DMA_FROM_DEVICE)
From: Vlad Zolotarov @ 2011-07-11  8:30 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: linux-wireless@vger.kernel.org, Eilon Greenstein, Gary Zambrano,
	Stephen Hemminger, Stefano Brivio,
	e1000-devel@lists.sourceforge.net, Matthew Carlson,
	Jesse Brandeburg, Francois Romieu, Realtek linux nic maintainers,
	John W. Linville, Ron Mercer, Michael Chan, Jitendra Kalsaria,
	Divy Le Ray, netdev@vger.kernel.org, Bruce Allan, Hartley Sweeten,
	John Ronciak, Jon 
In-Reply-To: <6f2d2a58a4932cd93a0291fac9636d3681d69031.1310339688.git.mirq-linux@rere.qmqm.pl>

>         prod_rx_buf->skb = skb;
> diff --git a/drivers/net/bnx2x/bnx2x_cmn.h b/drivers/net/bnx2x/bnx2x_cmn.h
> index c016e20..c9e49a0 100644
> --- a/drivers/net/bnx2x/bnx2x_cmn.h
> +++ b/drivers/net/bnx2x/bnx2x_cmn.h
> @@ -923,16 +923,11 @@ static inline int bnx2x_alloc_rx_skb(struct bnx2x
> *bp, static inline void bnx2x_reuse_rx_skb(struct bnx2x_fastpath *fp,
>                                       u16 cons, u16 prod)
>  {
> -       struct bnx2x *bp = fp->bp;
>         struct sw_rx_bd *cons_rx_buf = &fp->rx_buf_ring[cons];
>         struct sw_rx_bd *prod_rx_buf = &fp->rx_buf_ring[prod];
>         struct eth_rx_bd *cons_bd = &fp->rx_desc_ring[cons];
>         struct eth_rx_bd *prod_bd = &fp->rx_desc_ring[prod];
> 
> -       dma_sync_single_for_device(&bp->pdev->dev,
> -                                  dma_unmap_addr(cons_rx_buf, mapping),
> -                                  RX_COPY_THRESH, DMA_FROM_DEVICE);
> -
>         dma_unmap_addr_set(prod_rx_buf, mapping,
>                            dma_unmap_addr(cons_rx_buf, mapping));
>         prod_rx_buf->skb = cons_rx_buf->skb;

Michal, pls., note that this function is only called for buffers which were 
previously dma_synced towards CPU (your "[PATCH v2 05/46] net: bnx2x: fix DMA 
sync direction" properly fixes the direction of the first call which was 
incorrect). Then, according to the 3rd edition of the "Linux Device Drivers"
book, chapter 15, section "Setting up streaming DMA mappings", end of page
449, when we call dma_sync_single_for_cpu() the buffer ownership passes to
the CPU and the CPU may safely access the buffer (in particular, we read it). Then
the author says: "Before the device accesses the buffer, however, ownership
should be transferred back to it with: dma_sync_single_for_device()".

The DMA-API.txt document you've referenced doesn't refer to the above function,
so it's unclear how your fix may be based on it. On the other hand, it clearly
contradicts the "Linux Device Drivers" book.

Pls., comment.

thanks,
vlad


------------------------------------------------------------------------------
All of the data generated in your IT infrastructure is seriously valuable.
Why? It contains a definitive record of application performance, security 
threats, fraudulent activity, and more. Splunk takes this data and makes 
sense of it. IT sense. And common sense.
http://p.sf.net/sfu/splunk-d2d-c2
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* [PATCH 2/2] ipv4: Use universal hash for ARP.
From: David Miller @ 2011-07-11  8:48 UTC (permalink / raw)
  To: roland; +Cc: johnwheffner, mj, netdev


We need to make sure the multiplier is odd.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/arp.h    |    7 +++++++
 net/core/neighbour.c |    1 +
 net/ipv4/arp.c       |    3 +--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/net/arp.h b/include/net/arp.h
index 91f0568..723bde5 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -8,6 +8,13 @@
 
 extern struct neigh_table arp_tbl;
 
+static inline u32 arp_hashfn(u32 key, const struct net_device *dev, u32 hash_rnd)
+{
+	u32 val = key ^ dev->ifindex;
+
+	return val * hash_rnd;
+}
+
 extern void	arp_init(void);
 extern int	arp_find(unsigned char *haddr, struct sk_buff *skb);
 extern int	arp_ioctl(struct net *net, unsigned int cmd, void __user *arg);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 4d5fc94..50bd960 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -334,6 +334,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
 	ret->hash_buckets = buckets;
 	ret->hash_shift = shift;
 	get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
+	ret->hash_rnd |= 1;
 	return ret;
 }
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 1b74d3b..4412b57 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -97,7 +97,6 @@
 #include <linux/init.h>
 #include <linux/net.h>
 #include <linux/rcupdate.h>
-#include <linux/jhash.h>
 #include <linux/slab.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
@@ -232,7 +231,7 @@ static u32 arp_hash(const void *pkey,
 		    const struct net_device *dev,
 		    __u32 hash_rnd)
 {
-	return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
+	return arp_hashfn(*(u32 *)pkey, dev, hash_rnd);
 }
 
 static int arp_constructor(struct neighbour *neigh)
-- 
1.7.6


^ permalink raw reply related

* [PATCH 1/2] neigh: Store hash shift instead of mask.
From: David Miller @ 2011-07-11  8:48 UTC (permalink / raw)
  To: roland; +Cc: johnwheffner, mj, netdev


And mask the hash function result by simply shifting
down the "->hash_shift" most significant bits.

Currently which bits we use is arbitrary since jhash
produces entropy evenly across the whole hash function
result.

But soon we'll be using universal hashing functions,
and in those cases more entropy exists in the higher
bits than the lower bits, because they use multiplies.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h |    2 +-
 net/core/neighbour.c    |   47 +++++++++++++++++++++++------------------------
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 4014b62..6fe8c2c 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -142,7 +142,7 @@ struct pneigh_entry {
 
 struct neigh_hash_table {
 	struct neighbour __rcu	**hash_buckets;
-	unsigned int		hash_mask;
+	unsigned int		hash_shift;
 	__u32			hash_rnd;
 	struct rcu_head		rcu;
 };
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ceb505b..4d5fc94 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -137,7 +137,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 	write_lock_bh(&tbl->lock);
 	nht = rcu_dereference_protected(tbl->nht,
 					lockdep_is_held(&tbl->lock));
-	for (i = 0; i <= nht->hash_mask; i++) {
+	for (i = 0; i < (1 << nht->hash_shift); i++) {
 		struct neighbour *n;
 		struct neighbour __rcu **np;
 
@@ -210,7 +210,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 	nht = rcu_dereference_protected(tbl->nht,
 					lockdep_is_held(&tbl->lock));
 
-	for (i = 0; i <= nht->hash_mask; i++) {
+	for (i = 0; i < (1 << nht->hash_shift); i++) {
 		struct neighbour *n;
 		struct neighbour __rcu **np = &nht->hash_buckets[i];
 
@@ -312,9 +312,9 @@ out_entries:
 	goto out;
 }
 
-static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
 {
-	size_t size = entries * sizeof(struct neighbour *);
+	size_t size = (1 << shift) * sizeof(struct neighbour *);
 	struct neigh_hash_table *ret;
 	struct neighbour __rcu **buckets;
 
@@ -332,7 +332,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
 		return NULL;
 	}
 	ret->hash_buckets = buckets;
-	ret->hash_mask = entries - 1;
+	ret->hash_shift = shift;
 	get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
 	return ret;
 }
@@ -342,7 +342,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
 	struct neigh_hash_table *nht = container_of(head,
 						    struct neigh_hash_table,
 						    rcu);
-	size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
+	size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
 	struct neighbour __rcu **buckets = nht->hash_buckets;
 
 	if (size <= PAGE_SIZE)
@@ -353,21 +353,20 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
 }
 
 static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
-						unsigned long new_entries)
+						unsigned long new_shift)
 {
 	unsigned int i, hash;
 	struct neigh_hash_table *new_nht, *old_nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, hash_grows);
 
-	BUG_ON(!is_power_of_2(new_entries));
 	old_nht = rcu_dereference_protected(tbl->nht,
 					    lockdep_is_held(&tbl->lock));
-	new_nht = neigh_hash_alloc(new_entries);
+	new_nht = neigh_hash_alloc(new_shift);
 	if (!new_nht)
 		return old_nht;
 
-	for (i = 0; i <= old_nht->hash_mask; i++) {
+	for (i = 0; i < (1 << old_nht->hash_shift); i++) {
 		struct neighbour *n, *next;
 
 		for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
@@ -377,7 +376,7 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
 			hash = tbl->hash(n->primary_key, n->dev,
 					 new_nht->hash_rnd);
 
-			hash &= new_nht->hash_mask;
+			hash >>= (32 - new_nht->hash_shift);
 			next = rcu_dereference_protected(n->next,
 						lockdep_is_held(&tbl->lock));
 
@@ -406,7 +405,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 
 	rcu_read_lock_bh();
 	nht = rcu_dereference_bh(tbl->nht);
-	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
+	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
 
 	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
 	     n != NULL;
@@ -436,7 +435,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
 
 	rcu_read_lock_bh();
 	nht = rcu_dereference_bh(tbl->nht);
-	hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask;
+	hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
 
 	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
 	     n != NULL;
@@ -492,10 +491,10 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
 	nht = rcu_dereference_protected(tbl->nht,
 					lockdep_is_held(&tbl->lock));
 
-	if (atomic_read(&tbl->entries) > (nht->hash_mask + 1))
-		nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1);
+	if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
+		nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
 
-	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
+	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
 
 	if (n->parms->dead) {
 		rc = ERR_PTR(-EINVAL);
@@ -784,7 +783,7 @@ static void neigh_periodic_work(struct work_struct *work)
 				neigh_rand_reach_time(p->base_reachable_time);
 	}
 
-	for (i = 0 ; i <= nht->hash_mask; i++) {
+	for (i = 0 ; i < (1 << nht->hash_shift); i++) {
 		np = &nht->hash_buckets[i];
 
 		while ((n = rcu_dereference_protected(*np,
@@ -1540,7 +1539,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
 		panic("cannot create neighbour proc dir entry");
 #endif
 
-	RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(8));
+	RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
 
 	phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
 	tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
@@ -1857,7 +1856,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
 		rcu_read_lock_bh();
 		nht = rcu_dereference_bh(tbl->nht);
 		ndc.ndtc_hash_rnd = nht->hash_rnd;
-		ndc.ndtc_hash_mask = nht->hash_mask;
+		ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
 		rcu_read_unlock_bh();
 
 		NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
@@ -2200,7 +2199,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 	rcu_read_lock_bh();
 	nht = rcu_dereference_bh(tbl->nht);
 
-	for (h = 0; h <= nht->hash_mask; h++) {
+	for (h = 0; h < (1 << nht->hash_shift); h++) {
 		if (h < s_h)
 			continue;
 		if (h > s_h)
@@ -2264,7 +2263,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
 	nht = rcu_dereference_bh(tbl->nht);
 
 	read_lock(&tbl->lock); /* avoid resizes */
-	for (chain = 0; chain <= nht->hash_mask; chain++) {
+	for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
 		struct neighbour *n;
 
 		for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
@@ -2286,7 +2285,7 @@ void __neigh_for_each_release(struct neigh_table *tbl,
 
 	nht = rcu_dereference_protected(tbl->nht,
 					lockdep_is_held(&tbl->lock));
-	for (chain = 0; chain <= nht->hash_mask; chain++) {
+	for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
 		struct neighbour *n;
 		struct neighbour __rcu **np;
 
@@ -2323,7 +2322,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 	int bucket = state->bucket;
 
 	state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
-	for (bucket = 0; bucket <= nht->hash_mask; bucket++) {
+	for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
 		n = rcu_dereference_bh(nht->hash_buckets[bucket]);
 
 		while (n) {
@@ -2390,7 +2389,7 @@ next:
 		if (n)
 			break;
 
-		if (++state->bucket > nht->hash_mask)
+		if (++state->bucket >= (1 << nht->hash_shift))
 			break;
 
 		n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
-- 
1.7.6


^ permalink raw reply related

* Re: [PATCH v2 00/46] Clean up RX copybreak and DMA handling
From: Michał Mirosław @ 2011-07-11  9:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110710.235458.1549578255936886669.davem@davemloft.net>

On Sun, Jul 10, 2011 at 11:54:58PM -0700, David Miller wrote:
> From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> Date: Mon, 11 Jul 2011 02:52:46 +0200 (CEST)
> 
> >   1. under packet storm and memory pressure NIC keeps generating interrupts
> >      (if non-NAPI) and indicating new buffers because it always has free
> >      RX buffers --- this only wastes CPU and bus bandwidth transferring
> >      data that is going to be immediately discarded;
> Actually, this is exactly how I, and others advise people to implement
> drivers.  It is the right thing to do.
> 
> The worst thing that can happen is to let the RX ring empty of
> buffers.  Some cards hang as a result of this, and also it causes head
> of line blocking on multiqueue cards, etc.
> 
> So the first thing the driver should do is try to allocate a
> replacement buffer.
> 
> And if that fails, it should give the RX packet right back to the
> card, and not pass it up the stack.

For now, let's ignore those badly broken cards which can't cope with
insufficient receive buffers. (BTW, are there that many of them?
Some examples, please?)

Let's compare the two cases (replacing buffers immediately vs replacing
later) under the hostile conditions. Keep in mind that the strategy
doesn't matter much when the buffers can be allocated right away --- the
discussion is about the corner case when memory runs out.

1. replacing buffers immediately

Packet is indicated in queue N, there's no memory for new skb, so it's
dropped, and the buffer goes back to free list. In parallel, queue M
(!= N) indicates new packet. Still, there's no memory for new skb so
it's also dropped and its buffer is reused. The effect is that all
packets are dropped, whatever queue they appear on.

2. replacing buffers later

Packet is indicated in queue N, it's delivered up the stack. No new buffer
is available, so after a while queue stalls and the packets are dropped
by the card. If the queues share free buffer list, then all get stalled
at the same time, if not they run out independently. Net effect is the
same as above --- all packets are dropped.

The differences are:
 - where the packets are dropped:
   1. in driver core after transfer
   2. in the card
 - where accounting happens:
   1. in driver: rx_dropped
   2. in card: rx discards
 - memory usage:
   1. memory is held in empty rx ring buffers
   2. memory is held in packets waiting to be processed
 - CPU usage:
   1. >0% - queues are cleared repeatedly, card 'thinks' everything is ok
   2. 0% - queues are stalled, no more rx indications
 - hardware throttling (or pause frame generation):
   1. broken --- card always sees full free rx ring, so does not try to
      throttle (unless driver also indicates congestion to the card)
   2. hardware throttling is possible as the card sees only really free
      rx buffers

The HOL blocking does not matter here, because there's only one head ---
the system memory. If I misunderstood this point, please explain it further.

Scheme #1 has the potential use when combined with small emergency buffer
pool if the driver looks for specific packets or indications that come
in the same queue as other packets. These are rare cases, though.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCH v2 00/46] Clean up RX copybreak and DMA handling
From: David Miller @ 2011-07-11  9:24 UTC (permalink / raw)
  To: mirq-linux; +Cc: netdev
In-Reply-To: <20110711091649.GA6380@rere.qmqm.pl>

From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Mon, 11 Jul 2011 11:16:49 +0200

> Packet is indicated in queue N, theres no memory for new skb, so its
> dropped, and the buffer goes back to free list. In parallel, queue M
> (!= N) indicates new packet. Still, there's no memory for new skb so
> its also dropped and its buffer is reused. The effect is that all
> packets are dropped, whatever queue they appear on.

Why would queue M (!= N) fail just because N did?  They may be
allocating out of different NUMA nodes, and thus succeed.

> The HOL blocking does not matter here, because there's only one head
> --- the system memory. If I misunderstood this point, please explain
> it further.

Multiqueue drivers are moving towards placing the queues on different
NUMA nodes, and in that scenario one queue might succeed even if the
other fails.

Back to the hardware hanging issue, it's real.  Getting into a
situation where the RX ring lacks any buffers at all is the least
tested path for these chips.

Testing fate is a really bad idea, and this is why I always propose to
keep the hardware with RX buffers to use in all circumstances.

^ permalink raw reply

* Re: [PATCH v2 03/46] net drivers: remove unnecessary dma_sync_to_device(DMA_FROM_DEVICE)
From: Michał Mirosław @ 2011-07-11  9:29 UTC (permalink / raw)
  To: Vlad Zolotarov
  Cc: linux-wireless@vger.kernel.org, Eilon Greenstein, Gary Zambrano,
	Stephen Hemminger, Stefano Brivio,
	e1000-devel@lists.sourceforge.net, Matthew Carlson,
	Jesse Brandeburg, Francois Romieu, Realtek linux nic maintainers,
	John W. Linville, Ron Mercer, Michael Chan, Jitendra Kalsaria,
	Divy Le Ray, netdev@vger.kernel.org, Bruce Allan, Hartley Sweeten,
	John Ronciak, Jon 
In-Reply-To: <201107111130.39629.vladz@broadcom.com>

On Mon, Jul 11, 2011 at 11:30:39AM +0300, Vlad Zolotarov wrote:
> >         prod_rx_buf->skb = skb;
> > diff --git a/drivers/net/bnx2x/bnx2x_cmn.h b/drivers/net/bnx2x/bnx2x_cmn.h
> > index c016e20..c9e49a0 100644
> > --- a/drivers/net/bnx2x/bnx2x_cmn.h
> > +++ b/drivers/net/bnx2x/bnx2x_cmn.h
> > @@ -923,16 +923,11 @@ static inline int bnx2x_alloc_rx_skb(struct bnx2x
> > *bp, static inline void bnx2x_reuse_rx_skb(struct bnx2x_fastpath *fp,
> >                                       u16 cons, u16 prod)
> >  {
> > -       struct bnx2x *bp = fp->bp;
> >         struct sw_rx_bd *cons_rx_buf = &fp->rx_buf_ring[cons];
> >         struct sw_rx_bd *prod_rx_buf = &fp->rx_buf_ring[prod];
> >         struct eth_rx_bd *cons_bd = &fp->rx_desc_ring[cons];
> >         struct eth_rx_bd *prod_bd = &fp->rx_desc_ring[prod];
> > 
> > -       dma_sync_single_for_device(&bp->pdev->dev,
> > -                                  dma_unmap_addr(cons_rx_buf, mapping),
> > -                                  RX_COPY_THRESH, DMA_FROM_DEVICE);
> > -
> >         dma_unmap_addr_set(prod_rx_buf, mapping,
> >                            dma_unmap_addr(cons_rx_buf, mapping));
> >         prod_rx_buf->skb = cons_rx_buf->skb;
> Michal, pls., note that this function is only called for buffers which were 
> previously dma_synced towards CPU (your "[PATCH v2 05/46] net: bnx2x: fix DMA 
> sync direction" properly fixes the direction of the first call which was 
> incorrect). Then, according to the 3d edition of the "Linux device drivers" 
> book, chapter 15, "Setting up streaming DMA mappings" article, end of the page 
> 449, when we call for dma_sync_single_for_cpu() the buffer ownership gets to
> the CPU and CPU may safely access the buffer (in particular, we read it). Then
> the author says: "Before the device accesses the buffer, however, ownership
> should be transferred back to it with: dma_sync_single_for_device()".
> 
> The DMA-API.txt document u've referenced doesn't refer the above function, so, 
> it's unclear how your fix may be based on it. On the other hand it clearly 
> contradicts the "Linux device driver" book.

DMA-API.txt describes what synchronization points are necessary for what DMA
mapping types (direction). dma_sync_single_for_cpu/device() are functions
realising those points. Note that the example in DMA-API-HOWTO.txt is misleading
as it has dma_sync_single_for_device() where it's not required by DMA-API.txt.

In this case, you don't need to sync to device for mappings that haven't
been written to by CPU. CPU caches will be invalidated anyway by next
dma_sync_single_for_cpu() or dma_unmap_single() and the CPU should not
ever write to cachelines that belong to FROM_DEVICE mappings.

The best source is the code. I looked through random implementations of
dma_sync_*_to_*() and in to_device() cases these are CPU write buffer
flushes and bounce buffer copying to the mapping - both actions are useless
(and potentially harmful in the bounce-buffer case) when the mapping hasn't
been written to after sync_to_cpu().

Best Regards,
Michał Mirosław

------------------------------------------------------------------------------
All of the data generated in your IT infrastructure is seriously valuable.
Why? It contains a definitive record of application performance, security 
threats, fraudulent activity, and more. Splunk takes this data and makes 
sense of it. IT sense. And common sense.
http://p.sf.net/sfu/splunk-d2d-c2
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [RFC 43/72] a2065/ariadne: Move the a2065/ariadne drivers
From: Jeff Kirsher @ 2011-07-11  9:39 UTC (permalink / raw)
  To: Geert Uytterhoeven; +Cc: davem@davemloft.net, netdev@vger.kernel.org
In-Reply-To: <CAMuHMdXfpbbfv7oT+0oDWJd0=x0BBqEeCcGWaxMHagTXoeCscQ@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 4130 bytes --]

On Sun, 2011-07-10 at 23:33 -0700, Geert Uytterhoeven wrote:
> On Mon, Jul 11, 2011 at 02:48, Jeff Kirsher <jeffrey.t.kirsher@intel.com> wrote:
> > On Sun, 2011-07-10 at 12:34 -0700, Geert Uytterhoeven wrote:
> >> On Sat, Jul 9, 2011 at 16:30, Jeff Kirsher
> >> <jeffrey.t.kirsher@intel.com> wrote:
> >> > On Tue, 2011-06-28 at 13:33 -0700, Geert Uytterhoeven wrote:
> >> >> And (in some other patch) 82596.c is an Intel driver, not a
> >> Motorola driver.
> >> >
> >> > 82596.c is not an Intel driver, it is an Intel part.  The driver was
> >>
> >> Sorry, I meant "driver for an Intel part".
> >>
> >> > written and support by someone other than Intel.  I am looking at
> >> how to
> >>
> >> Sure. But I'm strongly against classifying drivers based on who wrote
> >> them ;-)
> >
> > I agree to some extent, because if that were the case, we would have a
> > donald_becker/ directory for several of the drivers. :)
> >
> > Here is one of the problem's I keep running into and there is no simple
> > answer.  While most of the drivers can be grouped together by the
> > hardware they use, that does not work "logically" for every driver.
> >
> > In addition, if vendor 'A' makes a part and vendor 'B' uses same part in
> > a device/system/NIC and vendor 'B' creates the driver, supports the
> > driver and maintains the driver.  Should the part be categorized under
> > vendor 'A'?  IMHO, I think it should be categorized as a vendor 'B'
> > driver.
> 
> Several of the Ethernet drivers are of a third type: vendor A chip, vendor B
> card, entity C software.
> 
> > I started this work with the idea of trying to organize the drivers in
> > the same way that the drivers were to be in the Kconfig, which tended to
> > be drivers/net/ethernet/<manufacturer>.
> >
> > One of the problems that arise in this organization is what do we do
> > when vendor A is bought by vendor B, and vendor B takes on the support
> > of all the old vendor A parts/drivers?
> 
> We don't care. We don't sort drivers by who support them. Eventually, vendors
> lose interest and they all end up under "Linux kernel community" anyway.

It may just be me, but these statements seem negative and bitter and are
not helping us "solve" the issue.  While the statements may be true, I
would like to try and find a solution, whatever it may be, to better
organize drivers/net/ethernet/ drivers to help with maintenance and
future development.

> 
> > So I am open to suggestions.  The process I have using to organize the
> > drivers has been to group drivers that use common libraries and/or code
> > first, then group by either manufacturer, maintainer, or common
> > platform.
> >
> > I would like to keep the lasi_82506.c, sni_82596.c, 82506.c and similar
> > drivers out of the intel/ directory because we would not be supporting
> > the drivers and they are not similar to our drivers that we do support
> > that would be in the intel/ directory.
> >
> > Again, I open to suggestions on how to best organize these types of
> > drivers.  Maybe create a misc/ or <bus_type>/ for these types of
> > drivers?
> 
> "Similar" drivers should be together and consolidated (if someone has time
> to do it). They can even be of different brands.
> I.e. not all Tulip-compatibles were manufactured by Digital or Intel.

I agree and understand, that is why I am taking the time to do it.  The
drivers/net/ethernet/8390/, drivers/net/ethernet/tulip and
drivers/net/ethernet/sun/ are some examples of this.

> 
> >> > better organize the 82596.c, lasi_82596.c, lib82596.c, and
> >> sni_82596.c
> >> > which all use an Intel Ethernet chip but were written and supported
> >> by
> >> > someone other than Intel.
> 
> Gr{oetje,eeting}s,
> 
>                         Geert
> 
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> 
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like that.
>                                 -- Linus Torvalds



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* Re: [PATCH v2 03/46] net drivers: remove unnecessary dma_sync_to_device(DMA_FROM_DEVICE)
From: Vlad Zolotarov @ 2011-07-11  9:46 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: linux-wireless@vger.kernel.org, Eilon Greenstein, Gary Zambrano,
	Stephen Hemminger, Stefano Brivio,
	e1000-devel@lists.sourceforge.net, Matthew Carlson,
	Jesse Brandeburg, Francois Romieu, Realtek linux nic maintainers,
	John W. Linville, Ron Mercer, Michael Chan, Jitendra Kalsaria,
	Divy Le Ray, netdev@vger.kernel.org, Bruce Allan, Hartley Sweeten,
	John Ronciak, Jon 
In-Reply-To: <20110711092909.GB6380@rere.qmqm.pl>

On Monday 11 July 2011 12:29:09 Michał Mirosław wrote:
> On Mon, Jul 11, 2011 at 11:30:39AM +0300, Vlad Zolotarov wrote:
> > >         prod_rx_buf->skb = skb;
> > > 
> > > diff --git a/drivers/net/bnx2x/bnx2x_cmn.h
> > > b/drivers/net/bnx2x/bnx2x_cmn.h index c016e20..c9e49a0 100644
> > > --- a/drivers/net/bnx2x/bnx2x_cmn.h
> > > +++ b/drivers/net/bnx2x/bnx2x_cmn.h
> > > @@ -923,16 +923,11 @@ static inline int bnx2x_alloc_rx_skb(struct bnx2x
> > > *bp, static inline void bnx2x_reuse_rx_skb(struct bnx2x_fastpath *fp,
> > > 
> > >                                       u16 cons, u16 prod)
> > >  
> > >  {
> > > 
> > > -       struct bnx2x *bp = fp->bp;
> > > 
> > >         struct sw_rx_bd *cons_rx_buf = &fp->rx_buf_ring[cons];
> > >         struct sw_rx_bd *prod_rx_buf = &fp->rx_buf_ring[prod];
> > >         struct eth_rx_bd *cons_bd = &fp->rx_desc_ring[cons];
> > >         struct eth_rx_bd *prod_bd = &fp->rx_desc_ring[prod];
> > > 
> > > -       dma_sync_single_for_device(&bp->pdev->dev,
> > > -                                  dma_unmap_addr(cons_rx_buf,
> > > mapping), -                                  RX_COPY_THRESH,
> > > DMA_FROM_DEVICE); -
> > > 
> > >         dma_unmap_addr_set(prod_rx_buf, mapping,
> > >         
> > >                            dma_unmap_addr(cons_rx_buf, mapping));
> > >         
> > >         prod_rx_buf->skb = cons_rx_buf->skb;
> > 
> > Michal, pls., note that this function is only called for buffers which
> > were previously dma_synced towards CPU (your "[PATCH v2 05/46] net:
> > bnx2x: fix DMA sync direction" properly fixes the direction of the first
> > call which was incorrect). Then, according to the 3d edition of the
> > "Linux device drivers" book, chapter 15, "Setting up streaming DMA
> > mappings" article, end of the page 449, when we call for
> > dma_sync_single_for_cpu() the buffer ownership gets to the CPU and CPU
> > may safely access the buffer (in particular, we read it). Then the
> > author says: "Before the device accesses the buffer, however, ownership
> > should be transferred back to it with: dma_sync_single_for_device()".
> > 
> > The DMA-API.txt document u've referenced doesn't refer the above
> > function, so, it's unclear how your fix may be based on it. On the other
> > hand it clearly contradicts the "Linux device driver" book.
> 
> DMA-API.txt describes what synchronization points are necessary for what
> DMA mapping types (direction). dma_sync_single_for_cpu/device() are
> functions realising those points. Note that example DMA-API-HOWTO.txt is
> misleading as it has dma_sync_single_for_device() where its not required
> by DMA-API.txt.
> 
> In this case, you don't need to sync to device for mappings that haven't
> been written to by CPU. CPU caches will be invalidated anyway by next
> dma_sync_single_for_cpu() or dma_unmap_single() and the CPU should not
> ever write to cachelines that belong to FROM_DEVICE mappings.

Okay, I see the section in the doc you are talking about... I agree. We may drop
these sync_single() in the bnx2x_reuse_rx_skb().

> 
> The best source is the code. 

Hmmm... The code is bug prone, so I'd stick to the Doc...;)

Thanks, Michal.
vlad


------------------------------------------------------------------------------
All of the data generated in your IT infrastructure is seriously valuable.
Why? It contains a definitive record of application performance, security 
threats, fraudulent activity, and more. Splunk takes this data and makes 
sense of it. IT sense. And common sense.
http://p.sf.net/sfu/splunk-d2d-c2
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox