Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2] net/macb: Use non-coherent memory for rx buffers
From: Nicolas Ferre @ 2012-12-03 12:14 UTC (permalink / raw)
  To: David S. Miller, netdev
  Cc: linux-arm-kernel, linux-kernel, Joachim Eastwood,
	Jean-Christophe PLAGNIOL-VILLARD, Havard Skinnemoen,
	Nicolas Ferre
In-Reply-To: <CAGhQ9VwWmP_B17LnAEqDHcL8Yd-s0C-Bf35z3+=vnJmK_NoCSw@mail.gmail.com>

From: Havard Skinnemoen <havard@skinnemoen.net>

Allocate regular pages to use as backing for the RX ring and use the
DMA API to sync the caches. This should give a bit better performance
since it allows the CPU to do burst transfers from memory. It is also
a necessary step on the way to reduce the amount of copying done by
the driver.

Signed-off-by: Havard Skinnemoen <havard@skinnemoen.net>
[nicolas.ferre@atmel.com: adapt to newer kernel]
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
v2: - keep struct macb members as they are shared between
      at91_ether and macb.

 drivers/net/ethernet/cadence/macb.c | 206 +++++++++++++++++++++++-------------
 drivers/net/ethernet/cadence/macb.h |  18 ++++
 2 files changed, 148 insertions(+), 76 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 6a59bce..c2955da 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -10,6 +10,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/clk.h>
+#include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/kernel.h>
@@ -35,6 +36,8 @@
 #define RX_BUFFER_SIZE		128
 #define RX_RING_SIZE		512 /* must be power of 2 */
 #define RX_RING_BYTES		(sizeof(struct macb_dma_desc) * RX_RING_SIZE)
+#define RX_BUFFERS_PER_PAGE	(PAGE_SIZE / RX_BUFFER_SIZE)
+#define RX_RING_PAGES		(RX_RING_SIZE / RX_BUFFERS_PER_PAGE)
 
 #define TX_RING_SIZE		128 /* must be power of 2 */
 #define TX_RING_BYTES		(sizeof(struct macb_dma_desc) * TX_RING_SIZE)
@@ -90,9 +93,16 @@ static struct macb_dma_desc *macb_rx_desc(struct macb *bp, unsigned int index)
 	return &bp->rx_ring[macb_rx_ring_wrap(index)];
 }
 
-static void *macb_rx_buffer(struct macb *bp, unsigned int index)
+static struct macb_rx_page *macb_rx_page(struct macb *bp, unsigned int index)
 {
-	return bp->rx_buffers + RX_BUFFER_SIZE * macb_rx_ring_wrap(index);
+	unsigned int entry = macb_rx_ring_wrap(index);
+
+	return &bp->rx_page[entry / RX_BUFFERS_PER_PAGE];
+}
+
+static unsigned int macb_rx_page_offset(struct macb *bp, unsigned int index)
+{
+	return (index % RX_BUFFERS_PER_PAGE) * RX_BUFFER_SIZE;
 }
 
 void macb_set_hwaddr(struct macb *bp)
@@ -528,11 +538,15 @@ static void macb_tx_interrupt(struct macb *bp)
 static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 			 unsigned int last_frag)
 {
-	unsigned int len;
-	unsigned int frag;
-	unsigned int offset;
-	struct sk_buff *skb;
-	struct macb_dma_desc *desc;
+	unsigned int		len;
+	unsigned int		frag;
+	unsigned int		skb_offset;
+	unsigned int		pg_offset;
+	struct macb_rx_page	*rx_page;
+	dma_addr_t		phys;
+	void			*buf;
+	struct sk_buff		*skb;
+	struct macb_dma_desc	*desc;
 
 	desc = macb_rx_desc(bp, last_frag);
 	len = MACB_BFEXT(RX_FRMLEN, desc->ctrl);
@@ -566,7 +580,7 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 		return 1;
 	}
 
-	offset = 0;
+	skb_offset = 0;
 	len += NET_IP_ALIGN;
 	skb_checksum_none_assert(skb);
 	skb_put(skb, len);
@@ -574,13 +588,28 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 	for (frag = first_frag; ; frag++) {
 		unsigned int frag_len = RX_BUFFER_SIZE;
 
-		if (offset + frag_len > len) {
+		if (skb_offset + frag_len > len) {
 			BUG_ON(frag != last_frag);
-			frag_len = len - offset;
+			frag_len = len - skb_offset;
 		}
-		skb_copy_to_linear_data_offset(skb, offset,
-				macb_rx_buffer(bp, frag), frag_len);
-		offset += RX_BUFFER_SIZE;
+
+		rx_page = macb_rx_page(bp, frag);
+		pg_offset = macb_rx_page_offset(bp, frag);
+		phys = rx_page->phys;
+
+		dma_sync_single_range_for_cpu(&bp->pdev->dev, phys,
+				pg_offset, frag_len, DMA_FROM_DEVICE);
+
+		buf = kmap_atomic(rx_page->page);
+		skb_copy_to_linear_data_offset(skb, skb_offset,
+				buf + pg_offset, frag_len);
+		kunmap_atomic(buf);
+
+		skb_offset += frag_len;
+
+		dma_sync_single_range_for_device(&bp->pdev->dev, phys,
+				pg_offset, frag_len, DMA_FROM_DEVICE);
+
 		desc = macb_rx_desc(bp, frag);
 		desc->addr &= ~MACB_BIT(RX_USED);
 
@@ -860,86 +889,90 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-static void macb_free_consistent(struct macb *bp)
+static void macb_free_rings(struct macb *bp)
 {
-	if (bp->tx_skb) {
-		kfree(bp->tx_skb);
-		bp->tx_skb = NULL;
-	}
-	if (bp->rx_ring) {
-		dma_free_coherent(&bp->pdev->dev, RX_RING_BYTES,
-				  bp->rx_ring, bp->rx_ring_dma);
-		bp->rx_ring = NULL;
-	}
-	if (bp->tx_ring) {
-		dma_free_coherent(&bp->pdev->dev, TX_RING_BYTES,
-				  bp->tx_ring, bp->tx_ring_dma);
-		bp->tx_ring = NULL;
-	}
-	if (bp->rx_buffers) {
-		dma_free_coherent(&bp->pdev->dev,
-				  RX_RING_SIZE * RX_BUFFER_SIZE,
-				  bp->rx_buffers, bp->rx_buffers_dma);
-		bp->rx_buffers = NULL;
+	int i;
+
+	for (i = 0; i < RX_RING_PAGES; i++) {
+		struct macb_rx_page *rx_page = &bp->rx_page[i];
+
+		if (!rx_page->page)
+			continue;
+
+		dma_unmap_page(&bp->pdev->dev, rx_page->phys,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
+		put_page(rx_page->page);
+		rx_page->page = NULL;
 	}
+
+	kfree(bp->tx_skb);
+	kfree(bp->rx_page);
+	dma_free_coherent(&bp->pdev->dev, TX_RING_BYTES, bp->tx_ring,
+			  bp->tx_ring_dma);
+	dma_free_coherent(&bp->pdev->dev, RX_RING_BYTES, bp->rx_ring,
+			  bp->rx_ring_dma);
 }
 
-static int macb_alloc_consistent(struct macb *bp)
+static int macb_init_rings(struct macb *bp)
 {
-	int size;
+	struct page	*page;
+	dma_addr_t	phys;
+	unsigned int	page_idx;
+	unsigned int	ring_idx;
+	unsigned int	i;
 
-	size = TX_RING_SIZE * sizeof(struct macb_tx_skb);
-	bp->tx_skb = kmalloc(size, GFP_KERNEL);
-	if (!bp->tx_skb)
-		goto out_err;
-
-	size = RX_RING_BYTES;
-	bp->rx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
+	bp->rx_ring = dma_alloc_coherent(&bp->pdev->dev, RX_RING_BYTES,
 					 &bp->rx_ring_dma, GFP_KERNEL);
 	if (!bp->rx_ring)
-		goto out_err;
+		goto err_alloc_rx_ring;
+
 	netdev_dbg(bp->dev,
 		   "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
-		   size, (unsigned long)bp->rx_ring_dma, bp->rx_ring);
+		   RX_RING_BYTES, (unsigned long)bp->rx_ring_dma, bp->rx_ring);
 
-	size = TX_RING_BYTES;
-	bp->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
+	bp->tx_ring = dma_alloc_coherent(&bp->pdev->dev, TX_RING_BYTES,
 					 &bp->tx_ring_dma, GFP_KERNEL);
 	if (!bp->tx_ring)
-		goto out_err;
-	netdev_dbg(bp->dev,
-		   "Allocated TX ring of %d bytes at %08lx (mapped %p)\n",
-		   size, (unsigned long)bp->tx_ring_dma, bp->tx_ring);
-
-	size = RX_RING_SIZE * RX_BUFFER_SIZE;
-	bp->rx_buffers = dma_alloc_coherent(&bp->pdev->dev, size,
-					    &bp->rx_buffers_dma, GFP_KERNEL);
-	if (!bp->rx_buffers)
-		goto out_err;
+		goto err_alloc_tx_ring;
+
 	netdev_dbg(bp->dev,
-		   "Allocated RX buffers of %d bytes at %08lx (mapped %p)\n",
-		   size, (unsigned long)bp->rx_buffers_dma, bp->rx_buffers);
+		   "Allocated TX ring of %d bytes at 0x%08lx (mapped %p)\n",
+		   TX_RING_BYTES, (unsigned long)bp->tx_ring_dma, bp->tx_ring);
 
-	return 0;
+	bp->rx_page = kcalloc(RX_RING_PAGES, sizeof(struct macb_rx_page),
+			      GFP_KERNEL);
+	if (!bp->rx_page)
+		goto err_alloc_rx_page;
 
-out_err:
-	macb_free_consistent(bp);
-	return -ENOMEM;
-}
+	bp->tx_skb = kcalloc(TX_RING_SIZE, sizeof(struct macb_tx_skb),
+			     GFP_KERNEL);
+	if (!bp->tx_skb)
+		goto err_alloc_tx_skb;
 
-static void macb_init_rings(struct macb *bp)
-{
-	int i;
-	dma_addr_t addr;
+	for (page_idx = 0, ring_idx = 0; page_idx < RX_RING_PAGES; page_idx++) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page)
+			goto err_alloc_page;
+
+		phys = dma_map_page(&bp->pdev->dev, page, 0, PAGE_SIZE,
+				    DMA_FROM_DEVICE);
+		if (dma_mapping_error(&bp->pdev->dev, phys))
+			goto err_map_page;
+
+		bp->rx_page[page_idx].page = page;
+		bp->rx_page[page_idx].phys = phys;
 
-	addr = bp->rx_buffers_dma;
-	for (i = 0; i < RX_RING_SIZE; i++) {
-		bp->rx_ring[i].addr = addr;
-		bp->rx_ring[i].ctrl = 0;
-		addr += RX_BUFFER_SIZE;
+		for (i = 0; i < RX_BUFFERS_PER_PAGE; i++, ring_idx++) {
+			bp->rx_ring[ring_idx].addr = phys;
+			bp->rx_ring[ring_idx].ctrl = 0;
+			phys += RX_BUFFER_SIZE;
+		}
 	}
 	bp->rx_ring[RX_RING_SIZE - 1].addr |= MACB_BIT(RX_WRAP);
 
+	netdev_dbg(bp->dev, "Allocated %u RX buffers (%lu pages)\n",
+		   RX_RING_SIZE, RX_RING_PAGES);
+
 	for (i = 0; i < TX_RING_SIZE; i++) {
 		bp->tx_ring[i].addr = 0;
 		bp->tx_ring[i].ctrl = MACB_BIT(TX_USED);
@@ -947,6 +980,28 @@ static void macb_init_rings(struct macb *bp)
 	bp->tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
 
 	bp->rx_tail = bp->tx_head = bp->tx_tail = 0;
+
+	return 0;
+
+err_map_page:
+	__free_page(page);
+err_alloc_page:
+	while (page_idx--) {
+		dma_unmap_page(&bp->pdev->dev, bp->rx_page[page_idx].phys,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
+		__free_page(bp->rx_page[page_idx].page);
+	}
+	kfree(bp->tx_skb);
+err_alloc_tx_skb:
+	kfree(bp->rx_page);
+err_alloc_rx_page:
+	dma_free_coherent(&bp->pdev->dev, TX_RING_BYTES, bp->tx_ring,
+			  bp->tx_ring_dma);
+err_alloc_tx_ring:
+	dma_free_coherent(&bp->pdev->dev, RX_RING_BYTES, bp->rx_ring,
+			  bp->rx_ring_dma);
+err_alloc_rx_ring:
+	return -ENOMEM;
 }
 
 static void macb_reset_hw(struct macb *bp)
@@ -1221,16 +1276,15 @@ static int macb_open(struct net_device *dev)
 	if (!bp->phy_dev)
 		return -EAGAIN;
 
-	err = macb_alloc_consistent(bp);
+	err = macb_init_rings(bp);
 	if (err) {
-		netdev_err(dev, "Unable to allocate DMA memory (error %d)\n",
+		netdev_err(dev, "Unable to allocate DMA rings (error %d)\n",
 			   err);
 		return err;
 	}
 
 	napi_enable(&bp->napi);
 
-	macb_init_rings(bp);
 	macb_init_hw(bp);
 
 	/* schedule a link state check */
@@ -1257,7 +1311,7 @@ static int macb_close(struct net_device *dev)
 	netif_carrier_off(dev);
 	spin_unlock_irqrestore(&bp->lock, flags);
 
-	macb_free_consistent(bp);
+	macb_free_rings(bp);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index 570908b..e82242b 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -453,6 +453,23 @@ struct macb_dma_desc {
 #define MACB_TX_USED_SIZE			1
 
 /**
+ * struct macb_rx_page - data associated with a page used as RX buffers
+ * @page: Physical page used as storage for the buffers
+ * @phys: DMA address of the page
+ *
+ * Each page is used to provide %MACB_RX_BUFFERS_PER_PAGE RX buffers.
+ * The page gets an initial reference when it is inserted into the
+ * ring, and an additional reference each time it is passed up the
+ * stack as a fragment. When all the buffers have been used, we drop
+ * the initial reference and allocate a new page. Any additional
+ * references are dropped when the higher layers free the skb.
+ */
+struct macb_rx_page {
+	struct page		*page;
+	dma_addr_t		phys;
+};
+
+/**
  * struct macb_tx_skb - data about an skb which is being transmitted
  * @skb: skb currently being transmitted
  * @mapping: DMA address of the skb's data buffer
@@ -543,6 +560,7 @@ struct macb {
 
 	unsigned int		rx_tail;
 	struct macb_dma_desc	*rx_ring;
+	struct macb_rx_page	*rx_page;
 	void			*rx_buffers;
 
 	unsigned int		tx_head, tx_tail;
-- 
1.8.0

^ permalink raw reply related

* [PATCH] net/macb: Use dmapool to align descriptors on 64bits
From: Nicolas Ferre @ 2012-12-03 12:15 UTC (permalink / raw)
  To: David S. Miller, netdev
  Cc: linux-arm-kernel, linux-kernel, Joachim Eastwood,
	Jean-Christophe PLAGNIOL-VILLARD, Nicolas Ferre

Depending on the datapath, some revisions of GEM need
64-bit aligned descriptors. Use dmapool to allocate
these descriptors.
Note that the different sizes of the RX and TX rings
lead to the creation of two pools.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c | 99 ++++++++++++++++++++++++-------------
 drivers/net/ethernet/cadence/macb.h |  3 ++
 2 files changed, 69 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index c2955da..4b541a3 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -22,6 +22,7 @@
 #include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/dmapool.h>
 #include <linux/dma-mapping.h>
 #include <linux/platform_data/macb.h>
 #include <linux/platform_device.h>
@@ -889,6 +890,47 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
+/*
+ * Retrieve the maximum supported data bus width from decoding the
+ * design configuration register.
+ *
+ * Result in bytes.
+ */
+static u32 macb_dma_bus_width(struct macb *bp)
+{
+	u32 dbwdef;
+
+	if (!macb_is_gem(bp))
+		return 4;
+
+	dbwdef = GEM_BFEXT(DBWDEF, gem_readl(bp, DCFG1));
+	switch (dbwdef) {
+	case 4:
+	case 2:
+		return dbwdef * 4;
+	case 1:
+	default:
+		return 4;
+	}
+}
+
+/*
+ * Get the DMA bus width field of the network configuration register that we
+ * should program.
+ */
+static u32 macb_dbw(struct macb *bp)
+{
+	switch (macb_dma_bus_width(bp)) {
+	case 16:
+		return GEM_BF(DBW, GEM_DBW128);
+	case 8:
+		return GEM_BF(DBW, GEM_DBW64);
+	case 4:
+	default:
+		return GEM_BF(DBW, GEM_DBW32);
+	}
+}
+
 static void macb_free_rings(struct macb *bp)
 {
 	int i;
@@ -907,10 +949,12 @@ static void macb_free_rings(struct macb *bp)
 
 	kfree(bp->tx_skb);
 	kfree(bp->rx_page);
-	dma_free_coherent(&bp->pdev->dev, TX_RING_BYTES, bp->tx_ring,
-			  bp->tx_ring_dma);
-	dma_free_coherent(&bp->pdev->dev, RX_RING_BYTES, bp->rx_ring,
-			  bp->rx_ring_dma);
+	dma_pool_free(bp->tx_pool, bp->tx_ring, bp->tx_ring_dma);
+	dma_pool_free(bp->rx_pool, bp->rx_ring, bp->rx_ring_dma);
+	dma_pool_destroy(bp->tx_pool);
+	dma_pool_destroy(bp->rx_pool);
+	bp->tx_pool = NULL;
+	bp->rx_pool = NULL;
 }
 
 static int macb_init_rings(struct macb *bp)
@@ -920,9 +964,16 @@ static int macb_init_rings(struct macb *bp)
 	unsigned int	page_idx;
 	unsigned int	ring_idx;
 	unsigned int	i;
+	unsigned int	dma_bw = macb_dma_bus_width(bp);
+
+	bp->rx_pool = dma_pool_create(bp->pdev->name, &bp->pdev->dev,
+				      RX_RING_BYTES, dma_bw, 0);
+	bp->tx_pool = dma_pool_create(bp->pdev->name, &bp->pdev->dev,
+				      TX_RING_BYTES, dma_bw, 0);
+	if (!bp->rx_pool || !bp->tx_pool)
+		goto err_alloc_rx_ring;
 
-	bp->rx_ring = dma_alloc_coherent(&bp->pdev->dev, RX_RING_BYTES,
-					 &bp->rx_ring_dma, GFP_KERNEL);
+	bp->rx_ring = dma_pool_alloc(bp->rx_pool, GFP_KERNEL, &bp->rx_ring_dma);
 	if (!bp->rx_ring)
 		goto err_alloc_rx_ring;
 
@@ -930,8 +981,7 @@ static int macb_init_rings(struct macb *bp)
 		   "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
 		   RX_RING_BYTES, (unsigned long)bp->rx_ring_dma, bp->rx_ring);
 
-	bp->tx_ring = dma_alloc_coherent(&bp->pdev->dev, TX_RING_BYTES,
-					 &bp->tx_ring_dma, GFP_KERNEL);
+	bp->tx_ring = dma_pool_alloc(bp->tx_pool, GFP_KERNEL, &bp->tx_ring_dma);
 	if (!bp->tx_ring)
 		goto err_alloc_tx_ring;
 
@@ -995,12 +1045,16 @@ err_alloc_page:
 err_alloc_tx_skb:
 	kfree(bp->rx_page);
 err_alloc_rx_page:
-	dma_free_coherent(&bp->pdev->dev, TX_RING_BYTES, bp->tx_ring,
-			  bp->tx_ring_dma);
+	dma_pool_free(bp->tx_pool, bp->tx_ring, bp->tx_ring_dma);
 err_alloc_tx_ring:
-	dma_free_coherent(&bp->pdev->dev, RX_RING_BYTES, bp->rx_ring,
-			  bp->rx_ring_dma);
+	dma_pool_free(bp->rx_pool, bp->rx_ring, bp->rx_ring_dma);
 err_alloc_rx_ring:
+	if (bp->tx_pool)
+		dma_pool_destroy(bp->tx_pool);
+	if (bp->rx_pool)
+		dma_pool_destroy(bp->rx_pool);
+	bp->rx_pool = NULL;
+	bp->tx_pool = NULL;
 	return -ENOMEM;
 }
 
@@ -1067,27 +1121,6 @@ static u32 macb_mdc_clk_div(struct macb *bp)
 }
 
 /*
- * Get the DMA bus width field of the network configuration register that we
- * should program.  We find the width from decoding the design configuration
- * register to find the maximum supported data bus width.
- */
-static u32 macb_dbw(struct macb *bp)
-{
-	if (!macb_is_gem(bp))
-		return 0;
-
-	switch (GEM_BFEXT(DBWDEF, gem_readl(bp, DCFG1))) {
-	case 4:
-		return GEM_BF(DBW, GEM_DBW128);
-	case 2:
-		return GEM_BF(DBW, GEM_DBW64);
-	case 1:
-	default:
-		return GEM_BF(DBW, GEM_DBW32);
-	}
-}
-
-/*
  * Configure the receive DMA engine
  * - use the correct receive buffer size
  * - set the possibility to use INCR16 bursts
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index e82242b..b4c9515 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -584,6 +584,9 @@ struct macb {
 	dma_addr_t		tx_ring_dma;
 	dma_addr_t		rx_buffers_dma;
 
+	struct dma_pool		*rx_pool;
+	struct dma_pool		*tx_pool;
+
 	struct mii_bus		*mii_bus;
 	struct phy_device	*phy_dev;
 	unsigned int 		link;
-- 
1.8.0

^ permalink raw reply related

* [PATCH] net/macb: increase RX buffer size for GEM
From: Nicolas Ferre @ 2012-12-03 12:15 UTC (permalink / raw)
  To: David S. Miller, netdev
  Cc: linux-arm-kernel, linux-kernel, Joachim Eastwood,
	Jean-Christophe PLAGNIOL-VILLARD, Nicolas Ferre

The MACB Ethernet controller requires an RX buffer of 128 bytes. This is
highly sub-optimal for the Gigabit-capable GEM, which is able to use
a bigger DMA buffer. Replace this constant and the associated macros
with data stored in the private structure.
I also kept the result of buffers per page calculation to lower the
impact of this move to a variable rx buffer size on rx hot path.
RX DMA buffer size has to be multiple of 64 bytes as indicated in
DMA Configuration Register specification.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c | 61 ++++++++++++++++++++++++++++---------
 drivers/net/ethernet/cadence/macb.h |  2 ++
 2 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 4b541a3..b4f45f4 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -34,11 +34,11 @@
 
 #include "macb.h"
 
-#define RX_BUFFER_SIZE		128
+#define MACB_RX_BUFFER_SIZE	128
+#define GEM_RX_BUFFER_SIZE	2048
+#define RX_BUFFER_MULTIPLE	64  /* bytes */
 #define RX_RING_SIZE		512 /* must be power of 2 */
 #define RX_RING_BYTES		(sizeof(struct macb_dma_desc) * RX_RING_SIZE)
-#define RX_BUFFERS_PER_PAGE	(PAGE_SIZE / RX_BUFFER_SIZE)
-#define RX_RING_PAGES		(RX_RING_SIZE / RX_BUFFERS_PER_PAGE)
 
 #define TX_RING_SIZE		128 /* must be power of 2 */
 #define TX_RING_BYTES		(sizeof(struct macb_dma_desc) * TX_RING_SIZE)
@@ -98,12 +98,17 @@ static struct macb_rx_page *macb_rx_page(struct macb *bp, unsigned int index)
 {
 	unsigned int entry = macb_rx_ring_wrap(index);
 
-	return &bp->rx_page[entry / RX_BUFFERS_PER_PAGE];
+	return &bp->rx_page[entry / bp->rx_buffers_per_page];
 }
 
 static unsigned int macb_rx_page_offset(struct macb *bp, unsigned int index)
 {
-	return (index % RX_BUFFERS_PER_PAGE) * RX_BUFFER_SIZE;
+	return (index % bp->rx_buffers_per_page) * bp->rx_buffer_size;
+}
+
+static unsigned int rx_ring_pages(struct macb *bp)
+{
+	return RX_RING_SIZE / bp->rx_buffers_per_page;
 }
 
 void macb_set_hwaddr(struct macb *bp)
@@ -587,7 +592,7 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 	skb_put(skb, len);
 
 	for (frag = first_frag; ; frag++) {
-		unsigned int frag_len = RX_BUFFER_SIZE;
+		unsigned int frag_len = bp->rx_buffer_size;
 
 		if (skb_offset + frag_len > len) {
 			BUG_ON(frag != last_frag);
@@ -931,11 +936,36 @@ static u32 macb_dbw(struct macb *bp)
 	}
 }
 
+static void macb_init_rx_buffer_size(struct macb *bp)
+{
+	if (!macb_is_gem(bp)) {
+		bp->rx_buffer_size = MACB_RX_BUFFER_SIZE;
+	} else {
+		bp->rx_buffer_size = GEM_RX_BUFFER_SIZE;
+
+		if (bp->rx_buffer_size > PAGE_SIZE) {
+			netdev_warn(bp->dev,
+				    "RX buffer cannot be bigger than PAGE_SIZE, shrinking\n");
+			bp->rx_buffer_size = PAGE_SIZE;
+		}
+		if (bp->rx_buffer_size % RX_BUFFER_MULTIPLE) {
+			netdev_warn(bp->dev,
+				    "RX buffer must be multiple of %d bytes, shrinking\n",
+				    RX_BUFFER_MULTIPLE);
+			bp->rx_buffer_size =
+				rounddown(bp->rx_buffer_size, RX_BUFFER_MULTIPLE);
+		}
+		bp->rx_buffer_size = max(RX_BUFFER_MULTIPLE, GEM_RX_BUFFER_SIZE);
+	}
+
+	bp->rx_buffers_per_page = PAGE_SIZE / bp->rx_buffer_size;
+}
+
 static void macb_free_rings(struct macb *bp)
 {
 	int i;
 
-	for (i = 0; i < RX_RING_PAGES; i++) {
+	for (i = 0; i < rx_ring_pages(bp); i++) {
 		struct macb_rx_page *rx_page = &bp->rx_page[i];
 
 		if (!rx_page->page)
@@ -989,7 +1019,10 @@ static int macb_init_rings(struct macb *bp)
 		   "Allocated TX ring of %d bytes at 0x%08lx (mapped %p)\n",
 		   TX_RING_BYTES, (unsigned long)bp->tx_ring_dma, bp->tx_ring);
 
-	bp->rx_page = kcalloc(RX_RING_PAGES, sizeof(struct macb_rx_page),
+	/* RX buffers initialization */
+	macb_init_rx_buffer_size(bp);
+
+	bp->rx_page = kcalloc(rx_ring_pages(bp), sizeof(struct macb_rx_page),
 			      GFP_KERNEL);
 	if (!bp->rx_page)
 		goto err_alloc_rx_page;
@@ -999,7 +1032,7 @@ static int macb_init_rings(struct macb *bp)
 	if (!bp->tx_skb)
 		goto err_alloc_tx_skb;
 
-	for (page_idx = 0, ring_idx = 0; page_idx < RX_RING_PAGES; page_idx++) {
+	for (page_idx = 0, ring_idx = 0; page_idx < rx_ring_pages(bp); page_idx++) {
 		page = alloc_page(GFP_KERNEL);
 		if (!page)
 			goto err_alloc_page;
@@ -1012,16 +1045,16 @@ static int macb_init_rings(struct macb *bp)
 		bp->rx_page[page_idx].page = page;
 		bp->rx_page[page_idx].phys = phys;
 
-		for (i = 0; i < RX_BUFFERS_PER_PAGE; i++, ring_idx++) {
+		for (i = 0; i < bp->rx_buffers_per_page; i++, ring_idx++) {
 			bp->rx_ring[ring_idx].addr = phys;
 			bp->rx_ring[ring_idx].ctrl = 0;
-			phys += RX_BUFFER_SIZE;
+			phys += bp->rx_buffer_size;
 		}
 	}
 	bp->rx_ring[RX_RING_SIZE - 1].addr |= MACB_BIT(RX_WRAP);
 
-	netdev_dbg(bp->dev, "Allocated %u RX buffers (%lu pages)\n",
-		   RX_RING_SIZE, RX_RING_PAGES);
+	netdev_dbg(bp->dev, "Allocated %u RX buffers of size %u (%u pages)\n",
+		   RX_RING_SIZE, bp->rx_buffer_size, rx_ring_pages(bp));
 
 	for (i = 0; i < TX_RING_SIZE; i++) {
 		bp->tx_ring[i].addr = 0;
@@ -1134,7 +1167,7 @@ static void macb_configure_dma(struct macb *bp)
 
 	if (macb_is_gem(bp)) {
 		dmacfg = gem_readl(bp, DMACFG) & ~GEM_BF(RXBS, -1L);
-		dmacfg |= GEM_BF(RXBS, RX_BUFFER_SIZE / 64);
+		dmacfg |= GEM_BF(RXBS, bp->rx_buffer_size / RX_BUFFER_MULTIPLE);
 		dmacfg |= GEM_BF(FBLDO, 16);
 		dmacfg |= GEM_BIT(TXPBMS) | GEM_BF(RXBMS, -1L);
 		gem_writel(bp, DMACFG, dmacfg);
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index b4c9515..88780e2 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -562,6 +562,8 @@ struct macb {
 	struct macb_dma_desc	*rx_ring;
 	struct macb_rx_page	*rx_page;
 	void			*rx_buffers;
+	size_t			rx_buffer_size;
+	unsigned int		rx_buffers_per_page;
 
 	unsigned int		tx_head, tx_tail;
 	struct macb_dma_desc	*tx_ring;
-- 
1.8.0

^ permalink raw reply related

* [PATCH] stmmac: remove two repeated macros
From: Liming Wang @ 2012-12-03 12:19 UTC (permalink / raw)
  To: Giuseppe CAVALLARO, David Miller; +Cc: netdev

Two macros are defined twice; remove the duplicate definitions.

Signed-off-by: Liming Wang <walimisdev@gmail.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h |    2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
index e49c9a0..d064d92 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h
@@ -77,8 +77,6 @@
 #define DMA_STATUS_GPI		0x10000000	/* PMT interrupt */
 #define DMA_STATUS_GMI		0x08000000	/* MMC interrupt */
 #define DMA_STATUS_GLI		0x04000000	/* GMAC Line interface int */
-#define DMA_STATUS_GMI		0x08000000
-#define DMA_STATUS_GLI		0x04000000
 #define DMA_STATUS_EB_MASK	0x00380000	/* Error Bits Mask */
 #define DMA_STATUS_EB_TX_ABORT	0x00080000	/* Error Bits - TX Abort */
 #define DMA_STATUS_EB_RX_ABORT	0x00100000	/* Error Bits - RX Abort */
-- 
1.7.9.5

^ permalink raw reply related

* RE: [PATCH v2] net/macb: Use non-coherent memory for rx buffers
From: David Laight @ 2012-12-03 12:43 UTC (permalink / raw)
  To: Nicolas Ferre, David S. Miller, netdev
  Cc: linux-arm-kernel, linux-kernel, Joachim Eastwood,
	Jean-Christophe PLAGNIOL-VILLARD, Havard Skinnemoen
In-Reply-To: <1354536876-6274-1-git-send-email-nicolas.ferre@atmel.com>

> Allocate regular pages to use as backing for the RX ring and use the
> DMA API to sync the caches. This should give a bit better performance
> since it allows the CPU to do burst transfers from memory. It is also
> a necessary step on the way to reduce the amount of copying done by
> the driver.

I've not tried to understand the patches, but you have to be
very careful using non-snooped memory for descriptor rings.
No amount of DMA API calls can sort out some of the issues.

Basically you must not dirty a cache line that contains data
that the MAC unit might still write to.

For the receive ring this means that you must not setup
new rx buffers for ring entries until the MAC unit has
filled all the ring entries in the same cache line.
This probably means only adding rx buffers in blocks
of 8 or 16 (or even more if there are large cache lines).

I can't see any code in the patch that does this.

Doing the same for the tx ring is more difficult, especially
if you can't stop the MAC unit polling the TX ring on a
timer basis.
Basically you can only give the MAC tx packets if either
it is idle, or if the tx ring containing the new entries
starts on a cache line.
If the MAC unit is polling the ring, then to give it
multiple items you may need to update the 'owner' bit
in the first ring entry last - just in case the cache
line gets written out before you've finished.

	David

^ permalink raw reply

* [PATCH] net: ICMPv6 packets transmitted on wrong interface if nfmark is mangled
From: Dries De Winter @ 2012-12-03 12:46 UTC (permalink / raw)
  To: David Miller; +Cc: pablo, kaber, netdev, netfilter-devel, Dries De Winter
In-Reply-To: <20121130.122243.710720011890818822.davem@davemloft.net>

The IPv6 mangle table may change the source/destination address and skb->mark
of a packet. Therefore it may be necessary to "reroute" a packet after it
traversed this table. But this should not happen for some special packets like
neighbour solicitations and MLD reports: they have an explicit destination, not
originating from the routing table. Rerouting these packets may cause them to
go out on the wrong interface or not to go out at all depending on the routing
table.

This patch allows marking a dst_entry as "non-reroutable". icmp6_dst_alloc()
(used by ndisc and MLD implementation) will always mark the allocated dst_entry
as such. A check is added to netfilter (IPv6-only) so packets heading for a
non-reroutable destination are never rerouted.

Remarks:

(1) dst entries allocated by addrconf_dst_alloc() are added to the routing
table like normal routes and skbuffs get assigned such dst entries by normal
rule lookup / route lookup. Therefore it's not needed to mark those dst
entries as non-reroutable: if an skbuff got assigned such a dst entry by
normal routing in the first place, and the changes done by the mangle table
don't affect routing, rerouting the packet will get it there too.

(2) Similar logic exists in IPv4 so local multicast/broadcast messages are
potentially transmitted on the wrong interface. However, it's a less likely
corner case there because those packets are treated differently by local
output routing: multicast/broadcast messages are by default routed to the
interface with a matching source IP-address. But this logic is invalid because
it is allowed to (1) send messages with a source IP-address different from
your own and (2) to assign the same IP-address on multiple interfaces.
So ideally in IPv4 some dsts should be marked as non-reroutable as well.

Signed-off-by: Dries De Winter <dries.dewinter@gmail.com>
---
 include/net/dst.h    |    1 +
 net/ipv6/netfilter.c |    4 ++++
 net/ipv6/route.c     |    2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 9a78810..cb6ae51 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -61,6 +61,7 @@ struct dst_entry {
 #define DST_NOPEER		0x0040
 #define DST_FAKE_RTABLE		0x0080
 #define DST_XFRM_TUNNEL		0x0100
+#define DST_NOREROUTE		0x0200

 	unsigned short		pending_confirm;

diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 429089c..cf9e871 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -24,6 +24,10 @@ int ip6_route_me_harder(struct sk_buff *skb)
 		.saddr = iph->saddr,
 	};

+	dst = skb_dst(skb);
+	if (dst && (dst->flags & DST_NOREROUTE))
+		return 0;
+
 	dst = ip6_route_output(net, skb->sk, &fl6);
 	if (dst->error) {
 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b1e6cf0..8fa7db5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1225,7 +1225,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 		}
 	}

-	rt->dst.flags |= DST_HOST;
+	rt->dst.flags |= DST_HOST | DST_NOREROUTE;
 	rt->dst.output  = ip6_output;
 	rt->n = neigh;
 	atomic_set(&rt->dst.__refcnt, 1);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH net-next v2] bridge: export multicast database via netlink
From: Cong Wang @ 2012-12-03 13:03 UTC (permalink / raw)
  To: netdev
  Cc: bridge, Cong Wang, Herbert Xu, Stephen Hemminger, David S. Miller,
	Thomas Graf, Jesper Dangaard Brouer

V2: drop patch 1/2, export ifindex directly
    Redesign netlink attributes
    Improve netlink seq check
    Handle IPv6 addr as well

TODO: remove debugging printk's

This patch exports bridge multicast database via netlink
message type RTM_GETMDB. Similar to fdb, but currently bridge-specific.
We may need to support modifying the multicast database too (RTM_{ADD,DEL}MDB).

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
    
---
 include/uapi/linux/if_bridge.h |   55 +++++++++++++
 include/uapi/linux/rtnetlink.h |    3 +
 net/bridge/Makefile            |    2 +-
 net/bridge/br_mdb.c            |  173 ++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_multicast.c      |    2 +
 net/bridge/br_private.h        |    2 +
 6 files changed, 236 insertions(+), 1 deletions(-)

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index b388579..9a0f6ff 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -116,4 +116,59 @@ enum {
 	__IFLA_BRIDGE_MAX,
 };
 #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1)
+
+/* Bridge multicast database attributes
+ * [MDBA_MDB] = {
+ *     [MDBA_MDB_ENTRY] = {
+ *         [MDBA_MDB_ENTRY_INFO]
+ *     }
+ * }
+ * [MDBA_ROUTER] = {
+ *    [MDBA_ROUTER_PORT]
+ * }
+ */
+enum {
+	MDBA_UNSPEC,
+	MDBA_MDB,
+	MDBA_ROUTER,
+	__MDBA_MAX,
+};
+#define MDBA_MAX (__MDBA_MAX - 1)
+
+enum {
+	MDBA_MDB_UNSPEC,
+	MDBA_MDB_ENTRY,
+	__MDBA_MDB_MAX,
+};
+#define MDBA_MDB_MAX (__MDBA_MDB_MAX - 1)
+
+enum {
+	MDBA_MDB_ENTRY_UNSPEC,
+	MDBA_MDB_ENTRY_INFO,
+	__MDBA_MDB_ENTRY_MAX,
+};
+#define MDBA_MDB_ENTRY_MAX (__MDBA_MDB_ENTRY_MAX - 1)
+
+enum {
+	MDBA_ROUTER_UNSPEC,
+	MDBA_ROUTER_PORT,
+	__MDBA_ROUTER_MAX,
+};
+#define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1)
+
+struct br_port_msg {
+	__u32 ifindex;
+};
+
+struct br_mdb_entry {
+	__u32 ifindex;
+	struct {
+		union {
+			__be32	ip4;
+			struct in6_addr ip6;
+		} u;
+		__be16		proto;
+	} addr;
+};
+
 #endif /* _UAPI_LINUX_IF_BRIDGE_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 3dee071..0df623f 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -125,6 +125,9 @@ enum {
 	RTM_GETNETCONF = 82,
 #define RTM_GETNETCONF RTM_GETNETCONF
 
+	RTM_GETMDB = 86,
+#define RTM_GETMDB RTM_GETMDB
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index d0359ea..e859098 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -12,6 +12,6 @@ bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
 
 bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
 
-bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o
+bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
 
 obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
new file mode 100644
index 0000000..6ab6149
--- /dev/null
+++ b/net/bridge/br_mdb.c
@@ -0,0 +1,173 @@
+#include <linux/err.h>
+#include <linux/if_ether.h>
+#include <linux/igmp.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/addrconf.h>
+#include <net/ip6_checksum.h>
+#endif
+
+#include "br_private.h"
+
+static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
+			       struct net_device *dev)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_port *p;
+	struct hlist_node *n;
+	struct nlattr *nest;
+
+	if (!br->multicast_router || hlist_empty(&br->router_list)) {
+		printk(KERN_INFO "no router on bridge\n");
+		return 0;
+	}
+
+	nest = nla_nest_start(skb, MDBA_ROUTER);
+	if (nest == NULL)
+		return -EMSGSIZE;
+
+	hlist_for_each_entry_rcu(p, n, &br->router_list, rlist) {
+		if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex))
+			goto fail;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+fail:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
+			    struct net_device *dev)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_mdb_htable *mdb;
+	struct nlattr *nest, *nest2;
+	int i, err = 0;
+	int idx = 0, s_idx = cb->args[1];
+
+	if (br->multicast_disabled) {
+		printk(KERN_INFO "multicast is disabled on bridge\n");
+		return 0;
+	}
+
+	mdb = rcu_dereference(br->mdb);
+	if (!mdb) {
+		printk(KERN_INFO "no mdb on bridge\n");
+		return 0;
+	}
+
+	nest = nla_nest_start(skb, MDBA_MDB);
+	if (nest == NULL)
+		return -EMSGSIZE;
+
+	printk(KERN_INFO "s_idx = %d\n", s_idx);
+	for (i = 0; i < mdb->max; i++) {
+		struct hlist_node *h;
+		struct net_bridge_mdb_entry *mp;
+		struct net_bridge_port_group *p, **pp;
+		struct net_bridge_port *port;
+
+		hlist_for_each_entry_rcu(mp, h, &mdb->mhash[i], hlist[mdb->ver]) {
+			if (idx < s_idx)
+				goto skip;
+
+			nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
+			if (nest2 == NULL) {
+				err = -EMSGSIZE;
+				goto out;
+			}
+
+			for (pp = &mp->ports;
+			     (p = rcu_dereference(*pp)) != NULL;
+			      pp = &p->next) {
+				port = p->port;
+				if (port) {
+					struct br_mdb_entry e;
+					e.ifindex = port->dev->ifindex;
+					e.addr.u.ip4 = p->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+					e.addr.u.ip6 = p->addr.u.ip6;
+#endif
+					e.addr.proto = p->addr.proto;
+					if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
+						nla_nest_cancel(skb, nest2);
+						err = -EMSGSIZE;
+						goto out;
+					}
+					printk(KERN_INFO "port %s, mcaddr: %pI4\n", port->dev->name, &p->addr.u.ip4);
+				}
+			}
+			nla_nest_end(skb, nest2);
+		skip:
+			idx++;
+		}
+	}
+
+out:
+	cb->args[1] = idx;
+	cb->args[2] = mdb->seq;
+	nla_nest_end(skb, nest);
+	return err;
+}
+
+static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net_device *dev;
+	struct net *net = sock_net(skb->sk);
+	struct nlmsghdr *nlh;
+	int idx = 0, s_idx;
+
+	s_idx = cb->args[0];
+
+	rcu_read_lock();
+
+	for_each_netdev_rcu(net, dev) {
+		if (dev->priv_flags & IFF_EBRIDGE) {
+			struct br_port_msg *bpm;
+
+			if (idx < s_idx)
+				goto cont;
+
+			nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+					cb->nlh->nlmsg_seq, RTM_GETMDB,
+					sizeof(*bpm), NLM_F_MULTI);
+			if (nlh == NULL)
+				break;
+
+			bpm = nlmsg_data(nlh);
+			bpm->ifindex = dev->ifindex;
+			if (br_mdb_fill_info(skb, cb, dev) < 0) {
+				printk(KERN_INFO "br_mdb_fill_info failed\n");
+				goto out;
+			}
+			if (br_rports_fill_info(skb, cb, dev) < 0) {
+				printk(KERN_INFO "br_rports_fill_info failed\n");
+				goto out;
+			}
+
+			nlmsg_end(skb, nlh);
+		cont:
+			idx++;
+		}
+	}
+
+out:
+	cb->seq = cb->args[2];
+	rcu_read_unlock();
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+void br_mdb_init(void)
+{
+	rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL);
+}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 2417434..d53e4f4 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -322,6 +322,7 @@ static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max,
 
 	mdb->size = old ? old->size : 0;
 	mdb->ver = old ? old->ver ^ 1 : 0;
+	mdb->seq = old ? (old->seq + 1): 0;
 
 	if (!old || elasticity)
 		get_random_bytes(&mdb->secret, sizeof(mdb->secret));
@@ -1584,6 +1585,7 @@ void br_multicast_init(struct net_bridge *br)
 		    br_multicast_querier_expired, (unsigned long)br);
 	setup_timer(&br->multicast_query_timer, br_multicast_query_expired,
 		    (unsigned long)br);
+	br_mdb_init();
 }
 
 void br_multicast_open(struct net_bridge *br)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index eb9cd42..6484069 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -105,6 +105,7 @@ struct net_bridge_mdb_htable
 	u32				max;
 	u32				secret;
 	u32				ver;
+	u32				seq;
 };
 
 struct net_bridge_port
@@ -432,6 +433,7 @@ extern int br_multicast_set_port_router(struct net_bridge_port *p,
 extern int br_multicast_toggle(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+extern void br_mdb_init(void);
 
 static inline bool br_multicast_is_router(struct net_bridge *br)
 {

^ permalink raw reply related

* [PATCH iproute2 v2] Add mdb command to bridge
From: Cong Wang @ 2012-12-03 13:03 UTC (permalink / raw)
  To: netdev
  Cc: bridge, Cong Wang, Herbert Xu, Stephen Hemminger, David S. Miller,
	Thomas Graf, Jesper Dangaard Brouer
In-Reply-To: <1354539824-7898-1-git-send-email-amwang@redhat.com>

V2: sync with the kernel patch
    handle IPv6 addr
    a few cleanups

Sample output:

	# ./bridge/bridge mdb
	bridge dev br0
	multicast database:
	port eth0, group 224.8.8.9
	port eth1, group 224.8.8.8
	router ports: 2

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 bridge/Makefile    |    2 +-
 bridge/br_common.h |    1 +
 bridge/bridge.c    |    1 +
 bridge/mdb.c       |  181 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 184 insertions(+), 1 deletions(-)

diff --git a/bridge/Makefile b/bridge/Makefile
index 9a6743e..67aceb4 100644
--- a/bridge/Makefile
+++ b/bridge/Makefile
@@ -1,4 +1,4 @@
-BROBJ = bridge.o fdb.o monitor.o link.o
+BROBJ = bridge.o fdb.o monitor.o link.o mdb.o
 
 include ../Config
 
diff --git a/bridge/br_common.h b/bridge/br_common.h
index 718ecb9..67fd75c 100644
--- a/bridge/br_common.h
+++ b/bridge/br_common.h
@@ -5,6 +5,7 @@ extern int print_fdb(const struct sockaddr_nl *who,
 		     struct nlmsghdr *n, void *arg);
 
 extern int do_fdb(int argc, char **argv);
+extern int do_mdb(int argc, char **argv);
 extern int do_monitor(int argc, char **argv);
 
 extern int preferred_family;
diff --git a/bridge/bridge.c b/bridge/bridge.c
index e2c33b0..1fcd365 100644
--- a/bridge/bridge.c
+++ b/bridge/bridge.c
@@ -43,6 +43,7 @@ static const struct cmd {
 	int (*func)(int argc, char **argv);
 } cmds[] = {
 	{ "fdb", 	do_fdb },
+	{ "mdb", 	do_mdb },
 	{ "monitor",	do_monitor },
 	{ "help",	do_help },
 	{ 0 }
diff --git a/bridge/mdb.c b/bridge/mdb.c
new file mode 100644
index 0000000..3e62331
--- /dev/null
+++ b/bridge/mdb.c
@@ -0,0 +1,181 @@
+/*
+ * Get mdb table with netlink
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <linux/if_bridge.h>
+#include <linux/if_ether.h>
+#include <linux/neighbour.h>
+#include <linux/if_bridge.h>
+#include <string.h>
+#include <arpa/inet.h>
+
+#include "libnetlink.h"
+#include "br_common.h"
+#include "rt_names.h"
+#include "utils.h"
+
+#ifndef MDBA_RTA
+#define MDBA_RTA(r) \
+	((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct br_port_msg))))
+#endif
+
+int filter_index;
+
+static void usage(void)
+{
+	fprintf(stderr, "       bridge mdb {show} [ dev DEV ]\n");
+	exit(-1);
+}
+
+static void br_print_router_ports(FILE *f, struct rtattr *attr)
+{
+	uint32_t *port_ifindex;
+	struct rtattr *i;
+	int rem;
+
+	rem = RTA_PAYLOAD(attr);
+	for (i = RTA_DATA(attr); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
+		port_ifindex = RTA_DATA(i);
+		fprintf(f, "%s ", ll_index_to_name(*port_ifindex));
+	}
+	fprintf(f, "\n");
+}
+
+static void print_mdb_entry(FILE *f, struct br_mdb_entry *e)
+{
+	SPRINT_BUF(abuf);
+
+	if (e->addr.proto == htons(ETH_P_IP))
+		fprintf(f, "port %s, group %s\n", ll_index_to_name(e->ifindex),
+			inet_ntop(AF_INET, &e->addr.u.ip4, abuf, sizeof(abuf)));
+	else
+		fprintf(f, "port %s, group %s\n", ll_index_to_name(e->ifindex),
+			inet_ntop(AF_INET6, &e->addr.u.ip6, abuf, sizeof(abuf)));
+}
+
+static void br_print_mdb_entry(FILE *f, struct rtattr *attr)
+{
+	struct rtattr *i;
+	int rem;
+	struct br_mdb_entry *e;
+
+	rem = RTA_PAYLOAD(attr);
+	for (i = RTA_DATA(attr); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
+		e = RTA_DATA(i);
+		print_mdb_entry(f, e);
+	}
+}
+
+int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = arg;
+	struct br_port_msg *r = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[MDBA_MAX+1];
+
+	if (n->nlmsg_type != RTM_GETMDB) {
+		fprintf(stderr, "Not RTM_GETMDB: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+
+		return 0;
+	}
+
+	len -= NLMSG_LENGTH(sizeof(*r));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	if (filter_index && filter_index != r->ifindex)
+		return 0;
+
+	if (!filter_index && r->ifindex)
+		fprintf(fp, "bridge dev %s\n", ll_index_to_name(r->ifindex));
+
+	parse_rtattr(tb, MDBA_MAX, MDBA_RTA(r), n->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
+
+	if (tb[MDBA_MDB]) {
+		struct rtattr *i;
+		int rem = RTA_PAYLOAD(tb[MDBA_MDB]);
+
+		fprintf(fp, "multicast database:\n");
+
+		for (i = RTA_DATA(tb[MDBA_MDB]); RTA_OK(i, rem); i = RTA_NEXT(i, rem))
+			br_print_mdb_entry(fp, i);
+	}
+
+	if (tb[MDBA_ROUTER]) {
+		struct rtattr *i;
+		int rem = RTA_PAYLOAD(tb[MDBA_ROUTER]);
+
+		fprintf(fp, "router ports: ");
+
+		for (i = RTA_DATA(tb[MDBA_ROUTER]); RTA_OK(i, rem); i = RTA_NEXT(i, rem))
+			br_print_router_ports(fp, i);
+	}
+
+	return 0;
+}
+
+static int mdb_show(int argc, char **argv)
+{
+	char *filter_dev = NULL;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			if (filter_dev)
+				duparg("dev", *argv);
+			filter_dev = *argv;
+		}
+		argc--; argv++;
+	}
+
+	if (filter_dev) {
+		filter_index = if_nametoindex(filter_dev);
+		if (filter_index == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n",
+				filter_dev);
+			return -1;
+		}
+	}
+
+	if (rtnl_wilddump_request(&rth, PF_BRIDGE, RTM_GETMDB) < 0) {
+		perror("Cannot send dump request");
+		exit(1);
+	}
+
+	if (rtnl_dump_filter(&rth, print_mdb, stdout) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		exit(1);
+	}
+
+	return 0;
+}
+
+int do_mdb(int argc, char **argv)
+{
+	ll_init_map(&rth);
+
+	if (argc > 0) {
+		if (matches(*argv, "show") == 0 ||
+		    matches(*argv, "lst") == 0 ||
+		    matches(*argv, "list") == 0)
+			return mdb_show(argc-1, argv+1);
+		if (matches(*argv, "help") == 0)
+			usage();
+	} else
+		return mdb_show(0, NULL);
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"bridge mdb help\".\n", *argv);
+	exit(-1);
+}

^ permalink raw reply related

* [PATCH net-next 0/2] igb: fixes and improvements for irq fallback
From: Stefan Assmann @ 2012-12-03 13:14 UTC (permalink / raw)
  To: netdev
  Cc: e1000-devel, alexander.h.duyck, carolyn.wyborny,
	jeffrey.t.kirsher, sassmann

The interrupt fallback code should utilize the same code that's used for normal
setup instead of duplicating it. It also has some shortcomings when it comes
to situations where for some reason request_irq() fails to allocate vectors for
MSI-X. Hoping to address these with this patchset.

Would be nice for these patches to go through Intel testing before inclusion.

Stefan Assmann (2):
  igb: remove duplicate code for fallback interrupt initialization
  igb: release already assigned MSI-X interrupts if setup fails

-- 
1.7.11.7

^ permalink raw reply

* [PATCH net-next 2/2] igb: release already assigned MSI-X interrupts if setup fails
From: Stefan Assmann @ 2012-12-03 13:15 UTC (permalink / raw)
  To: netdev
  Cc: e1000-devel, alexander.h.duyck, carolyn.wyborny,
	jeffrey.t.kirsher, sassmann
In-Reply-To: <1354540501-18407-1-git-send-email-sassmann@kpanic.de>

During MSI-X setup the system might run out of vectors. If this happens the
already assigned vectors for this NIC should be freed before trying to
disable MSI-X. Failing to do so results in the following oops.

kernel BUG at drivers/pci/msi.c:341!
[...]
Call Trace:
 [<ffffffff8128f39d>] pci_disable_msix+0x3d/0x60
 [<ffffffffa037d1ce>] igb_reset_interrupt_capability+0x27/0x5c [igb]
 [<ffffffffa037d229>] igb_clear_interrupt_scheme+0x26/0x2d [igb]
 [<ffffffffa0384268>] igb_request_irq+0x73/0x297 [igb]
 [<ffffffffa0384554>] __igb_open+0xc8/0x223 [igb]
 [<ffffffffa0384815>] igb_open+0x13/0x15 [igb]
 [<ffffffff8144592f>] __dev_open+0xbf/0x120
 [<ffffffff81443e51>] __dev_change_flags+0xa1/0x180
 [<ffffffff81445828>] dev_change_flags+0x28/0x70
 [<ffffffff814af537>] devinet_ioctl+0x5b7/0x620
 [<ffffffff814b01c8>] inet_ioctl+0x88/0xa0
 [<ffffffff8142e8a0>] sock_do_ioctl+0x30/0x70
 [<ffffffff8142ecf2>] sock_ioctl+0x72/0x270
 [<ffffffff8118062c>] do_vfs_ioctl+0x8c/0x340
 [<ffffffff81180981>] sys_ioctl+0xa1/0xb0
 [<ffffffff815161a9>] system_call_fastpath+0x16/0x1b
Code: 48 89 df e8 1f 40 ed ff 4d 39 e6 49 8b 45 10 75 b6 48 83 c4 18 5b 41 5c 41 5d 41 5e 41 5f c9 c3 48 8b 7b 20 e8 3e 91 db ff eb ae <0f> 0b eb fe 0f 1f 84 00 00 00 00 00 55 48 89 e5 0f 1f 44 00 00
RIP  [<ffffffff8128e144>] free_msi_irqs+0x124/0x130
 RSP <ffff880037503bd8>

Signed-off-by: Stefan Assmann <sassmann@kpanic.de>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index b0dd5ef..0007b97 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -832,17 +832,18 @@ static int igb_request_msix(struct igb_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
 	struct e1000_hw *hw = &adapter->hw;
-	int i, err = 0, vector = 0;
+	int i, err = 0, vector = 0, free_vector = 0;
 
 	err = request_irq(adapter->msix_entries[vector].vector,
 	                  igb_msix_other, 0, netdev->name, adapter);
 	if (err)
-		goto out;
-	vector++;
+		goto err_out;
 
 	for (i = 0; i < adapter->num_q_vectors; i++) {
 		struct igb_q_vector *q_vector = adapter->q_vector[i];
 
+		vector++;
+
 		q_vector->itr_register = hw->hw_addr + E1000_EITR(vector);
 
 		if (q_vector->rx.ring && q_vector->tx.ring)
@@ -861,13 +862,22 @@ static int igb_request_msix(struct igb_adapter *adapter)
 		                  igb_msix_ring, 0, q_vector->name,
 		                  q_vector);
 		if (err)
-			goto out;
-		vector++;
+			goto err_free;
 	}
 
 	igb_configure_msix(adapter);
 	return 0;
-out:
+
+err_free:
+	/* free already assigned IRQs */
+	free_irq(adapter->msix_entries[free_vector++].vector, adapter);
+
+	vector--;
+	for (i = 0; i < vector; i++) {
+		free_irq(adapter->msix_entries[free_vector++].vector,
+			 adapter->q_vector[i]);
+	}
+err_out:
 	return err;
 }
 
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH net-next 1/2] igb: remove duplicate code for fallback interrupt initialization
From: Stefan Assmann @ 2012-12-03 13:15 UTC (permalink / raw)
  To: netdev
  Cc: e1000-devel, alexander.h.duyck, carolyn.wyborny,
	jeffrey.t.kirsher, sassmann
In-Reply-To: <1354540501-18407-1-git-send-email-sassmann@kpanic.de>

Given a small change to igb_init_interrupt_scheme() the function fits
igb_request_irq() for MSI/legacy interrupts initialization as well, instead of
duplicating most of its code there.

Also adding a missing igb_configure() to igb_request_irq() for MSI fallback
to work properly.

Signed-off-by: Stefan Assmann <sassmann@kpanic.de>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index b85b15a..b0dd5ef 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -122,6 +122,7 @@ static void __devexit igb_remove(struct pci_dev *pdev);
 static int igb_sw_init(struct igb_adapter *);
 static int igb_open(struct net_device *);
 static int igb_close(struct net_device *);
+static void igb_configure(struct igb_adapter *);
 static void igb_configure_tx(struct igb_adapter *);
 static void igb_configure_rx(struct igb_adapter *);
 static void igb_clean_all_tx_rings(struct igb_adapter *);
@@ -948,11 +949,14 @@ static void igb_clear_interrupt_scheme(struct igb_adapter *adapter)
  * Attempt to configure interrupts using the best available
  * capabilities of the hardware and kernel.
  **/
-static void igb_set_interrupt_capability(struct igb_adapter *adapter)
+static void igb_set_interrupt_capability(struct igb_adapter *adapter, bool msix)
 {
 	int err;
 	int numvecs, i;
 
+	if (!msix)
+		goto msi_only;
+
 	/* Number of supported queues. */
 	adapter->num_rx_queues = adapter->rss_queues;
 	if (adapter->vfs_allocated_count)
@@ -1199,12 +1203,12 @@ err_out:
  *
  * This function initializes the interrupts and allocates all of the queues.
  **/
-static int igb_init_interrupt_scheme(struct igb_adapter *adapter)
+static int igb_init_interrupt_scheme(struct igb_adapter *adapter, bool msix)
 {
 	struct pci_dev *pdev = adapter->pdev;
 	int err;
 
-	igb_set_interrupt_capability(adapter);
+	igb_set_interrupt_capability(adapter, msix);
 
 	err = igb_alloc_q_vectors(adapter);
 	if (err) {
@@ -1240,20 +1244,15 @@ static int igb_request_irq(struct igb_adapter *adapter)
 		/* fall back to MSI */
 		igb_free_all_tx_resources(adapter);
 		igb_free_all_rx_resources(adapter);
+
 		igb_clear_interrupt_scheme(adapter);
-		if (!pci_enable_msi(pdev))
-			adapter->flags |= IGB_FLAG_HAS_MSI;
-		adapter->num_tx_queues = 1;
-		adapter->num_rx_queues = 1;
-		adapter->num_q_vectors = 1;
-		err = igb_alloc_q_vectors(adapter);
-		if (err) {
-			dev_err(&pdev->dev,
-			        "Unable to allocate memory for vectors\n");
+		err = igb_init_interrupt_scheme(adapter, false);
+		if (err)
 			goto request_done;
-		}
+
 		igb_setup_all_tx_resources(adapter);
 		igb_setup_all_rx_resources(adapter);
+		igb_configure(adapter);
 	}
 
 	igb_assign_vector(adapter->q_vector[0], 0);
@@ -2444,7 +2443,7 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)
 				GFP_ATOMIC);
 
 	/* This call may decrease the number of queues */
-	if (igb_init_interrupt_scheme(adapter)) {
+	if (igb_init_interrupt_scheme(adapter, true)) {
 		dev_err(&pdev->dev, "Unable to allocate memory for queues\n");
 		return -ENOMEM;
 	}
@@ -6818,7 +6817,7 @@ static int igb_resume(struct device *dev)
 	pci_enable_wake(pdev, PCI_D3hot, 0);
 	pci_enable_wake(pdev, PCI_D3cold, 0);
 
-	if (igb_init_interrupt_scheme(adapter)) {
+	if (igb_init_interrupt_scheme(adapter, true)) {
 		dev_err(&pdev->dev, "Unable to allocate memory for queues\n");
 		return -ENOMEM;
 	}
-- 
1.7.11.7

^ permalink raw reply related

* [PATCH] tun: only queue packets on device
From: Michael S. Tsirkin @ 2012-12-03 13:19 UTC (permalink / raw)
  To: David S. Miller, netdev
  Cc: Jason Wang, Michael S. Tsirkin, Neil Horman, Rami Rosen,
	linux-kernel

Historically tun supported two modes of operation:
- in default mode, a small number of packets would get queued
  at the device, the rest would be queued in qdisc
- in one queue mode, all packets would get queued at the device

This might have made sense up to a point where we made the
queue depth for both modes the same and set it to
a huge value (500) so unless the consumer
is stuck the chance of losing packets is small.

Thus in practice both modes behave the same, but the
default mode has some problems:
- if packets are never consumed, fragments are never orphaned
  which causes a DoS for a sender using zero-copy transmit
- overrun errors are hard to diagnose: fifo error is incremented
  only once so you can not distinguish between
  userspace that is stuck and a transient failure,
  tcpdump on the device does not show any traffic

Userspace solves this simply by enabling IFF_ONE_QUEUE
but there seems to be little point in not doing the
right thing for everyone, by default.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/net/tun.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 607a3a5..ad5c5fc 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -693,21 +693,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * number of queues.
 	 */
 	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
-			  >= dev->tx_queue_len / tun->numqueues){
-		if (!(tun->flags & TUN_ONE_QUEUE)) {
-			/* Normal queueing mode. */
-			/* Packet scheduler handles dropping of further packets. */
-			netif_stop_subqueue(dev, txq);
-
-			/* We won't see all dropped packets individually, so overrun
-			 * error is more appropriate. */
-			dev->stats.tx_fifo_errors++;
-		} else {
-			/* Single queue mode.
-			 * Driver handles dropping of all packets itself. */
-			goto drop;
-		}
-	}
+			  >= dev->tx_queue_len / tun->numqueues)
+		goto drop;
 
 	/* Orphan the skb - required as we might hang on to it
 	 * for indefinite time. */
@@ -1322,7 +1309,6 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			schedule();
 			continue;
 		}
-		netif_wake_subqueue(tun->dev, tfile->queue_index);
 
 		ret = tun_put_user(tun, tfile, skb, iv, len);
 		kfree_skb(skb);
-- 
MST

^ permalink raw reply related

* Re: [PATCH v2] net/macb: Use non-coherent memory for rx buffers
From: Nicolas Ferre @ 2012-12-03 13:21 UTC (permalink / raw)
  To: David Laight
  Cc: David S. Miller, netdev, linux-arm-kernel, linux-kernel,
	Joachim Eastwood, Jean-Christophe PLAGNIOL-VILLARD,
	Havard Skinnemoen
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6026B70D5@saturn3.aculab.com>

On 12/03/2012 01:43 PM, David Laight :
>> Allocate regular pages to use as backing for the RX ring and use the
>> DMA API to sync the caches. This should give a bit better performance
>> since it allows the CPU to do burst transfers from memory. It is also
>> a necessary step on the way to reduce the amount of copying done by
>> the driver.
> 
> I've not tried to understand the patches, but you have to be
> very careful using non-snooped memory for descriptor rings.
> No amount of DMA API calls can sort out some of the issues.

David,

Maybe I have not described the patch properly but the non-coherent
memory is not used for descriptor rings. It is used for the DMA buffers
pointed to by the descriptors (which are themselves allocated as coherent memory).

As buffers are filled up by the interface DMA and then, afterwards, used
by the driver to pass data to the net layer, it seems to me that the use
of non-coherent memory is sensible.

Do you still have reluctance with this patch?

Best regards,
-- 
Nicolas Ferre

^ permalink raw reply

* Re: [PATCH] atm: introduce vcc_pop_skb()
From: David Woodhouse @ 2012-12-03 13:22 UTC (permalink / raw)
  To: Krzysztof Mazur; +Cc: chas williams - CONTRACTOR, davem, netdev, linux-kernel
In-Reply-To: <20121128223300.GA12527@shrek.podlesie.net>

[-- Attachment #1: Type: text/plain, Size: 1190 bytes --]

On Wed, 2012-11-28 at 23:33 +0100, Krzysztof Mazur wrote:
> 
> Many ATM drivers store vcc in ATM_SKB(skb)->vcc and use it for
> freeing skbs. Now they can just use atm_pop_skb() to free such
> buffers.
> 
> Signed-off-by: Krzysztof Mazur <krzysiek@podlesie.net>

Note that this one didn't make it into the tree that Dave just pulled.

Not that I didn't think it was a good idea, but it was just separate
from the other "real" fixes — and the tree had already grown into a big
enough pile from your original single patch!

In <20121006154606.GA25588@shrek.podlesie.net> you posted another patch:
> I think there is another problem here. The pppoatm gets a reference
> to atmvcc, but I don't see anything that protects against removal
> of that vcc.
> 
> The vcc uses vcc->sk socket for reference counting, so sock_hold()
> and sock_put() should be used by pppoatm.

That one I think *isn't* needed, because we have properly fixed the
races with vcc_destroy_socket(). I just wanted to check you agree...?

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation




[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply

* [PATCH 1/3] net: cpsw: replace pr_xxx with dev_xxx functions
From: Jan Luebbe @ 2012-12-03 13:49 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Mugunthan V N, Vaibhav Hiremath,
	linux-arm-kernel, linux-omap, Jan Luebbe

Signed-off-by: Jan Luebbe <jlu@pengutronix.de>
---
 drivers/net/ethernet/ti/cpsw.c |   47 ++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index fb1a692..c0e676a 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -729,7 +729,7 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 		return -EINVAL;
 
 	if (of_property_read_u32(node, "slaves", &prop)) {
-		pr_err("Missing slaves property in the DT.\n");
+		dev_err(&pdev->dev, "Missing slaves property in the DT.\n");
 		return -EINVAL;
 	}
 	data->slaves = prop;
@@ -737,91 +737,91 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 	data->slave_data = kzalloc(sizeof(struct cpsw_slave_data) *
 				   data->slaves, GFP_KERNEL);
 	if (!data->slave_data) {
-		pr_err("Could not allocate slave memory.\n");
+		dev_err(&pdev->dev, "Could not allocate slave memory.\n");
 		return -EINVAL;
 	}
 
 	data->no_bd_ram = of_property_read_bool(node, "no_bd_ram");
 
 	if (of_property_read_u32(node, "cpdma_channels", &prop)) {
-		pr_err("Missing cpdma_channels property in the DT.\n");
+		dev_err(&pdev->dev, "Missing cpdma_channels property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->channels = prop;
 
 	if (of_property_read_u32(node, "host_port_no", &prop)) {
-		pr_err("Missing host_port_no property in the DT.\n");
+		dev_err(&pdev->dev, "Missing host_port_no property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->host_port_num = prop;
 
 	if (of_property_read_u32(node, "cpdma_reg_ofs", &prop)) {
-		pr_err("Missing cpdma_reg_ofs property in the DT.\n");
+		dev_err(&pdev->dev, "Missing cpdma_reg_ofs property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->cpdma_reg_ofs = prop;
 
 	if (of_property_read_u32(node, "cpdma_sram_ofs", &prop)) {
-		pr_err("Missing cpdma_sram_ofs property in the DT.\n");
+		dev_err(&pdev->dev, "Missing cpdma_sram_ofs property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->cpdma_sram_ofs = prop;
 
 	if (of_property_read_u32(node, "ale_reg_ofs", &prop)) {
-		pr_err("Missing ale_reg_ofs property in the DT.\n");
+		dev_err(&pdev->dev, "Missing ale_reg_ofs property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->ale_reg_ofs = prop;
 
 	if (of_property_read_u32(node, "ale_entries", &prop)) {
-		pr_err("Missing ale_entries property in the DT.\n");
+		dev_err(&pdev->dev, "Missing ale_entries property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->ale_entries = prop;
 
 	if (of_property_read_u32(node, "host_port_reg_ofs", &prop)) {
-		pr_err("Missing host_port_reg_ofs property in the DT.\n");
+		dev_err(&pdev->dev, "Missing host_port_reg_ofs property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->host_port_reg_ofs = prop;
 
 	if (of_property_read_u32(node, "hw_stats_reg_ofs", &prop)) {
-		pr_err("Missing hw_stats_reg_ofs property in the DT.\n");
+		dev_err(&pdev->dev, "Missing hw_stats_reg_ofs property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->hw_stats_reg_ofs = prop;
 
 	if (of_property_read_u32(node, "bd_ram_ofs", &prop)) {
-		pr_err("Missing bd_ram_ofs property in the DT.\n");
+		dev_err(&pdev->dev, "Missing bd_ram_ofs property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->bd_ram_ofs = prop;
 
 	if (of_property_read_u32(node, "bd_ram_size", &prop)) {
-		pr_err("Missing bd_ram_size property in the DT.\n");
+		dev_err(&pdev->dev, "Missing bd_ram_size property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->bd_ram_size = prop;
 
 	if (of_property_read_u32(node, "rx_descs", &prop)) {
-		pr_err("Missing rx_descs property in the DT.\n");
+		dev_err(&pdev->dev, "Missing rx_descs property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
 	data->rx_descs = prop;
 
 	if (of_property_read_u32(node, "mac_control", &prop)) {
-		pr_err("Missing mac_control property in the DT.\n");
+		dev_err(&pdev->dev, "Missing mac_control property in the DT.\n");
 		ret = -EINVAL;
 		goto error_ret;
 	}
@@ -833,14 +833,14 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 		const void *mac_addr = NULL;
 
 		if (of_property_read_string(slave_node, "phy_id", &phy_id)) {
-			pr_err("Missing slave[%d] phy_id property\n", i);
+			dev_err(&pdev->dev, "Missing slave[%d] phy_id property.\n", i);
 			ret = -EINVAL;
 			goto error_ret;
 		}
 		slave_data->phy_id = phy_id;
 
 		if (of_property_read_u32(slave_node, "slave_reg_ofs", &prop)) {
-			pr_err("Missing slave[%d] slave_reg_ofs property\n", i);
+			dev_err(&pdev->dev, "Missing slave[%d] slave_reg_ofs property.\n", i);
 			ret = -EINVAL;
 			goto error_ret;
 		}
@@ -848,8 +848,7 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 
 		if (of_property_read_u32(slave_node, "sliver_reg_ofs",
 					 &prop)) {
-			pr_err("Missing slave[%d] sliver_reg_ofs property\n",
-				i);
+			dev_err(&pdev->dev, "Missing slave[%d] sliver_reg_ofs property.\n", i);
 			ret = -EINVAL;
 			goto error_ret;
 		}
@@ -868,7 +867,7 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 	ret = of_platform_populate(node, NULL, NULL, &pdev->dev);
 	/* We do not want to force this, as in some cases may not have child */
 	if (ret)
-		pr_warn("Doesn't have any child node\n");
+		dev_warn(&pdev->dev, "Doesn't have any child node\n");
 
 	return 0;
 
@@ -890,7 +889,7 @@ static int __devinit cpsw_probe(struct platform_device *pdev)
 
 	ndev = alloc_etherdev(sizeof(struct cpsw_priv));
 	if (!ndev) {
-		pr_err("error allocating net_device\n");
+		pr_err("cpsw: error allocating net_device\n");
 		return -ENOMEM;
 	}
 
@@ -909,7 +908,7 @@ static int __devinit cpsw_probe(struct platform_device *pdev)
 	pm_runtime_enable(&pdev->dev);
 
 	if (cpsw_probe_dt(&priv->data, pdev)) {
-		pr_err("cpsw: platform data missing\n");
+		dev_err(&pdev->dev, "platform data missing\n");
 		ret = -ENODEV;
 		goto clean_ndev_ret;
 	}
@@ -917,10 +916,10 @@ static int __devinit cpsw_probe(struct platform_device *pdev)
 
 	if (is_valid_ether_addr(data->slave_data[0].mac_addr)) {
 		memcpy(priv->mac_addr, data->slave_data[0].mac_addr, ETH_ALEN);
-		pr_info("Detected MACID = %pM", priv->mac_addr);
+		dev_info(&pdev->dev, "Detected MACID = %pM", priv->mac_addr);
 	} else {
 		eth_random_addr(priv->mac_addr);
-		pr_info("Random MACID = %pM", priv->mac_addr);
+		dev_info(&pdev->dev, "Random MACID = %pM", priv->mac_addr);
 	}
 
 	memcpy(ndev->dev_addr, priv->mac_addr, ETH_ALEN);
@@ -1120,7 +1119,7 @@ static int __devexit cpsw_remove(struct platform_device *pdev)
 	struct net_device *ndev = platform_get_drvdata(pdev);
 	struct cpsw_priv *priv = netdev_priv(ndev);
 
-	pr_info("removing device");
+	dev_info(&pdev->dev, "removing device");
 	platform_set_drvdata(pdev, NULL);
 
 	free_irq(ndev->irq, priv);
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 2/3] net: cpsw: verify correct number of slaves in DT
From: Jan Luebbe @ 2012-12-03 13:49 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Mugunthan V N, Vaibhav Hiremath,
	linux-arm-kernel, linux-omap, Jan Luebbe
In-Reply-To: <1354542569-6165-1-git-send-email-jlu@pengutronix.de>

Check that the number of available slaves passed from DT matches the
value of the "slaves" property in the cpsw node. Otherwise, priv->slaves
would be the wrong size.

Signed-off-by: Jan Luebbe <jlu@pengutronix.de>
---
 drivers/net/ethernet/ti/cpsw.c |   17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index c0e676a..8de3e92 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -832,6 +832,16 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 		const char *phy_id = NULL;
 		const void *mac_addr = NULL;
 
+		if (!of_device_is_available(slave_node))
+			continue;
+
+		if (i >= data->slaves) {
+			dev_err(&pdev->dev, "Too many slaves in the DT (> %d).\n",
+				data->slaves);
+			ret = -EINVAL;
+			goto error_ret;
+		}
+
 		if (of_property_read_string(slave_node, "phy_id", &phy_id)) {
 			dev_err(&pdev->dev, "Missing slave[%d] phy_id property.\n", i);
 			ret = -EINVAL;
@@ -861,6 +871,13 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 		i++;
 	}
 
+	if (i < data->slaves) {
+		dev_err(&pdev->dev, "Not enough slaves in the DT (< %d).\n",
+			data->slaves);
+		ret = -EINVAL;
+		goto error_ret;
+	}
+
 	/*
 	 * Populate all the child nodes here...
 	 */
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 3/3] net: cpsw: implement ioctl for MII
From: Jan Luebbe @ 2012-12-03 13:49 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Mugunthan V N, Vaibhav Hiremath,
	linux-arm-kernel, linux-omap, Jan Luebbe
In-Reply-To: <1354542569-6165-1-git-send-email-jlu@pengutronix.de>

This allows using tools like mii-diag on CPSW.

Signed-off-by: Jan Luebbe <jlu@pengutronix.de>
---
 drivers/net/ethernet/ti/cpsw.c |   15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 8de3e92..f476c03 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -629,6 +629,20 @@ static void cpsw_ndo_change_rx_flags(struct net_device *ndev, int flags)
 		dev_err(&ndev->dev, "multicast traffic cannot be filtered!\n");
 }
 
+static int cpsw_ndo_do_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
+{
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	struct phy_device *phy = priv->slaves[0].phy;
+
+	if (!netif_running(ndev))
+		return -EINVAL;
+
+	if (!phy)
+		return -ENODEV;
+
+	return phy_mii_ioctl(phy, rq, cmd);
+}
+
 static void cpsw_ndo_tx_timeout(struct net_device *ndev)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
@@ -670,6 +684,7 @@ static const struct net_device_ops cpsw_netdev_ops = {
 	.ndo_start_xmit		= cpsw_ndo_start_xmit,
 	.ndo_change_rx_flags	= cpsw_ndo_change_rx_flags,
 	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_do_ioctl		= cpsw_ndo_do_ioctl,
 	.ndo_change_mtu		= eth_change_mtu,
 	.ndo_tx_timeout		= cpsw_ndo_tx_timeout,
 	.ndo_get_stats		= cpsw_ndo_get_stats,
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH] net/macb: Use dmapool to align descriptors on 64bits
From: Ben Hutchings @ 2012-12-03 14:01 UTC (permalink / raw)
  To: Nicolas Ferre
  Cc: David S. Miller, netdev, linux-arm-kernel, linux-kernel,
	Joachim Eastwood, Jean-Christophe PLAGNIOL-VILLARD
In-Reply-To: <1354536914-6315-1-git-send-email-nicolas.ferre@atmel.com>

On Mon, 2012-12-03 at 13:15 +0100, Nicolas Ferre wrote:
> Depending on datapath, some revisions of GEM need
> 64bits aligned descriptors. Use dmapool to allocate
> these descriptors.
> Note that different size between RX and TX rings
> leads to the creation of two pools.
> 
> Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
[...]

dma_alloc_coherent() allocates whole pages, which I think is quite
enough alignment.  You can't save memory by doing this, since each pool
needs at least one page.  So what is this change meant to achieve?

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [net-next PATCH V2 5/9] net: frag, per CPU resource, mem limit and LRU list accounting
From: Jesper Dangaard Brouer @ 2012-12-03 14:02 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Florian Westphal, netdev, Pablo Neira Ayuso,
	Thomas Graf, Cong Wang, Patrick McHardy, Paul E. McKenney,
	Herbert Xu
In-Reply-To: <1354208776.14302.1898.camel@edumazet-glaptop>

On Thu, 2012-11-29 at 09:06 -0800, Eric Dumazet wrote:
> On Thu, 2012-11-29 at 17:13 +0100, Jesper Dangaard Brouer wrote:
> > The major performance bottleneck on NUMA systems is the mem limit
> > counter, which is based on an atomic counter.  This patch removes the
> > cache-bouncing of the atomic counter, by moving this accounting to be
> > bound to each CPU.  The LRU list also need to be done per CPU,
> > in-order to keep the accounting straight.
> > 
> > If fragments belonging together are "sprayed" across CPUs, performance
> > will still suffer, but due to NIC rxhashing this is not very common.
> > Correct accounting in this situation is maintained by recording and
> > "assigning" a CPU to a frag queue when it's allocated (caused by the
> > first associated packet).
> > 
[...]
> > +/* Need to maintain these resource limits per CPU, else we will kill
> > + * performance due to cache-line bouncing
> > + */
> > +struct frag_cpu_limit {
> > +	atomic_t                mem;
> > +	struct list_head        lru_list;
> > +	spinlock_t              lru_lock;
> > +} ____cacheline_aligned_in_smp;
> > +
> 
> This looks like a big patch introducing a specific infrastructure, while
> we already have lib/percpu_counter.c

For the record, I cannot use the lib/percpu_counter, because this
accounting is not kept strictly per CPU, if the fragments are "sprayed"
across CPUs (as described in the commit message above).

^ permalink raw reply

* Re: [PATCH net-next 3/7] ipv6: improve ipv6_find_hdr() to skip empty routing headers
From: Pablo Neira Ayuso @ 2012-12-03 14:04 UTC (permalink / raw)
  To: Jesse Gross; +Cc: David Miller, netdev, dev, Ansis Atteka
In-Reply-To: <1354214149-33651-4-git-send-email-jesse@nicira.com>

On Thu, Nov 29, 2012 at 10:35:45AM -0800, Jesse Gross wrote:
> From: Ansis Atteka <aatteka@nicira.com>
> 
> This patch prepares the ipv6_find_hdr() function so that it can
> skip routing headers where segments_left is 0. This is
> required to handle the multiple routing header case correctly when
> changing IPv6 addresses.
> 
> Signed-off-by: Ansis Atteka <aatteka@nicira.com>
> Signed-off-by: Jesse Gross <jesse@nicira.com>
> ---
>  include/net/ipv6.h      |    5 +++--
>  net/ipv6/exthdrs_core.c |   36 ++++++++++++++++++++++++++++--------
>  2 files changed, 31 insertions(+), 10 deletions(-)
> 
> diff --git a/include/net/ipv6.h b/include/net/ipv6.h
> index b2f0cfb..acbd8e0 100644
> --- a/include/net/ipv6.h
> +++ b/include/net/ipv6.h
> @@ -631,8 +631,9 @@ extern int			ipv6_skip_exthdr(const struct sk_buff *, int start,
>  extern bool			ipv6_ext_hdr(u8 nexthdr);
>  
>  enum {
> -	IP6_FH_F_FRAG	= (1 << 0),
> -	IP6_FH_F_AUTH	= (1 << 1),
> +	IP6_FH_F_FRAG		= (1 << 0),
> +	IP6_FH_F_AUTH		= (1 << 1),
> +	IP6_FH_F_SKIP_RH	= (1 << 2),
>  };
>  
>  /* find specified header and get offset to it */
> diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
> index 8ea253a..11b4e29 100644
> --- a/net/ipv6/exthdrs_core.c
> +++ b/net/ipv6/exthdrs_core.c
> @@ -132,9 +132,11 @@ EXPORT_SYMBOL(ipv6_skip_exthdr);
>   * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
>   * isn't NULL.
>   *
> - * if flags is not NULL and it's a fragment, then the frag flag IP6_FH_F_FRAG
> - * will be set. If it's an AH header, the IP6_FH_F_AUTH flag is set and
> - * target < 0, then this function will stop at the AH header.
> + * if flags is not NULL and it's a fragment, then the frag flag
> + * IP6_FH_F_FRAG will be set. If it's an AH header, the
> + * IP6_FH_F_AUTH flag is set and target < 0, then this function will
> + * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this
> + * function will skip all those routing headers, where segements_left was 0.
>   */
>  int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
>  		  int target, unsigned short *fragoff, int *flags)
> @@ -142,6 +144,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
>  	unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
>  	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
>  	unsigned int len;
> +	bool found;
>  
>  	if (fragoff)
>  		*fragoff = 0;
> @@ -159,9 +162,10 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
>  	}
>  	len = skb->len - start;
>  
> -	while (nexthdr != target) {

If the offset is passed in as a parameter to ipv6_find_hdr, we now
always enter the loop even if we have already found the target header
we're looking for. Before this change, that didn't happen.

Something seems wrong here to me.

> +	do {
>  		struct ipv6_opt_hdr _hdr, *hp;
>  		unsigned int hdrlen;
> +		found = (nexthdr == target);
>  
>  		if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) {
>  			if (target < 0)
> @@ -172,6 +176,20 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
>  		hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
>  		if (hp == NULL)
>  			return -EBADMSG;
> +
> +		if (nexthdr == NEXTHDR_ROUTING) {
> +			struct ipv6_rt_hdr _rh, *rh;
> +
> +			rh = skb_header_pointer(skb, start, sizeof(_rh),
> +						&_rh);
> +			if (rh == NULL)
> +				return -EBADMSG;
> +
> +			if (flags && (*flags & IP6_FH_F_SKIP_RH) &&
> +			    rh->segments_left == 0)
> +				found = false;
> +		}
> +
>  		if (nexthdr == NEXTHDR_FRAGMENT) {
>  			unsigned short _frag_off;
>  			__be16 *fp;
> @@ -205,10 +223,12 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
>  		} else
>  			hdrlen = ipv6_optlen(hp);
>  
> -		nexthdr = hp->nexthdr;
> -		len -= hdrlen;
> -		start += hdrlen;
> -	}
> +		if (!found) {
> +			nexthdr = hp->nexthdr;
> +			len -= hdrlen;
> +			start += hdrlen;
> +		}
> +	} while (!found);
>  
>  	*offset = start;
>  	return nexthdr;
> -- 
> 1.7.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 3/3] net: cpsw: implement ioctl for MII
From: Ben Hutchings @ 2012-12-03 14:16 UTC (permalink / raw)
  To: Jan Luebbe
  Cc: netdev, David S. Miller, Mugunthan V N, Vaibhav Hiremath,
	linux-arm-kernel, linux-omap
In-Reply-To: <1354542569-6165-3-git-send-email-jlu@pengutronix.de>

On Mon, 2012-12-03 at 14:49 +0100, Jan Luebbe wrote:
> This allows using tools like mii-diag on CPSW.
> 
> Signed-off-by: Jan Luebbe <jlu@pengutronix.de>
> ---
>  drivers/net/ethernet/ti/cpsw.c |   15 +++++++++++++++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index 8de3e92..f476c03 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -629,6 +629,20 @@ static void cpsw_ndo_change_rx_flags(struct net_device *ndev, int flags)
>  		dev_err(&ndev->dev, "multicast traffic cannot be filtered!\n");
>  }
>  
> +static int cpsw_ndo_do_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
> +{
> +	struct cpsw_priv *priv = netdev_priv(ndev);
> +	struct phy_device *phy = priv->slaves[0].phy;
> +
> +	if (!netif_running(ndev))
> +		return -EINVAL;

This is consistent with other drivers, and I'm not going to object to
one more instance, but I don't think this is the proper error code - we
don't know that there's anything wrong with the arguments, it's just
that the *device* is in the wrong state.  I don't know what the right
error code is, though.  Really I would prefer that MDIO would work even
when the device is down, unless you know that all MDIO-manageable
devices are powered off.

Ben.

> +	if (!phy)
> +		return -ENODEV;
> +
> +	return phy_mii_ioctl(phy, rq, cmd);
> +}
> +
>  static void cpsw_ndo_tx_timeout(struct net_device *ndev)
>  {
>  	struct cpsw_priv *priv = netdev_priv(ndev);
> @@ -670,6 +684,7 @@ static const struct net_device_ops cpsw_netdev_ops = {
>  	.ndo_start_xmit		= cpsw_ndo_start_xmit,
>  	.ndo_change_rx_flags	= cpsw_ndo_change_rx_flags,
>  	.ndo_validate_addr	= eth_validate_addr,
> +	.ndo_do_ioctl		= cpsw_ndo_do_ioctl,
>  	.ndo_change_mtu		= eth_change_mtu,
>  	.ndo_tx_timeout		= cpsw_ndo_tx_timeout,
>  	.ndo_get_stats		= cpsw_ndo_get_stats,

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [patch net-next] net: call notifiers for mtu change even if iface is not up
From: Neil Horman @ 2012-12-03 14:18 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, edumazet, bhutchings, psimerda
In-Reply-To: <1354533392-9308-1-git-send-email-jiri@resnulli.us>

On Mon, Dec 03, 2012 at 12:16:32PM +0100, Jiri Pirko wrote:
> Do the same thing as in set mac. Call notifiers every time.
> 
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> ---
>  net/core/dev.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 2f94df2..0685a72 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4971,7 +4971,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
>  	else
>  		dev->mtu = new_mtu;
>  
> -	if (!err && dev->flags & IFF_UP)
> +	if (!err)
>  		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
>  	return err;
>  }

I'm not opposed to this change, but is there something that it expressly fixes?
While it doesn't hurt to send around mtu change events, one would presume that
listeners would pick up mtu changes when the NETDEV_UP event went around.

Neil

^ permalink raw reply

* Re: [patch net-next] net: call notifiers for mtu change even if iface is not up
From: Jiri Pirko @ 2012-12-03 14:22 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, davem, edumazet, bhutchings, psimerda
In-Reply-To: <20121203141823.GA21816@hmsreliant.think-freely.org>

Mon, Dec 03, 2012 at 03:18:23PM CET, nhorman@tuxdriver.com wrote:
>On Mon, Dec 03, 2012 at 12:16:32PM +0100, Jiri Pirko wrote:
>> Do the same thing as in set mac. Call notifiers every time.
>> 
>> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
>> ---
>>  net/core/dev.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 2f94df2..0685a72 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -4971,7 +4971,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
>>  	else
>>  		dev->mtu = new_mtu;
>>  
>> -	if (!err && dev->flags & IFF_UP)
>> +	if (!err)
>>  		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
>>  	return err;
>>  }
>
>I'm not opposed to this change, but is there something that it expressly fixes?

This is about consistency: to have the same behaviour as set_mac,
for example.

>While it doesn't hurt to send around mtu change events, one would presume that
>listeners would pick up mtu changes when the NETDEV_UP event went around.
>
>Neil
>
>--
>To unsubscribe from this list: send the line "unsubscribe netdev" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* RE: [PATCH v2] net/macb: Use non-coherent memory for rx buffers
From: David Laight @ 2012-12-03 14:25 UTC (permalink / raw)
  To: Nicolas Ferre
  Cc: David S. Miller, netdev, linux-arm-kernel, linux-kernel,
	Joachim Eastwood, Jean-Christophe PLAGNIOL-VILLARD,
	Havard Skinnemoen
In-Reply-To: <50BCA746.1020705@atmel.com>

> On 12/03/2012 01:43 PM, David Laight :
> >> Allocate regular pages to use as backing for the RX ring and use the
> >> DMA API to sync the caches. This should give a bit better performance
> >> since it allows the CPU to do burst transfers from memory. It is also
> >> a necessary step on the way to reduce the amount of copying done by
> >> the driver.
> >
> > I've not tried to understand the patches, but you have to be
> > very careful using non-snooped memory for descriptor rings.
> > No amount of DMA API calls can sort out some of the issues.
> 
> David,
> 
> Maybe I have not described the patch properly but the non-coherent
> memory is not used for descriptor rings. It is used for DMA buffers
> pointed out by descriptors (that are allocated as coherent memory).
> 
> As buffers are filled up by the interface DMA and then, afterwards, used
> by the driver to pass data to the net layer, it seems to me that the use
> of non-coherent memory is sensible.

Ah, ok - difficult to actually determine from a fast read of the code.
So you invalidate (I think that is the right term) all the cache lines
that are part of each rx buffer before giving it back to the MAC unit.
(Maybe that first time, and just those cache lines that might have been
written to after reception - I'd worry about whether the CRC is written
into the rx buffer!)

I was wondering if the code needs to do per page allocations?
Perhaps that is necessary to avoid needing a large block of
contiguous physical memory (and virtual addresses)?

I know from some experiments done many years ago that a data
copy in the MAC tx and rx path isn't necessarily as bad as
people may think - especially if it removes complicated
'buffer loaning' schemes and/or iommu setup (or bounce
buffers due to limited hardware memory addressing).

The rx copy can usually be made to be a 'whole word' copy
(i.e. you copy the two bytes of garbage that (mis)align the
destination MAC address, and some bytes after the CRC).
With some hardware I believe it is possible for the cache
controller to do cache-line aligned copies very quickly!
(Some very new x86 cpus might be doing this for 'rep movsd'.)

The copy in the rx path is also better for short packets
that can end up queued for userspace (although a copy in
the socket code would solve that one).

	David

^ permalink raw reply

* [PATCH net-next] bridge: implement multicast fast leave
From: Cong Wang @ 2012-12-03 14:36 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Stephen Hemminger, bridge, David S. Miller, Cong Wang

Fast leave allows the bridge to immediately stop the multicast
traffic on a port that receives an IGMP Leave when IGMP snooping is
enabled; no timeouts are observed.

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>

---

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index d53e4f4..05e0572 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1226,6 +1226,40 @@ static void br_multicast_leave_group(struct net_bridge *br,
 	if (!mp)
 		goto out;
 
+	if (br->multicast_fast_leave) {
+		struct net_bridge_port_group __rcu **pp;
+
+		if (!port) {
+			mp->mglist = false;
+
+			if (mp->ports)
+				goto out;
+
+			hlist_del_rcu(&mp->hlist[mdb->ver]);
+			mdb->size--;
+			del_timer(&mp->timer);
+			call_rcu_bh(&mp->rcu, br_multicast_free_group);
+			goto out;
+		}
+
+		for (pp = &mp->ports;
+		     (p = mlock_dereference(*pp, br)) != NULL;
+		     pp = &p->next) {
+			if (p->port != port)
+				continue;
+
+			rcu_assign_pointer(*pp, p->next);
+			hlist_del_init(&p->mglist);
+			del_timer(&p->timer);
+			call_rcu_bh(&p->rcu, br_multicast_free_pg);
+
+			if (!mp->ports && !mp->mglist &&
+			    netif_running(br->dev))
+				mod_timer(&mp->timer, jiffies);
+		}
+		goto out;
+	}
+
 	now = jiffies;
 	time = now + br->multicast_last_member_count *
 		     br->multicast_last_member_interval;
@@ -1567,6 +1601,7 @@ void br_multicast_init(struct net_bridge *br)
 	br->hash_max = 512;
 
 	br->multicast_router = 1;
+	br->multicast_fast_leave = 0;
 	br->multicast_querier = 0;
 	br->multicast_last_member_count = 2;
 	br->multicast_startup_query_count = 2;
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6484069..2f5f5b8 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -230,6 +230,7 @@ struct net_bridge
 
 	u8				multicast_disabled:1;
 	u8				multicast_querier:1;
+	u8				multicast_fast_leave:1;
 
 	u32				hash_elasticity;
 	u32				hash_max;
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 5913a3a..f88389f 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -375,6 +375,32 @@ static ssize_t store_multicast_snooping(struct device *d,
 static DEVICE_ATTR(multicast_snooping, S_IRUGO | S_IWUSR,
 		   show_multicast_snooping, store_multicast_snooping);
 
+static ssize_t show_multicast_fast_leave(struct device *d,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	struct net_bridge *br = to_bridge(d);
+	return sprintf(buf, "%d\n", br->multicast_fast_leave);
+}
+
+static int set_fast_leave(struct net_bridge *br, unsigned long val)
+{
+	if (br->multicast_disabled)
+		return -EINVAL;
+
+	br->multicast_fast_leave = !!val;
+	return 0;
+}
+
+static ssize_t store_multicast_fast_leave(struct device *d,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_fast_leave);
+}
+static DEVICE_ATTR(multicast_fast_leave, S_IRUGO | S_IWUSR,
+		   show_multicast_fast_leave, store_multicast_fast_leave);
+
 static ssize_t show_multicast_querier(struct device *d,
 				      struct device_attribute *attr,
 				      char *buf)
@@ -715,6 +741,7 @@ static struct attribute *bridge_attrs[] = {
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	&dev_attr_multicast_router.attr,
 	&dev_attr_multicast_snooping.attr,
+	&dev_attr_multicast_fast_leave.attr,
 	&dev_attr_multicast_querier.attr,
 	&dev_attr_hash_elasticity.attr,
 	&dev_attr_hash_max.attr,

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox