Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v3 net-next 06/15] net: sgi: ioc3-eth: get rid of ioc3_clean_rx_ring()
From: Thomas Bogendoerfer @ 2019-08-30  9:25 UTC (permalink / raw)
  To: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190830092539.24550-1-tbogendoerfer@suse.de>

Move clearing of the descriptor valid bit into ioc3_alloc_rings. This
makes ioc3_clean_rx_ring obsolete.

Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/sgi/ioc3-eth.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index e84239734338..fe77e4d7732f 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -761,26 +761,6 @@ static void ioc3_mii_start(struct ioc3_private *ip)
 	add_timer(&ip->ioc3_timer);
 }
 
-static inline void ioc3_clean_rx_ring(struct ioc3_private *ip)
-{
-	struct ioc3_erxbuf *rxb;
-	struct sk_buff *skb;
-	int i;
-
-	for (i = ip->rx_ci; i & 15; i++) {
-		ip->rx_skbs[ip->rx_pi] = ip->rx_skbs[ip->rx_ci];
-		ip->rxr[ip->rx_pi++] = ip->rxr[ip->rx_ci++];
-	}
-	ip->rx_pi &= RX_RING_MASK;
-	ip->rx_ci &= RX_RING_MASK;
-
-	for (i = ip->rx_ci; i != ip->rx_pi; i = (i + 1) & RX_RING_MASK) {
-		skb = ip->rx_skbs[i];
-		rxb = (struct ioc3_erxbuf *)(skb->data - RX_OFFSET);
-		rxb->w0 = 0;
-	}
-}
-
 static inline void ioc3_clean_tx_ring(struct ioc3_private *ip)
 {
 	struct sk_buff *skb;
@@ -838,6 +818,7 @@ static void ioc3_alloc_rings(struct net_device *dev)
 		/* Because we reserve afterwards. */
 		skb_put(skb, (1664 + RX_OFFSET));
 		rxb = (struct ioc3_erxbuf *)skb->data;
+		rxb->w0 = 0;	/* Clear valid flag */
 		ip->rxr[i] = cpu_to_be64(ioc3_map(rxb, 1));
 		skb_reserve(skb, RX_OFFSET);
 	}
@@ -857,7 +838,6 @@ static void ioc3_init_rings(struct net_device *dev)
 	ioc3_free_rings(ip);
 	ioc3_alloc_rings(dev);
 
-	ioc3_clean_rx_ring(ip);
 	ioc3_clean_tx_ring(ip);
 
 	/* Now the rx ring base, consume & produce registers.  */
-- 
2.13.7


^ permalink raw reply related

* [PATCH v3 net-next 11/15] net: sgi: ioc3-eth: use dma-direct for dma allocations
From: Thomas Bogendoerfer @ 2019-08-30  9:25 UTC (permalink / raw)
  To: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190830092539.24550-1-tbogendoerfer@suse.de>

Replace the homegrown DMA memory allocation, which only works on
SGI-IP27 machines, with the generic dma allocations.

Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/sgi/ioc3-eth.c | 146 ++++++++++++++++++++++++++++--------
 1 file changed, 113 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index 7531944d2e95..ed8f997a3cec 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -36,7 +36,6 @@
 #include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
-#include <linux/dma-mapping.h>
 #include <linux/gfp.h>
 
 #ifdef CONFIG_SERIAL_8250
@@ -49,6 +48,8 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/skbuff.h>
+#include <linux/dma-direct.h>
+
 #include <net/ip.h>
 
 #include <asm/byteorder.h>
@@ -64,10 +65,12 @@
 #define RX_BUFFS		64
 #define RX_RING_ENTRIES		512		/* fixed in hardware */
 #define RX_RING_MASK		(RX_RING_ENTRIES - 1)
+#define RX_RING_SIZE		(RX_RING_ENTRIES * sizeof(u64))
 
 /* 128 TX buffers (not tunable) */
 #define TX_RING_ENTRIES		128
 #define TX_RING_MASK		(TX_RING_ENTRIES - 1)
+#define TX_RING_SIZE		(TX_RING_ENTRIES * sizeof(struct ioc3_etxd))
 
 /* IOC3 does dma transfers in 128 byte blocks */
 #define IOC3_DMA_XFER_LEN	128UL
@@ -83,9 +86,12 @@
 struct ioc3_private {
 	struct ioc3_ethregs *regs;
 	struct ioc3 *all_regs;
+	struct device *dma_dev;
 	u32 *ssram;
 	unsigned long *rxr;		/* pointer to receiver ring */
 	struct ioc3_etxd *txr;
+	dma_addr_t rxr_dma;
+	dma_addr_t txr_dma;
 	struct sk_buff *rx_skbs[RX_RING_ENTRIES];
 	struct sk_buff *tx_skbs[TX_RING_ENTRIES];
 	int rx_ci;			/* RX consumer index */
@@ -125,9 +131,11 @@ static inline unsigned long aligned_rx_skb_addr(unsigned long addr)
 	return (~addr + 1) & (IOC3_DMA_XFER_LEN - 1UL);
 }
 
-static inline int ioc3_alloc_skb(struct sk_buff **skb, struct ioc3_erxbuf **rxb)
+static inline int ioc3_alloc_skb(struct ioc3_private *ip, struct sk_buff **skb,
+				 struct ioc3_erxbuf **rxb, dma_addr_t *rxb_dma)
 {
 	struct sk_buff *new_skb;
+	dma_addr_t d;
 	int offset;
 
 	new_skb = alloc_skb(RX_BUF_SIZE + IOC3_DMA_XFER_LEN - 1, GFP_ATOMIC);
@@ -139,6 +147,14 @@ static inline int ioc3_alloc_skb(struct sk_buff **skb, struct ioc3_erxbuf **rxb)
 	if (offset)
 		skb_reserve(new_skb, offset);
 
+	d = dma_map_single(ip->dma_dev, new_skb->data,
+			   RX_BUF_SIZE, DMA_FROM_DEVICE);
+
+	if (dma_mapping_error(ip->dma_dev, d)) {
+		dev_kfree_skb_any(new_skb);
+		return -ENOMEM;
+	}
+	*rxb_dma = d;
 	*rxb = (struct ioc3_erxbuf *)new_skb->data;
 	skb_reserve(new_skb, RX_OFFSET);
 	*skb = new_skb;
@@ -146,17 +162,22 @@ static inline int ioc3_alloc_skb(struct sk_buff **skb, struct ioc3_erxbuf **rxb)
 	return 0;
 }
 
-static inline unsigned long ioc3_map(void *ptr, unsigned long vdev)
+#ifdef CONFIG_PCI_XTALK_BRIDGE
+static inline unsigned long ioc3_map(dma_addr_t addr, unsigned long attr)
 {
-#ifdef CONFIG_SGI_IP27
-	vdev <<= 57;   /* Shift to PCI64_ATTR_VIRTUAL */
+	return (addr & ~PCI64_ATTR_BAR) | attr;
+}
 
-	return vdev | (0xaUL << PCI64_ATTR_TARG_SHFT) | PCI64_ATTR_PREF |
-	       ((unsigned long)ptr & TO_PHYS_MASK);
+#define ERBAR_VAL	(ERBAR_BARRIER_BIT << ERBAR_RXBARR_SHIFT)
 #else
-	return virt_to_bus(ptr);
-#endif
+static inline unsigned long ioc3_map(dma_addr_t addr, unsigned long attr)
+{
+	return addr;
 }
+
+#define ERBAR_VAL	0
+#endif
+
 #define IOC3_SIZE 0x100000
 
 static inline u32 mcr_pack(u32 pulse, u32 sample)
@@ -523,6 +544,7 @@ static inline void ioc3_rx(struct net_device *dev)
 	int rx_entry, n_entry, len;
 	struct ioc3_erxbuf *rxb;
 	unsigned long *rxr;
+	dma_addr_t d;
 	u32 w0, err;
 
 	rxr = ip->rxr;		/* Ring base */
@@ -540,12 +562,13 @@ static inline void ioc3_rx(struct net_device *dev)
 			skb_put(skb, len);
 			skb->protocol = eth_type_trans(skb, dev);
 
-			if (ioc3_alloc_skb(&new_skb, &rxb)) {
+			if (ioc3_alloc_skb(ip, &new_skb, &rxb, &d)) {
 				/* Ouch, drop packet and just recycle packet
 				 * to keep the ring filled.
 				 */
 				dev->stats.rx_dropped++;
 				new_skb = skb;
+				d = rxr[rx_entry];
 				goto next;
 			}
 
@@ -554,6 +577,9 @@ static inline void ioc3_rx(struct net_device *dev)
 						     w0 & ERXBUF_IPCKSUM_MASK,
 						     len);
 
+			dma_unmap_single(ip->dma_dev, rxr[rx_entry],
+					 RX_BUF_SIZE, DMA_FROM_DEVICE);
+
 			netif_rx(skb);
 
 			ip->rx_skbs[rx_entry] = NULL;	/* Poison  */
@@ -566,15 +592,17 @@ static inline void ioc3_rx(struct net_device *dev)
 			 * recycle it.
 			 */
 			new_skb = skb;
+			d = rxr[rx_entry];
 			dev->stats.rx_errors++;
 		}
 		if (err & ERXBUF_CRCERR)	/* Statistics */
 			dev->stats.rx_crc_errors++;
 		if (err & ERXBUF_FRAMERR)
 			dev->stats.rx_frame_errors++;
+
 next:
 		ip->rx_skbs[n_entry] = new_skb;
-		rxr[n_entry] = cpu_to_be64(ioc3_map(rxb, 1));
+		rxr[n_entry] = cpu_to_be64(ioc3_map(d, PCI64_ATTR_BAR));
 		rxb->w0 = 0;				/* Clear valid flag */
 		n_entry = (n_entry + 1) & RX_RING_MASK;	/* Update erpir */
 
@@ -767,6 +795,26 @@ static void ioc3_mii_start(struct ioc3_private *ip)
 	add_timer(&ip->ioc3_timer);
 }
 
+static inline void ioc3_tx_unmap(struct ioc3_private *ip, int entry)
+{
+	struct ioc3_etxd *desc;
+	u32 cmd, bufcnt, len;
+
+	desc = &ip->txr[entry];
+	cmd = be32_to_cpu(desc->cmd);
+	bufcnt = be32_to_cpu(desc->bufcnt);
+	if (cmd & ETXD_B1V) {
+		len = (bufcnt & ETXD_B1CNT_MASK) >> ETXD_B1CNT_SHIFT;
+		dma_unmap_single(ip->dma_dev, be64_to_cpu(desc->p1),
+				 len, DMA_TO_DEVICE);
+	}
+	if (cmd & ETXD_B2V) {
+		len = (bufcnt & ETXD_B2CNT_MASK) >> ETXD_B2CNT_SHIFT;
+		dma_unmap_single(ip->dma_dev, be64_to_cpu(desc->p2),
+				 len, DMA_TO_DEVICE);
+	}
+}
+
 static inline void ioc3_clean_tx_ring(struct ioc3_private *ip)
 {
 	struct sk_buff *skb;
@@ -775,6 +823,7 @@ static inline void ioc3_clean_tx_ring(struct ioc3_private *ip)
 	for (i = 0; i < TX_RING_ENTRIES; i++) {
 		skb = ip->tx_skbs[i];
 		if (skb) {
+			ioc3_tx_unmap(ip, i);
 			ip->tx_skbs[i] = NULL;
 			dev_kfree_skb_any(skb);
 		}
@@ -787,13 +836,19 @@ static inline void ioc3_clean_tx_ring(struct ioc3_private *ip)
 static void ioc3_free_rx_bufs(struct ioc3_private *ip)
 {
 	int rx_entry, n_entry;
+	struct sk_buff *skb;
 
 	n_entry = ip->rx_ci;
 	rx_entry = ip->rx_pi;
 
 	while (n_entry != rx_entry) {
-		dev_kfree_skb_any(ip->rx_skbs[n_entry]);
-
+		skb = ip->rx_skbs[n_entry];
+		if (skb) {
+			dma_unmap_single(ip->dma_dev,
+					 be64_to_cpu(ip->rxr[n_entry]),
+					 RX_BUF_SIZE, DMA_FROM_DEVICE);
+			dev_kfree_skb_any(skb);
+		}
 		n_entry = (n_entry + 1) & RX_RING_MASK;
 	}
 }
@@ -802,6 +857,7 @@ static int ioc3_alloc_rx_bufs(struct net_device *dev)
 {
 	struct ioc3_private *ip = netdev_priv(dev);
 	struct ioc3_erxbuf *rxb;
+	dma_addr_t d;
 	int i;
 
 	/* Now the rx buffers.  The RX ring may be larger but
@@ -809,11 +865,11 @@ static int ioc3_alloc_rx_bufs(struct net_device *dev)
 	 * this for performance and memory later.
 	 */
 	for (i = 0; i < RX_BUFFS; i++) {
-		if (ioc3_alloc_skb(&ip->rx_skbs[i], &rxb))
+		if (ioc3_alloc_skb(ip, &ip->rx_skbs[i], &rxb, &d))
 			return -ENOMEM;
 
 		rxb->w0 = 0;	/* Clear valid flag */
-		ip->rxr[i] = cpu_to_be64(ioc3_map(rxb, 1));
+		ip->rxr[i] = cpu_to_be64(ioc3_map(d, PCI64_ATTR_BAR));
 	}
 	ip->rx_ci = 0;
 	ip->rx_pi = RX_BUFFS;
@@ -859,13 +915,7 @@ static void ioc3_init(struct net_device *dev)
 	readl(&regs->emcr);
 
 	/* Misc registers  */
-#ifdef CONFIG_SGI_IP27
-	/* Barrier on last store */
-	writel(PCI64_ATTR_BAR >> 32, &regs->erbar);
-#else
-	/* Let PCI API get it right */
-	writel(0, &regs->erbar);
-#endif
+	writel(ERBAR_VAL, &regs->erbar);
 	readl(&regs->etcdc);			/* Clear on read */
 	writel(15, &regs->ercsr);		/* RX low watermark  */
 	writel(0, &regs->ertr);			/* Interrupt immediately */
@@ -881,13 +931,13 @@ static void ioc3_start(struct ioc3_private *ip)
 	unsigned long ring;
 
 	/* Now the rx ring base, consume & produce registers.  */
-	ring = ioc3_map(ip->rxr, 0);
+	ring = ioc3_map(ip->rxr_dma, PCI64_ATTR_PREC);
 	writel(ring >> 32, &regs->erbr_h);
 	writel(ring & 0xffffffff, &regs->erbr_l);
 	writel(ip->rx_ci << 3, &regs->ercir);
 	writel((ip->rx_pi << 3) | ERPIR_ARM, &regs->erpir);
 
-	ring = ioc3_map(ip->txr, 0);
+	ring = ioc3_map(ip->txr_dma, PCI64_ATTR_PREC);
 
 	ip->txqlen = 0;					/* nothing queued  */
 
@@ -1161,6 +1211,7 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	ip = netdev_priv(dev);
 	ip->dev = dev;
+	ip->dma_dev = &pdev->dev;
 
 	dev->irq = pdev->irq;
 
@@ -1187,7 +1238,8 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	ioc3_stop(ip);
 
 	/* Allocate rx ring.  4kb = 512 entries, must be 4kb aligned */
-	ip->rxr = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+	ip->rxr = dma_direct_alloc_pages(ip->dma_dev, RX_RING_SIZE,
+					 &ip->rxr_dma, GFP_ATOMIC, 0);
 	if (!ip->rxr) {
 		pr_err("ioc3-eth: rx ring allocation failed\n");
 		err = -ENOMEM;
@@ -1195,7 +1247,9 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	/* Allocate tx rings.  16kb = 128 bufs, must be 16kb aligned  */
-	ip->txr = (struct ioc3_etxd *)__get_free_pages(GFP_KERNEL, 2);
+	ip->txr = dma_direct_alloc_pages(ip->dma_dev, TX_RING_SIZE,
+					 &ip->txr_dma,
+					 GFP_KERNEL | __GFP_ZERO, 0);
 	if (!ip->txr) {
 		pr_err("ioc3-eth: tx ring allocation failed\n");
 		err = -ENOMEM;
@@ -1255,9 +1309,11 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 out_stop:
 	del_timer_sync(&ip->ioc3_timer);
 	if (ip->rxr)
-		free_page((unsigned long)ip->rxr);
+		dma_direct_free_pages(ip->dma_dev, RX_RING_SIZE, ip->rxr,
+				      ip->rxr_dma, 0);
 	if (ip->txr)
-		free_pages((unsigned long)ip->txr, 2);
+		dma_direct_free_pages(ip->dma_dev, TX_RING_SIZE, ip->txr,
+				      ip->txr_dma, 0);
 out_res:
 	pci_release_regions(pdev);
 out_free:
@@ -1275,8 +1331,10 @@ static void ioc3_remove_one(struct pci_dev *pdev)
 	struct net_device *dev = pci_get_drvdata(pdev);
 	struct ioc3_private *ip = netdev_priv(dev);
 
-	free_page((unsigned long)ip->rxr);
-	free_pages((unsigned long)ip->txr, 2);
+	dma_direct_free_pages(ip->dma_dev, RX_RING_SIZE, ip->rxr,
+			      ip->rxr_dma, 0);
+	dma_direct_free_pages(ip->dma_dev, TX_RING_SIZE, ip->txr,
+			      ip->txr_dma, 0);
 
 	unregister_netdev(dev);
 	del_timer_sync(&ip->ioc3_timer);
@@ -1382,18 +1440,32 @@ static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		unsigned long b2 = (data | 0x3fffUL) + 1UL;
 		unsigned long s1 = b2 - data;
 		unsigned long s2 = data + len - b2;
+		dma_addr_t d1, d2;
 
 		desc->cmd    = cpu_to_be32(len | ETXD_INTWHENDONE |
 					   ETXD_B1V | ETXD_B2V | w0);
 		desc->bufcnt = cpu_to_be32((s1 << ETXD_B1CNT_SHIFT) |
 					   (s2 << ETXD_B2CNT_SHIFT));
-		desc->p1     = cpu_to_be64(ioc3_map(skb->data, 1));
-		desc->p2     = cpu_to_be64(ioc3_map((void *)b2, 1));
+		d1 = dma_map_single(ip->dma_dev, skb->data, s1, DMA_TO_DEVICE);
+		if (dma_mapping_error(ip->dma_dev, d1))
+			goto drop_packet;
+		d2 = dma_map_single(ip->dma_dev, (void *)b2, s1, DMA_TO_DEVICE);
+		if (dma_mapping_error(ip->dma_dev, d2)) {
+			dma_unmap_single(ip->dma_dev, d1, len, DMA_TO_DEVICE);
+			goto drop_packet;
+		}
+		desc->p1     = cpu_to_be64(ioc3_map(d1, PCI64_ATTR_PREF));
+		desc->p2     = cpu_to_be64(ioc3_map(d2, PCI64_ATTR_PREF));
 	} else {
+		dma_addr_t d;
+
 		/* Normal sized packet that doesn't cross a page boundary. */
 		desc->cmd = cpu_to_be32(len | ETXD_INTWHENDONE | ETXD_B1V | w0);
 		desc->bufcnt = cpu_to_be32(len << ETXD_B1CNT_SHIFT);
-		desc->p1     = cpu_to_be64(ioc3_map(skb->data, 1));
+		d = dma_map_single(ip->dma_dev, skb->data, len, DMA_TO_DEVICE);
+		if (dma_mapping_error(ip->dma_dev, d))
+			goto drop_packet;
+		desc->p1     = cpu_to_be64(ioc3_map(d, PCI64_ATTR_PREF));
 	}
 
 	mb(); /* make sure all descriptor changes are visible */
@@ -1411,6 +1483,14 @@ static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	spin_unlock_irq(&ip->ioc3_lock);
 
 	return NETDEV_TX_OK;
+
+drop_packet:
+	dev_kfree_skb_any(skb);
+	dev->stats.tx_dropped++;
+
+	spin_unlock_irq(&ip->ioc3_lock);
+
+	return NETDEV_TX_OK;
 }
 
 static void ioc3_timeout(struct net_device *dev)
-- 
2.13.7


^ permalink raw reply related

* [PATCH v3 net-next 15/15] net: sgi: ioc3-eth: no need to stop queue set_multicast_list
From: Thomas Bogendoerfer @ 2019-08-30  9:25 UTC (permalink / raw)
  To: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190830092539.24550-1-tbogendoerfer@suse.de>

netif_stop_queue()/netif_wake_qeue() aren't needed for changing
multicast filters.

Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/sgi/ioc3-eth.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index 963ed0f9787c..deb636d653f3 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -1627,8 +1627,6 @@ static void ioc3_set_multicast_list(struct net_device *dev)
 	struct netdev_hw_addr *ha;
 	u64 ehar = 0;
 
-	netif_stop_queue(dev);				/* Lock out others. */
-
 	spin_lock_irq(&ip->ioc3_lock);
 
 	if (dev->flags & IFF_PROMISC) {			/* Set promiscuous.  */
@@ -1660,8 +1658,6 @@ static void ioc3_set_multicast_list(struct net_device *dev)
 	}
 
 	spin_unlock_irq(&ip->ioc3_lock);
-
-	netif_wake_queue(dev);			/* Let us get going again. */
 }
 
 module_pci_driver(ioc3_driver);
-- 
2.13.7


^ permalink raw reply related

* [PATCH v3 net-next 14/15] net: sgi: ioc3-eth: protect emcr in all cases
From: Thomas Bogendoerfer @ 2019-08-30  9:25 UTC (permalink / raw)
  To: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190830092539.24550-1-tbogendoerfer@suse.de>

emcr in private struct wasn't always protected by spinlock.

Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/sgi/ioc3-eth.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index 971986433d4c..963ed0f9787c 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -729,6 +729,8 @@ static inline void ioc3_setup_duplex(struct ioc3_private *ip)
 {
 	struct ioc3_ethregs *regs = ip->regs;
 
+	spin_lock_irq(&ip->ioc3_lock);
+
 	if (ip->mii.full_duplex) {
 		writel(ETCSR_FD, &regs->etcsr);
 		ip->emcr |= EMCR_DUPLEX;
@@ -737,6 +739,8 @@ static inline void ioc3_setup_duplex(struct ioc3_private *ip)
 		ip->emcr &= ~EMCR_DUPLEX;
 	}
 	writel(ip->emcr, &regs->emcr);
+
+	spin_unlock_irq(&ip->ioc3_lock);
 }
 
 static void ioc3_timer(struct timer_list *t)
@@ -1625,6 +1629,8 @@ static void ioc3_set_multicast_list(struct net_device *dev)
 
 	netif_stop_queue(dev);				/* Lock out others. */
 
+	spin_lock_irq(&ip->ioc3_lock);
+
 	if (dev->flags & IFF_PROMISC) {			/* Set promiscuous.  */
 		ip->emcr |= EMCR_PROMISC;
 		writel(ip->emcr, &regs->emcr);
@@ -1653,6 +1659,8 @@ static void ioc3_set_multicast_list(struct net_device *dev)
 		writel(ip->ehar_l, &regs->ehar_l);
 	}
 
+	spin_unlock_irq(&ip->ioc3_lock);
+
 	netif_wake_queue(dev);			/* Let us get going again. */
 }
 
-- 
2.13.7


^ permalink raw reply related

* [PATCH v3 net-next 04/15] net: sgi: ioc3-eth: use defines for constants dealing with desc rings
From: Thomas Bogendoerfer @ 2019-08-30  9:25 UTC (permalink / raw)
  To: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190830092539.24550-1-tbogendoerfer@suse.de>

Descriptor ring sizes of the IOC3 are more or less fixed size. To
make clearer where there is a relation to ring sizes use defines.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/sgi/ioc3-eth.c | 42 +++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index 51cc1389e204..ba18a53fbbe6 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -61,10 +61,16 @@
 #include <asm/sn/ioc3.h>
 #include <asm/pci/bridge.h>
 
-/* 64 RX buffers.  This is tunable in the range of 16 <= x < 512.  The
- * value must be a power of two.
+/* Number of RX buffers.  This is tunable in the range of 16 <= x < 512.
+ * The value must be a power of two.
  */
-#define RX_BUFFS 64
+#define RX_BUFFS		64
+#define RX_RING_ENTRIES		512		/* fixed in hardware */
+#define RX_RING_MASK		(RX_RING_ENTRIES - 1)
+
+/* 128 TX buffers (not tunable) */
+#define TX_RING_ENTRIES		128
+#define TX_RING_MASK		(TX_RING_ENTRIES - 1)
 
 #define ETCSR_FD   ((17 << ETCSR_IPGR2_SHIFT) | (11 << ETCSR_IPGR1_SHIFT) | 21)
 #define ETCSR_HD   ((21 << ETCSR_IPGR2_SHIFT) | (21 << ETCSR_IPGR1_SHIFT) | 21)
@@ -76,8 +82,8 @@ struct ioc3_private {
 	u32 *ssram;
 	unsigned long *rxr;		/* pointer to receiver ring */
 	struct ioc3_etxd *txr;
-	struct sk_buff *rx_skbs[512];
-	struct sk_buff *tx_skbs[128];
+	struct sk_buff *rx_skbs[RX_RING_ENTRIES];
+	struct sk_buff *tx_skbs[TX_RING_ENTRIES];
 	int rx_ci;			/* RX consumer index */
 	int rx_pi;			/* RX producer index */
 	int tx_ci;			/* TX consumer index */
@@ -573,10 +579,10 @@ static inline void ioc3_rx(struct net_device *dev)
 		ip->rx_skbs[n_entry] = new_skb;
 		rxr[n_entry] = cpu_to_be64(ioc3_map(rxb, 1));
 		rxb->w0 = 0;				/* Clear valid flag */
-		n_entry = (n_entry + 1) & 511;		/* Update erpir */
+		n_entry = (n_entry + 1) & RX_RING_MASK;	/* Update erpir */
 
 		/* Now go on to the next ring entry.  */
-		rx_entry = (rx_entry + 1) & 511;
+		rx_entry = (rx_entry + 1) & RX_RING_MASK;
 		skb = ip->rx_skbs[rx_entry];
 		rxb = (struct ioc3_erxbuf *)(skb->data - RX_OFFSET);
 		w0 = be32_to_cpu(rxb->w0);
@@ -598,7 +604,7 @@ static inline void ioc3_tx(struct net_device *dev)
 	spin_lock(&ip->ioc3_lock);
 	etcir = readl(&regs->etcir);
 
-	tx_entry = (etcir >> 7) & 127;
+	tx_entry = (etcir >> 7) & TX_RING_MASK;
 	o_entry = ip->tx_ci;
 	packets = 0;
 	bytes = 0;
@@ -610,17 +616,17 @@ static inline void ioc3_tx(struct net_device *dev)
 		dev_consume_skb_irq(skb);
 		ip->tx_skbs[o_entry] = NULL;
 
-		o_entry = (o_entry + 1) & 127;		/* Next */
+		o_entry = (o_entry + 1) & TX_RING_MASK;	/* Next */
 
 		etcir = readl(&regs->etcir);		/* More pkts sent?  */
-		tx_entry = (etcir >> 7) & 127;
+		tx_entry = (etcir >> 7) & TX_RING_MASK;
 	}
 
 	dev->stats.tx_packets += packets;
 	dev->stats.tx_bytes += bytes;
 	ip->txqlen -= packets;
 
-	if (ip->txqlen < 128)
+	if (netif_queue_stopped(dev) && ip->txqlen < TX_RING_ENTRIES)
 		netif_wake_queue(dev);
 
 	ip->tx_ci = o_entry;
@@ -765,10 +771,10 @@ static inline void ioc3_clean_rx_ring(struct ioc3_private *ip)
 		ip->rx_skbs[ip->rx_pi] = ip->rx_skbs[ip->rx_ci];
 		ip->rxr[ip->rx_pi++] = ip->rxr[ip->rx_ci++];
 	}
-	ip->rx_pi &= 511;
-	ip->rx_ci &= 511;
+	ip->rx_pi &= RX_RING_MASK;
+	ip->rx_ci &= RX_RING_MASK;
 
-	for (i = ip->rx_ci; i != ip->rx_pi; i = (i + 1) & 511) {
+	for (i = ip->rx_ci; i != ip->rx_pi; i = (i + 1) & RX_RING_MASK) {
 		skb = ip->rx_skbs[i];
 		rxb = (struct ioc3_erxbuf *)(skb->data - RX_OFFSET);
 		rxb->w0 = 0;
@@ -780,7 +786,7 @@ static inline void ioc3_clean_tx_ring(struct ioc3_private *ip)
 	struct sk_buff *skb;
 	int i;
 
-	for (i = 0; i < 128; i++) {
+	for (i = 0; i < TX_RING_ENTRIES; i++) {
 		skb = ip->tx_skbs[i];
 		if (skb) {
 			ip->tx_skbs[i] = NULL;
@@ -812,7 +818,7 @@ static void ioc3_free_rings(struct ioc3_private *ip)
 			if (skb)
 				dev_kfree_skb_any(skb);
 
-			n_entry = (n_entry + 1) & 511;
+			n_entry = (n_entry + 1) & RX_RING_MASK;
 		}
 		free_page((unsigned long)ip->rxr);
 		ip->rxr = NULL;
@@ -1425,13 +1431,13 @@ static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	mb(); /* make sure all descriptor changes are visible */
 
 	ip->tx_skbs[produce] = skb;			/* Remember skb */
-	produce = (produce + 1) & 127;
+	produce = (produce + 1) & TX_RING_MASK;
 	ip->tx_pi = produce;
 	writel(produce << 7, &ip->regs->etpir);		/* Fire ... */
 
 	ip->txqlen++;
 
-	if (ip->txqlen >= 127)
+	if (ip->txqlen >= (TX_RING_ENTRIES - 1))
 		netif_stop_queue(dev);
 
 	spin_unlock_irq(&ip->ioc3_lock);
-- 
2.13.7


^ permalink raw reply related

* [PATCH v3 net-next 10/15] net: sgi: ioc3-eth: refactor rx buffer allocation
From: Thomas Bogendoerfer @ 2019-08-30  9:25 UTC (permalink / raw)
  To: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190830092539.24550-1-tbogendoerfer@suse.de>

Move common code for rx buffer setup into ioc3_alloc_skb and deal
with allocation failures. Also clean up allocation size calculation.

Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/sgi/ioc3-eth.c | 95 ++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 50 deletions(-)

diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index fe8ee8f1c71f..7531944d2e95 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -11,11 +11,8 @@
  *
  * To do:
  *
- *  o Handle allocation failures in ioc3_alloc_skb() more gracefully.
- *  o Handle allocation failures in ioc3_init_rings().
  *  o Use prefetching for large packets.  What is a good lower limit for
  *    prefetching?
- *  o We're probably allocating a bit too much memory.
  *  o Use hardware checksums.
  *  o Convert to using a IOC3 meta driver.
  *  o Which PHYs might possibly be attached to the IOC3 in real live,
@@ -72,6 +69,13 @@
 #define TX_RING_ENTRIES		128
 #define TX_RING_MASK		(TX_RING_ENTRIES - 1)
 
+/* IOC3 does dma transfers in 128 byte blocks */
+#define IOC3_DMA_XFER_LEN	128UL
+
+/* Every RX buffer starts with 8 byte descriptor data */
+#define RX_OFFSET		(sizeof(struct ioc3_erxbuf) + NET_IP_ALIGN)
+#define RX_BUF_SIZE		(13 * IOC3_DMA_XFER_LEN)
+
 #define ETCSR_FD   ((17 << ETCSR_IPGR2_SHIFT) | (11 << ETCSR_IPGR1_SHIFT) | 21)
 #define ETCSR_HD   ((21 << ETCSR_IPGR2_SHIFT) | (21 << ETCSR_IPGR1_SHIFT) | 21)
 
@@ -108,36 +112,38 @@ static inline unsigned int ioc3_hash(const unsigned char *addr);
 static void ioc3_start(struct ioc3_private *ip);
 static inline void ioc3_stop(struct ioc3_private *ip);
 static void ioc3_init(struct net_device *dev);
-static void ioc3_alloc_rx_bufs(struct net_device *dev);
+static int ioc3_alloc_rx_bufs(struct net_device *dev);
 static void ioc3_free_rx_bufs(struct ioc3_private *ip);
 static inline void ioc3_clean_tx_ring(struct ioc3_private *ip);
 
 static const char ioc3_str[] = "IOC3 Ethernet";
 static const struct ethtool_ops ioc3_ethtool_ops;
 
-/* We use this to acquire receive skb's that we can DMA directly into. */
-
-#define IOC3_CACHELINE	128UL
 
 static inline unsigned long aligned_rx_skb_addr(unsigned long addr)
 {
-	return (~addr + 1) & (IOC3_CACHELINE - 1UL);
+	return (~addr + 1) & (IOC3_DMA_XFER_LEN - 1UL);
 }
 
-static inline struct sk_buff *ioc3_alloc_skb(unsigned long length,
-					     unsigned int gfp_mask)
+static inline int ioc3_alloc_skb(struct sk_buff **skb, struct ioc3_erxbuf **rxb)
 {
-	struct sk_buff *skb;
+	struct sk_buff *new_skb;
+	int offset;
 
-	skb = alloc_skb(length + IOC3_CACHELINE - 1, gfp_mask);
-	if (likely(skb)) {
-		int offset = aligned_rx_skb_addr((unsigned long)skb->data);
+	new_skb = alloc_skb(RX_BUF_SIZE + IOC3_DMA_XFER_LEN - 1, GFP_ATOMIC);
+	if (!new_skb)
+		return -ENOMEM;
 
-		if (offset)
-			skb_reserve(skb, offset);
-	}
+	/* ensure buffer is aligned to IOC3_DMA_XFER_LEN */
+	offset = aligned_rx_skb_addr((unsigned long)new_skb->data);
+	if (offset)
+		skb_reserve(new_skb, offset);
+
+	*rxb = (struct ioc3_erxbuf *)new_skb->data;
+	skb_reserve(new_skb, RX_OFFSET);
+	*skb = new_skb;
 
-	return skb;
+	return 0;
 }
 
 static inline unsigned long ioc3_map(void *ptr, unsigned long vdev)
@@ -151,13 +157,6 @@ static inline unsigned long ioc3_map(void *ptr, unsigned long vdev)
 	return virt_to_bus(ptr);
 #endif
 }
-
-/* BEWARE: The IOC3 documentation documents the size of rx buffers as
- * 1644 while it's actually 1664.  This one was nasty to track down ...
- */
-#define RX_OFFSET		10
-#define RX_BUF_ALLOC_SIZE	(1664 + RX_OFFSET + IOC3_CACHELINE)
-
 #define IOC3_SIZE 0x100000
 
 static inline u32 mcr_pack(u32 pulse, u32 sample)
@@ -538,11 +537,10 @@ static inline void ioc3_rx(struct net_device *dev)
 		err = be32_to_cpu(rxb->err);		/* It's valid ...  */
 		if (err & ERXBUF_GOODPKT) {
 			len = ((w0 >> ERXBUF_BYTECNT_SHIFT) & 0x7ff) - 4;
-			skb_trim(skb, len);
+			skb_put(skb, len);
 			skb->protocol = eth_type_trans(skb, dev);
 
-			new_skb = ioc3_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
-			if (!new_skb) {
+			if (ioc3_alloc_skb(&new_skb, &rxb)) {
 				/* Ouch, drop packet and just recycle packet
 				 * to keep the ring filled.
 				 */
@@ -560,11 +558,6 @@ static inline void ioc3_rx(struct net_device *dev)
 
 			ip->rx_skbs[rx_entry] = NULL;	/* Poison  */
 
-			/* Because we reserve afterwards. */
-			skb_put(new_skb, (1664 + RX_OFFSET));
-			rxb = (struct ioc3_erxbuf *)new_skb->data;
-			skb_reserve(new_skb, RX_OFFSET);
-
 			dev->stats.rx_packets++;		/* Statistics */
 			dev->stats.rx_bytes += len;
 		} else {
@@ -667,7 +660,11 @@ static void ioc3_error(struct net_device *dev, u32 eisr)
 	ioc3_clean_tx_ring(ip);
 
 	ioc3_init(dev);
-	ioc3_alloc_rx_bufs(dev);
+	if (ioc3_alloc_rx_bufs(dev)) {
+		netdev_err(dev, "%s: rx buffer allocation failed\n", __func__);
+		spin_unlock(&ip->ioc3_lock);
+		return;
+	}
 	ioc3_start(ip);
 	ioc3_mii_init(ip);
 
@@ -801,7 +798,7 @@ static void ioc3_free_rx_bufs(struct ioc3_private *ip)
 	}
 }
 
-static void ioc3_alloc_rx_bufs(struct net_device *dev)
+static int ioc3_alloc_rx_bufs(struct net_device *dev)
 {
 	struct ioc3_private *ip = netdev_priv(dev);
 	struct ioc3_erxbuf *rxb;
@@ -812,25 +809,16 @@ static void ioc3_alloc_rx_bufs(struct net_device *dev)
 	 * this for performance and memory later.
 	 */
 	for (i = 0; i < RX_BUFFS; i++) {
-		struct sk_buff *skb;
-
-		skb = ioc3_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
-		if (!skb) {
-			show_free_areas(0, NULL);
-			continue;
-		}
-
-		ip->rx_skbs[i] = skb;
+		if (ioc3_alloc_skb(&ip->rx_skbs[i], &rxb))
+			return -ENOMEM;
 
-		/* Because we reserve afterwards. */
-		skb_put(skb, (1664 + RX_OFFSET));
-		rxb = (struct ioc3_erxbuf *)skb->data;
 		rxb->w0 = 0;	/* Clear valid flag */
 		ip->rxr[i] = cpu_to_be64(ioc3_map(rxb, 1));
-		skb_reserve(skb, RX_OFFSET);
 	}
 	ip->rx_ci = 0;
 	ip->rx_pi = RX_BUFFS;
+
+	return 0;
 }
 
 static inline void ioc3_ssram_disc(struct ioc3_private *ip)
@@ -942,7 +930,10 @@ static int ioc3_open(struct net_device *dev)
 	ip->ehar_l = 0;
 
 	ioc3_init(dev);
-	ioc3_alloc_rx_bufs(dev);
+	if (ioc3_alloc_rx_bufs(dev)) {
+		netdev_err(dev, "%s: rx buffer allocation failed\n", __func__);
+		return -ENOMEM;
+	}
 	ioc3_start(ip);
 	ioc3_mii_start(ip);
 
@@ -1435,7 +1426,11 @@ static void ioc3_timeout(struct net_device *dev)
 	ioc3_clean_tx_ring(ip);
 
 	ioc3_init(dev);
-	ioc3_alloc_rx_bufs(dev);
+	if (ioc3_alloc_rx_bufs(dev)) {
+		netdev_err(dev, "%s: rx buffer allocation failed\n", __func__);
+		spin_unlock_irq(&ip->ioc3_lock);
+		return;
+	}
 	ioc3_start(ip);
 	ioc3_mii_init(ip);
 	ioc3_mii_start(ip);
-- 
2.13.7


^ permalink raw reply related

* [PATCH v3 net-next 13/15] net: sgi: ioc3-eth: Fix IPG settings
From: Thomas Bogendoerfer @ 2019-08-30  9:25 UTC (permalink / raw)
  To: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190830092539.24550-1-tbogendoerfer@suse.de>

The half/full duplex settings for inter packet gap counters/timer were
reversed.

Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/sgi/ioc3-eth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index 05f4b598114c..971986433d4c 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -79,8 +79,8 @@
 #define RX_OFFSET		(sizeof(struct ioc3_erxbuf) + NET_IP_ALIGN)
 #define RX_BUF_SIZE		(13 * IOC3_DMA_XFER_LEN)
 
-#define ETCSR_FD   ((17 << ETCSR_IPGR2_SHIFT) | (11 << ETCSR_IPGR1_SHIFT) | 21)
-#define ETCSR_HD   ((21 << ETCSR_IPGR2_SHIFT) | (21 << ETCSR_IPGR1_SHIFT) | 21)
+#define ETCSR_FD   ((21 << ETCSR_IPGR2_SHIFT) | (21 << ETCSR_IPGR1_SHIFT) | 21)
+#define ETCSR_HD   ((17 << ETCSR_IPGR2_SHIFT) | (11 << ETCSR_IPGR1_SHIFT) | 21)
 
 /* Private per NIC data of the driver.  */
 struct ioc3_private {
-- 
2.13.7


^ permalink raw reply related

* Re: [RESEND PATCH 0/5] Add bluetooth support for Orange Pi 3
From: Maxime Ripard @ 2019-08-30  9:21 UTC (permalink / raw)
  To: Marcel Holtmann
  Cc: megous, Chen-Yu Tsai, Rob Herring, Johan Hedberg, Mark Rutland,
	David S. Miller, netdev, devicetree, linux-kernel,
	linux-arm-kernel, linux-bluetooth
In-Reply-To: <5524D5E9-FA82-4244-A91F-78CF1C3FB3FB@holtmann.org>

[-- Attachment #1: Type: text/plain, Size: 1512 bytes --]

Hi Marcel,

On Fri, Aug 30, 2019 at 09:53:16AM +0200, Marcel Holtmann wrote:
> > (Resend to add missing lists, sorry for the noise.)
> >
> > This series implements bluetooth support for Xunlong Orange Pi 3 board.
> >
> > The board uses AP6256 WiFi/BT 5.0 chip.
> >
> > Summary of changes:
> >
> > - add more delay to let initialize the chip
> > - let the kernel detect firmware file path
> > - add new compatible and update dt-bindings
> > - update Orange Pi 3 / H6 DTS
> >
> > Please take a look.
> >
> > thank you and regards,
> >  Ondrej Jirman
> >
> > Ondrej Jirman (5):
> >  dt-bindings: net: Add compatible for BCM4345C5 bluetooth device
> >  bluetooth: bcm: Add support for loading firmware for BCM4345C5
> >  bluetooth: hci_bcm: Give more time to come out of reset
> >  arm64: dts: allwinner: h6: Add pin configs for uart1
> >  arm64: dts: allwinner: orange-pi-3: Enable UART1 / Bluetooth
> >
> > .../bindings/net/broadcom-bluetooth.txt       |  1 +
> > .../dts/allwinner/sun50i-h6-orangepi-3.dts    | 19 +++++++++++++++++++
> > arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi  | 10 ++++++++++
> > drivers/bluetooth/btbcm.c                     |  3 +++
> > drivers/bluetooth/hci_bcm.c                   |  3 ++-
> > 5 files changed, 35 insertions(+), 1 deletion(-)
>
> all 5 patches have been applied to bluetooth-next tree.

The DTS patches (last 2) should go through the arm-soc tree, can you
drop them?

Thanks!
Maxime

--
Maxime Ripard, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 03/14] bnxt_en: Refactor bnxt_sriov_enable().
From: Leon Romanovsky @ 2019-08-30  9:18 UTC (permalink / raw)
  To: Michael Chan; +Cc: davem, netdev, vasundhara-v.volam, jiri, ray.jui
In-Reply-To: <20190826060045.GA4584@mtr-leonro.mtl.com>

On Mon, Aug 26, 2019 at 09:00:45AM +0300, Leon Romanovsky wrote:
> On Sun, Aug 25, 2019 at 11:54:54PM -0400, Michael Chan wrote:
> > Refactor the hardware/firmware configuration portion in
> > bnxt_sriov_enable() into a new function bnxt_cfg_hw_sriov().  This
> > new function can be called after a firmware reset to reconfigure the
> > VFs previously enabled.
>
> I wonder what does it mean for already bound VFs to vfio driver?
> Will you rebind them as well? Can I assume that FW error in one VF
> will trigger "restart" of other VFs too?

Care to reply?

hanks

>
> Thanks

^ permalink raw reply

* Re: [PATCH v2 1/6] mdev: Introduce sha1 based mdev alias
From: Cornelia Huck @ 2019-08-30  9:17 UTC (permalink / raw)
  To: Parav Pandit
  Cc: alex.williamson, jiri, kwankhede, davem, kvm, linux-kernel,
	netdev
In-Reply-To: <20190829111904.16042-2-parav@mellanox.com>

On Thu, 29 Aug 2019 06:18:59 -0500
Parav Pandit <parav@mellanox.com> wrote:

> Some vendor drivers want an identifier for an mdev device that is
> shorter than the UUID, due to length restrictions in the consumers of
> that identifier.
> 
> Add a callback that allows a vendor driver to request an alias of a
> specified length to be generated for an mdev device. If generated,
> that alias is checked for collisions.
> 
> It is an optional attribute.
> mdev alias is generated using sha1 from the mdev name.
> 
> Signed-off-by: Parav Pandit <parav@mellanox.com>
> 
> ---
> Changelog:
> v1->v2:
>  - Kept mdev_device naturally aligned
>  - Added error checking for crypt_*() calls
>  - Corrected a typo from 'and' to 'an'
>  - Changed return type of generate_alias() from int to char*
> v0->v1:
>  - Moved alias length check outside of the parent lock
>  - Moved alias and digest allocation from kvzalloc to kzalloc
>  - &alias[0] changed to alias
>  - alias_length check is nested under get_alias_length callback check
>  - Changed comments to start with an empty line
>  - Fixed cleaunup of hash if mdev_bus_register() fails
>  - Added comment where alias memory ownership is handed over to mdev device
>  - Updated commit log to indicate motivation for this feature
> ---
>  drivers/vfio/mdev/mdev_core.c    | 123 ++++++++++++++++++++++++++++++-
>  drivers/vfio/mdev/mdev_private.h |   5 +-
>  drivers/vfio/mdev/mdev_sysfs.c   |  13 ++--
>  include/linux/mdev.h             |   4 +
>  4 files changed, 135 insertions(+), 10 deletions(-)
> 

(...)

> +static const char *
> +generate_alias(const char *uuid, unsigned int max_alias_len)
> +{
> +	struct shash_desc *hash_desc;
> +	unsigned int digest_size;
> +	unsigned char *digest;
> +	unsigned int alias_len;
> +	char *alias;
> +	int ret;
> +
> +	/*
> +	 * Align to multiple of 2 as bin2hex will generate
> +	 * even number of bytes.
> +	 */
> +	alias_len = roundup(max_alias_len, 2);
> +	alias = kzalloc(alias_len + 1, GFP_KERNEL);

This function allocates alias...

> +	if (!alias)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/* Allocate and init descriptor */
> +	hash_desc = kvzalloc(sizeof(*hash_desc) +
> +			     crypto_shash_descsize(alias_hash),
> +			     GFP_KERNEL);
> +	if (!hash_desc) {
> +		ret = -ENOMEM;
> +		goto desc_err;
> +	}
> +
> +	hash_desc->tfm = alias_hash;
> +
> +	digest_size = crypto_shash_digestsize(alias_hash);
> +
> +	digest = kzalloc(digest_size, GFP_KERNEL);
> +	if (!digest) {
> +		ret = -ENOMEM;
> +		goto digest_err;
> +	}
> +	ret = crypto_shash_init(hash_desc);
> +	if (ret)
> +		goto hash_err;
> +
> +	ret = crypto_shash_update(hash_desc, uuid, UUID_STRING_LEN);
> +	if (ret)
> +		goto hash_err;
> +
> +	ret = crypto_shash_final(hash_desc, digest);
> +	if (ret)
> +		goto hash_err;
> +
> +	bin2hex(alias, digest, min_t(unsigned int, digest_size, alias_len / 2));
> +	/*
> +	 * When alias length is odd, zero out an additional last byte
> +	 * that bin2hex has copied.
> +	 */
> +	if (max_alias_len % 2)
> +		alias[max_alias_len] = 0;
> +
> +	kfree(digest);
> +	kvfree(hash_desc);
> +	return alias;

...and returns it here on success...

> +
> +hash_err:
> +	kfree(digest);
> +digest_err:
> +	kvfree(hash_desc);
> +desc_err:
> +	kfree(alias);
> +	return ERR_PTR(ret);
> +}
> +
> +int mdev_device_create(struct kobject *kobj, struct device *dev,
> +		       const char *uuid_str, const guid_t *uuid)
>  {
>  	int ret;
>  	struct mdev_device *mdev, *tmp;
>  	struct mdev_parent *parent;
>  	struct mdev_type *type = to_mdev_type(kobj);
> +	const char *alias = NULL;
>  
>  	parent = mdev_get_parent(type->parent);
>  	if (!parent)
>  		return -EINVAL;
>  
> +	if (parent->ops->get_alias_length) {
> +		unsigned int alias_len;
> +
> +		alias_len = parent->ops->get_alias_length();
> +		if (alias_len) {
> +			alias = generate_alias(uuid_str, alias_len);

...to be saved into a local variable here...

> +			if (IS_ERR(alias)) {
> +				ret = PTR_ERR(alias);
> +				goto alias_fail;
> +			}
> +		}
> +	}
>  	mutex_lock(&mdev_list_lock);
>  
>  	/* Check for duplicate */
> @@ -300,6 +398,12 @@ int mdev_device_create(struct kobject *kobj,
>  	}
>  
>  	guid_copy(&mdev->uuid, uuid);
> +	mdev->alias = alias;

...and reassigned to the mdev member here...

> +	/*
> +	 * At this point alias memory is owned by the mdev.
> +	 * Mark it NULL, so that only mdev can free it.
> +	 */
> +	alias = NULL;

...and detached from the local variable here. Who is freeing it? The
comment states that it is done by the mdev, but I don't see it?

This detour via the local variable looks weird to me. Can you either
create the alias directly in the mdev (would need to happen later in
the function, but I'm not sure why you generate the alias before
checking for duplicates anyway), or do an explicit copy?

>  	list_add(&mdev->next, &mdev_list);
>  	mutex_unlock(&mdev_list_lock);
>  
> @@ -346,6 +450,8 @@ int mdev_device_create(struct kobject *kobj,
>  	up_read(&parent->unreg_sem);
>  	put_device(&mdev->dev);
>  mdev_fail:
> +	kfree(alias);
> +alias_fail:
>  	mdev_put_parent(parent);
>  	return ret;
>  }

(...)

^ permalink raw reply

* Re: [PATCH net] dev: Delay the free of the percpu refcount
From: Eric Dumazet @ 2019-08-30  9:16 UTC (permalink / raw)
  To: Subash Abhinov Kasiviswanathan, eric.dumazet, davem, netdev
  Cc: Sean Tranchetti
In-Reply-To: <1567142596-25923-1-git-send-email-subashab@codeaurora.org>



On 8/30/19 7:23 AM, Subash Abhinov Kasiviswanathan wrote:
> While running stress-ng on an ARM64 kernel, the following oops
> was observedi -
> 
> 44837.761523:   <6> Unable to handle kernel paging request at
>                      virtual address 0000004a88287000
> 44837.761651:   <2> pc : in_dev_finish_destroy+0x4c/0xc8
> 44837.761654:   <2> lr : in_dev_finish_destroy+0x2c/0xc8
> 44837.762393:   <2> Call trace:
> 44837.762398:   <2>  in_dev_finish_destroy+0x4c/0xc8
> 44837.762404:   <2>  in_dev_rcu_put+0x24/0x30
> 44837.762412:   <2>  rcu_nocb_kthread+0x43c/0x468
> 44837.762418:   <2>  kthread+0x118/0x128
> 44837.762424:   <2>  ret_from_fork+0x10/0x1c
> 
> Prior to this, it appeared as if some of the inet6_dev allocations
> were failing. From the memory dump, the last operation performed
> was dev_put(), however the pcpu_refcnt was NULL while the
> reg_state = NETREG_RELEASED. Effectively, the refcount memory was
> freed in free_netdev() before the last reference was dropped.
> 
> Fix this by freeing the memory after all references are dropped and
> before the dev memory itself is freed.
> 
> Fixes: 29b4433d991c ("net: percpu net_device refcount")
> Cc: Sean Tranchetti <stranche@codeaurora.org>
> Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
> ---
>  net/core/dev.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 49589ed..bce40d8 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -9128,6 +9128,9 @@ void netdev_freemem(struct net_device *dev)
>  {
>  	char *addr = (char *)dev - dev->padded;
>  
> +	free_percpu(dev->pcpu_refcnt);
> +	dev->pcpu_refcnt = NULL;
> +
>  	kvfree(addr);
>  }
>  
> @@ -9272,9 +9275,6 @@ void free_netdev(struct net_device *dev)
>  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
>  		netif_napi_del(p);
>  
> -	free_percpu(dev->pcpu_refcnt);
> -	dev->pcpu_refcnt = NULL;
> -
>  	/*  Compatibility with error handling in drivers */
>  	if (dev->reg_state == NETREG_UNINITIALIZED) {
>  		netdev_freemem(dev);
> 

This looks bogus.

Whatever layer tries to access dev refcnt after free_netdev() has been called is buggy.

I would rather trap early and fix the root cause.

Untested patch :

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b5d28dadf964..8080f1305417 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3723,6 +3723,7 @@ void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
+       BUG_ON(!dev->pcpu_refcnt);
        this_cpu_dec(*dev->pcpu_refcnt);
 }
 
@@ -3734,6 +3735,7 @@ static inline void dev_put(struct net_device *dev)
  */
 static inline void dev_hold(struct net_device *dev)
 {
+       BUG_ON(!dev->pcpu_refcnt);
        this_cpu_inc(*dev->pcpu_refcnt);
 }
 


^ permalink raw reply related

* [PATCH][V2] wimax/i2400m: remove debug containing bogus calculation of index
From: Colin King @ 2019-08-30  9:07 UTC (permalink / raw)
  To: Inaky Perez-Gonzalez, linux-wimax, David S . Miller, netdev
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

The subtraction of the two pointers is automatically scaled by the
size of the size of the object the pointers point to, so the division
by sizeof(*i2400m->barker) is incorrect.  This has been broken since
day one of the driver and is only debug, so remove the debug completely.

Also move && in condition to clean up a checkpatch warning.

Addresses-Coverity: ("Extra sizeof expression")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---

V2: completely remove debug, clean up checkpatch warning, change subject line

---
 drivers/net/wimax/i2400m/fw.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wimax/i2400m/fw.c b/drivers/net/wimax/i2400m/fw.c
index 489cba9b284d..6c9a41bff2e0 100644
--- a/drivers/net/wimax/i2400m/fw.c
+++ b/drivers/net/wimax/i2400m/fw.c
@@ -397,14 +397,9 @@ int i2400m_is_boot_barker(struct i2400m *i2400m,

 	/* Short circuit if we have already discovered the barker
 	 * associated with the device. */
-	if (i2400m->barker
-	    && !memcmp(buf, i2400m->barker, sizeof(i2400m->barker->data))) {
-		unsigned index = (i2400m->barker - i2400m_barker_db)
-			/ sizeof(*i2400m->barker);
-		d_printf(2, dev, "boot barker cache-confirmed #%u/%08x\n",
-			 index, le32_to_cpu(i2400m->barker->data[0]));
+	if (i2400m->barker &&
+	    !memcmp(buf, i2400m->barker, sizeof(i2400m->barker->data)))
 		return 0;
-	}

 	for (i = 0; i < i2400m_barker_db_used; i++) {
 		barker = &i2400m_barker_db[i];
-- 
2.20.1

^ permalink raw reply related

* Re: [PATCH 0/4 net-next] flow_offload: update mangle action representation
From: Pablo Neira Ayuso @ 2019-08-30  9:07 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netfilter-devel, davem, netdev, vishal, saeedm, jiri
In-Reply-To: <20190829185448.0b502af8@cakuba.netronome.com>

On Thu, Aug 29, 2019 at 06:54:48PM -0700, Jakub Kicinski wrote:
> On Fri, 30 Aug 2019 02:53:32 +0200, Pablo Neira Ayuso wrote:
> > * Offsets do not need to be on the 32-bits boundaries anymore. This
> >   patchset adds front-end code to adjust the offset and length coming
> >   from the tc pedit representation, so drivers get an exact header field
> >   offset and length.
> 
> But drivers use offsetof(start of field) to match headers, and I don't
> see you changing that. So how does this work then?

Drivers can only use offsetof() for fields that are on the 32-bits
boundary.

Before this patchset, if you want to mangle the destination port, then
the driver needs to refer to the source port offset and the length is
4 bytes, so the mask is telling what needs to be mangled.

After this patchset, the offset is set to the destination port, the
length is set to 2-bytes, and the mask is telling what bytes of the
destination port field you specifically want to update.

It's just 100 LOC of preprocessing that is simplifying driver
codebase.

> Say - I want to change the second byte of an IPv4 address.

Then, the front-end sets the offset to IPv4 address header field, and
the mask tells what to update.

> > * The front-end coalesces consecutive pedit actions into one single
> >   word, so drivers can mangle IPv6 and ethernet address fields in one
> >   single go.
> 
> You still only coalesce up to 16 bytes, no?

You only have to rise FLOW_ACTION_MANGLE_MAXLEN coming in this patch
if you need more. I don't know of any packet field larger than 16
bytes. If there is a use-case for this, it should be easy to rise that
definition.

> As I said previously drivers will continue to implement mangle action
> merge code if that's the case. It'd be nice if core did the coalescing,
> and mark down first and last action, in case there is a setup cost for
> rewrite group.

In this patchset, the core front-end is doing the coalescing.

^ permalink raw reply

* Re: auto-split of commit. Was: [PATCH bpf-next 04/10] tools/bpf: add libbpf_prog_type_(from|to)_str helpers
From: Toke Høiland-Jørgensen @ 2019-08-30  9:01 UTC (permalink / raw)
  To: Greg Kroah-Hartman, Alexei Starovoitov
  Cc: Jakub Kicinski, Julia Kartseva, ast, Thomas Gleixner, rdna, bpf,
	daniel, netdev, kernel-team
In-Reply-To: <20190830064722.GJ15257@kroah.com>

Greg Kroah-Hartman <gregkh@linuxfoundation.org> writes:

> On Thu, Aug 29, 2019 at 10:16:56AM -0700, Alexei Starovoitov wrote:
>> On Thu, Aug 29, 2019 at 08:51:51AM +0200, Greg Kroah-Hartman wrote:
>> > On Wed, Aug 28, 2019 at 04:46:28PM -0700, Alexei Starovoitov wrote:
>> > > On Wed, Aug 28, 2019 at 04:34:22PM -0700, Jakub Kicinski wrote:
>> > > > 
>> > > > Greg, Thomas, libbpf is extracted from the kernel sources and
>> > > > maintained in a clone repo on GitHub for ease of packaging.
>> > > > 
>> > > > IIUC Alexei's concern is that since we are moving the commits from
>> > > > the kernel repo to the GitHub one we have to preserve the commits
>> > > > exactly as they are, otherwise SOB lines lose their power.
>> > > > 
>> > > > Can you provide some guidance on whether that's a valid concern, 
>> > > > or whether it's perfectly fine to apply a partial patch?
>> > > 
>> > > Right. That's exactly the concern.
>> > > 
>> > > Greg, Thomas,
>> > > could you please put your legal hat on and clarify the following.
>> > > Say some developer does a patch that modifies
>> > > include/uapi/linux/bpf.h
>> > > ..some other kernel code...and
>> > > tools/include/uapi/linux/bpf.h
>> > > 
>> > > That tools/include/uapi/linux/bpf.h is used by perf and by libbpf.
>> > > We have automatic mirror of tools/libbpf into github/libbpf/
>> > > so that external projects and can do git submodule of it,
>> > > can build packages out of it, etc.
>> > > 
>> > > The question is whether it's ok to split tools/* part out of
>> > > original commit, keep Author and SOB, create new commit out of it,
>> > > and automatically push that auto-generated commit into github mirror.
>> > 
>> > Note, I am not a laywer, and am not _your_ lawyer either, only _your_
>> > lawyer can answer questions as to what is best for you.
>> > 
>> > That being said, from a "are you keeping the correct authorship info",
>> > yes, it sounds like you are doing the correct thing here.
>> > 
>> > Look at what I do for stable kernels, I take the original commit and add
>> > it to "another tree" keeping the original author and s-o-b chain intact,
>> > and adding a "this is the original git commit id" type message to the
>> > changelog text so that people can link it back to the original.
>> 
>> I think you're describing 'git cherry-pick -x'.
>
> Well, my scripts don't use git, but yes, it's much the same thing :)
>
>> The question was about taking pieces of the original commit. Not the whole commit.
>> Author field obviously stays, but SOB is questionable.
>
> sob matters to the file the commit is touching, and if it is identical
> to the original file (including same license), then it should be fine.
>
>> If author meant to change X and Y and Z. Silently taking only Z chunk of the diff
>> doesn't quite seem right.
>
> It can be confusing, I agree.
>
>> If we document that such commit split happens in Documentation/bpf/bpf_devel_QA.rst
>> do you think it will be enough to properly inform developers?
>> The main concern is the surprise factor when people start seeing their commits
>> in the mirror, but not their full commits.
>
> Personally, I wouldn't care, but maybe you should just enforce the fact
> that the original patch should ONLY touch Z, and not X and Y in the same
> patch, to make this a lot more obvious.
>
> Patches should only be doing "one logical thing" in the first place, but
> maybe you also need to touch other things when doing a change that you
> can't do this, I really do not know, sorry.

As someone who has been asked to split otherwise logically consistent
patches to accommodate the sync, I would very much appreciate it if the
process could be set up so this was not necessary :)

-Toke

^ permalink raw reply

* Re: [PATCH net-next v2 3/3] net: tls: export protocol version, cipher, tx_conf/rx_conf to socket diag
From: Davide Caratti @ 2019-08-30  8:56 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: borisp, Eric Dumazet, aviadye, davejwatson, davem, john.fastabend,
	Matthieu Baerts, netdev
In-Reply-To: <20190829145642.3f3de3ae@cakuba.netronome.com>

On Thu, 2019-08-29 at 14:56 -0700, Jakub Kicinski wrote:
> On Thu, 29 Aug 2019 18:48:04 +0200, Davide Caratti wrote:

[...]
> > @@ -431,6 +431,25 @@ static inline bool is_tx_ready(struct tls_sw_context_tx *ctx)
> >  	return READ_ONCE(rec->tx_ready);
> >  }
> >  
> > +static inline u16 tls_user_config(struct tls_context *ctx, bool tx)
> > +{
> > +	u16 config = tx ? ctx->tx_conf : ctx->rx_conf;
> > +
> > +	switch (config) {
> > +	case TLS_BASE:
> > +		return TLS_CONF_BASE;
> > +	case TLS_SW:
> > +		return TLS_CONF_SW;
> > +#ifdef CONFIG_TLS_DEVICE
> 
> Recently the TLS_HW define was taken out of the ifdef, so the ifdef
> around this is no longer necessary.

since the value of 'ctx->tx_conf' is always assigned/compared to 'TLS_HW'
under #ifdef CONFIG_TLS_DEVICE, the diag code will never reach that label 
when CONFIG_TLS_DEVICE is unset.
On the other hand, I'm ok for avoiding the #ifdefs unless they are really 
needed _ and probably IS_ENABLED() won't improve anything here, so I will 
just remove the #ifdef in series v3.

[...]

> > @@ -835,6 +836,67 @@ static void tls_update(struct sock *sk, struct proto *p)
> >  	}
> >  }
> >  
> > +static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
> > +{
> > +	struct tls_context *ctx;
> > +	u16 version, cipher_type;
> 
> Unfortunately revere christmas tree will be needed :(

that's due :) I will fix in series v3.

thanks!
-- 
davide



^ permalink raw reply

* Re: Is bug 200755 in anyone's queue??
From: Eric Dumazet @ 2019-08-30  8:54 UTC (permalink / raw)
  To: Willem de Bruijn, Steve Zabele
  Cc: Network Development, shum, vladimir116, saifi.khan, saifi.khan,
	Daniel Borkmann, on2k16nm, Stephen Hemminger
In-Reply-To: <CA+FuTSdu5inPWp_jkUcFnb-Fs-rdk0AMiieCYtjLE7Qs5oFWZQ@mail.gmail.com>



On 8/29/19 9:26 PM, Willem de Bruijn wrote:

> SO_REUSEPORT was not intended to be used in this way. Opening
> multiple connected sockets with the same local port.
> 
> But since the interface allowed connect after joining a group, and
> that is being used, I guess that point is moot. Still, I'm a bit
> surprised that it ever worked as described.
> 
> Also note that the default distribution algorithm is not round robin
> assignment, but hash based. So multiple consecutive datagrams arriving
> at the same socket is not unexpected.
> 
> I suspect that this quick hack might "work". It seemed to on the
> supplied .c file:
> 
>                   score = compute_score(sk, net, saddr, sport,
>                                         daddr, hnum, dif, sdif);
>                   if (score > badness) {
>   -                       if (sk->sk_reuseport) {
>   +                       if (sk->sk_reuseport && !sk->sk_state !=
> TCP_ESTABLISHED) {
> 
> But a more robust approach, that also works on existing kernels, is to
> swap the default distribution algorithm with a custom BPF based one (
> SO_ATTACH_REUSEPORT_EBPF).
> 

Yes, I suspect that reuseport could still be used by to load-balance incoming packets
targetting the same 4-tuple.

So all sockets would have the same score, and we would select the first socket in
the list (if not applying reuseport hashing)


^ permalink raw reply

* RE: Is bug 200755 in anyone's queue??
From: Steve Zabele @ 2019-08-30  8:53 UTC (permalink / raw)
  To: 'Steve Zabele', 'Willem de Bruijn'
  Cc: 'Network Development', shum, vladimir116, saifi.khan,
	'Daniel Borkmann', on2k16nm, 'Stephen Hemminger',
	mark.keaton
In-Reply-To: <CA+FuTSdu5inPWp_jkUcFnb-Fs-rdk0AMiieCYtjLE7Qs5oFWZQ@mail.gmail.com>

Resending since the last send bounced with this error

The following recipient(s) cannot be reached:

      'saifi.khan@datasynergy.org' on 8/30/2019 4:49 AM
            550 5.1.1 <saifi.khan@datasynergy.org> recipient invalid domain

Sorry for the spam.

Steve

-----Original Message-----
From: Steve Zabele [mailto:zabele@comcast.net] 
Sent: Friday, August 30, 2019 4:49 AM
To: 'Willem de Bruijn'
Cc: 'Network Development'; 'shum@canndrew.org'; 'vladimir116@gmail.com'; 'saifi.khan@datasynergy.org'; 'saifi.khan@strikr.in'; 'Daniel Borkmann'; 'on2k16nm@gmail.com'; 'Stephen Hemminger'; 'mark.keaton@raytheon.com'
Subject: RE: Is bug 200755 in anyone's queue??

Hi Willem!

**Thank you** for the reply and the code segment, very much appreciated.

Can we expect that this will make its way into a near-term official release of the kernel? Our customers are really not up to patching and rebuilding kernels, plus it "taints" the kernel from a security perspective, and whenever there is a new release of the kernel (you come in one morning and your kernel has been magically upgraded for you because you forgot to disable auto updates) you need to rebuild and hope that the previous patch is still good for the new code, etc, etc.

Getting this onto the main branch as part of the official release cycle will be greatly appreciated!

Note that using an ebpf approach can't solve this problem (we know because we tried for quite a while to make it work, no luck). The key issue is that at the point when the ebpf filter gets the packet buffer reference it is pointing to the start of the UDP portion of the packet, and hence is not able to access the IP source address which is earlier in the buffer. Plus every time a new socket is opened or closed, a new epbf has to be created and inserted -- and there is really no good way to figure out which index is (now) associated with which file descriptor.. 

So thank you and the group for your attention to this.

With respect to your comment

>SO_REUSEPORT was not intended to be used in this way. Opening
>multiple connected sockets with the same local port.

I'd like to offer that there are a number of reliable transport protocols (alternatives to TCP) that use UDP. NORM (IETF RFC 5470) and Google's new QUIC protocol (https://www.ietf.org/blog/whats-happening-quic) are good examples.

Now consider that users of these protocols will want to create servers using these protocols -- a webserver is a good example. In fact Google has one running on QUIC, and many Chrome users don't even know they are using QUIC when they access Google webservers.

With a client-server model, clients contact the server at a well known server address and port. Upon first contact from a new client, the server opens another socket with the same local address and port and "connects" to the clients address and ephemeral port so that only traffic for the given five tuple arrives on the new file descriptor -- this allows the server application to keep concurrent sessions with different clients cleanly separated, even though all sessions use the same local server port. In fact, reusing the same port for different sessions is really important from a firewalling perspective,

This is pretty much what our application does, i.e., it uses different sockets/file descriptors to keep sessions straight.

And if it's worth anything, we have been using this mechanism with UDP for a *very* long time, the change in behavior appears to have happened with the 4.5 kernel.

So **thank you**!!

Steve

-----Original Message-----
From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com] 
Sent: Thursday, August 29, 2019 3:27 PM
To: Steve Zabele
Cc: Network Development; shum@canndrew.org; vladimir116@gmail.com; saifi.khan@datasynergy.org; saifi.khan@strikr.in; Daniel Borkmann; on2k16nm@gmail.com; Stephen Hemminger
Subject: Re: Is bug 200755 in anyone's queue??

On Fri, Aug 23, 2019 at 3:11 PM Steve Zabele <zabele@comcast.net> wrote:
>
> Hi folks,
>
> Is there a way to find out where the SO_REUSEPORT bug reported a year ago in
> August (and apparently has been a bug with kernels later than 4.4) is being
> addressed?
>
> The bug characteristics, simple standalone test code demonstrating the bug,
> and an assessment of the likely location/cause of the bug within the kernel
> are all described here
>
> https://bugzilla.kernel.org/show_bug.cgi?id=200755
>
> I'm really hoping this gets fixed so we can move forward on updating our
> kernels/Ubuntu release from our aging 4.4/16.04 release
>
> Thanks!
>
> Steve
>
>
>
> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Tuesday, July 16, 2019 10:03 AM
> To: Steve Zabele
> Cc: shum@canndrew.org; vladimir116@gmail.com; saifi.khan@DataSynergy.org;
> saifi.khan@strikr.in; daniel@iogearbox.net; on2k16nm@gmail.com
> Subject: Re: Is bug 200755 in anyone's queue??
>
> On Tue, 16 Jul 2019 09:43:24 -0400
> "Steve Zabele" <zabele@comcast.net> wrote:
>
>
> > I came across bug report 200755 trying to figure out why some code I had
> > provided to customers a while ago no longer works with the current Linux
> > kernel. See
> >
> > https://bugzilla.kernel.org/show_bug.cgi?id=200755
> >
> > I've verified that, as reported, 'connect' no longer works for UDP.
> > Moreover, it appears it has been broken since the 4.5 kernel has been
> > released.
> >
> >
> >
> > It does also appear that the intended new feature of doing round robin
> > assignments to different UDP sockets opened with SO_REUSEPORT also does
> not
> > work as described.
> >
> >
> >
> > Since the original bug report was made nearly a year ago for the 4.14
> kernel
> > (and the bug is also still present in the 4.15 kernel) I'm curious if
> anyone
> > is on the hook to get this fixed any time soon.
> >
> >
> >
> > I'd rather not have to do my own demultiplexing using a single socket in
> > user space to work around what is clearly a (maybe not so recently
> > introduced) kernel bug if at all possible. My code had worked just fine on
> > 3.X kernels, and appears to work okay up through 4.4.
> >
>
> Kernel developers do not use bugzilla, I forward bug reports
> to netdev@vger.kernel.org (after filtering).

SO_REUSEPORT was not intended to be used in this way. Opening
multiple connected sockets with the same local port.

But since the interface allowed connect after joining a group, and
that is being used, I guess that point is moot. Still, I'm a bit
surprised that it ever worked as described.

Also note that the default distribution algorithm is not round robin
assignment, but hash based. So multiple consecutive datagrams arriving
at the same socket is not unexpected.

I suspect that this quick hack might "work". It seemed to on the
supplied .c file:

                  score = compute_score(sk, net, saddr, sport,
                                        daddr, hnum, dif, sdif);
                  if (score > badness) {
  -                       if (sk->sk_reuseport) {
  +                       if (sk->sk_reuseport && !sk->sk_state !=
TCP_ESTABLISHED) {

But a more robust approach, that also works on existing kernels, is to
swap the default distribution algorithm with a custom BPF based one (
SO_ATTACH_REUSEPORT_EBPF).

^ permalink raw reply

* RE: Is bug 200755 in anyone's queue??
From: Steve Zabele @ 2019-08-30  8:48 UTC (permalink / raw)
  To: 'Willem de Bruijn'
  Cc: 'Network Development', shum, vladimir116, saifi.khan,
	saifi.khan, 'Daniel Borkmann', on2k16nm,
	'Stephen Hemminger', mark.keaton
In-Reply-To: <CA+FuTSdu5inPWp_jkUcFnb-Fs-rdk0AMiieCYtjLE7Qs5oFWZQ@mail.gmail.com>

Hi Willem!

**Thank you** for the reply and the code segment, very much appreciated.

Can we expect that this will make its way into a near-term official release of the kernel? Our customers are really not up to patching and rebuilding kernels, plus it "taints" the kernel from a security perspective, and whenever there is a new release of the kernel (you come in one morning and your kernel has been magically upgraded for you because you forgot to disable auto updates) you need to rebuild and hope that the previous patch is still good for the new code, etc, etc.

Getting this onto the main branch as part of the official release cycle will be greatly appreciated!

Note that using an ebpf approach can't solve this problem (we know because we tried for quite a while to make it work, no luck). The key issue is that at the point when the ebpf filter gets the packet buffer reference it is pointing to the start of the UDP portion of the packet, and hence is not able to access the IP source address which is earlier in the buffer. Plus every time a new socket is opened or closed, a new epbf has to be created and inserted -- and there is really no good way to figure out which index is (now) associated with which file descriptor.. 

So thank you and the group for your attention to this.

With respect to your comment

>SO_REUSEPORT was not intended to be used in this way. Opening
>multiple connected sockets with the same local port.

I'd like to offer that there are a number of reliable transport protocols (alternatives to TCP) that use UDP. NORM (IETF RFC 5470) and Google's new QUIC protocol (https://www.ietf.org/blog/whats-happening-quic) are good examples.

Now consider that users of these protocols will want to create servers using these protocols -- a webserver is a good example. In fact Google has one running on QUIC, and many Chrome users don't even know they are using QUIC when they access Google webservers.

With a client-server model, clients contact the server at a well known server address and port. Upon first contact from a new client, the server opens another socket with the same local address and port and "connects" to the clients address and ephemeral port so that only traffic for the given five tuple arrives on the new file descriptor -- this allows the server application to keep concurrent sessions with different clients cleanly separated, even though all sessions use the same local server port. In fact, reusing the same port for different sessions is really important from a firewalling perspective,

This is pretty much what our application does, i.e., it uses different sockets/file descriptors to keep sessions straight.

And if it's worth anything, we have been using this mechanism with UDP for a *very* long time, the change in behavior appears to have happened with the 4.5 kernel.

So **thank you**!!

Steve

-----Original Message-----
From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com] 
Sent: Thursday, August 29, 2019 3:27 PM
To: Steve Zabele
Cc: Network Development; shum@canndrew.org; vladimir116@gmail.com; saifi.khan@datasynergy.org; saifi.khan@strikr.in; Daniel Borkmann; on2k16nm@gmail.com; Stephen Hemminger
Subject: Re: Is bug 200755 in anyone's queue??

On Fri, Aug 23, 2019 at 3:11 PM Steve Zabele <zabele@comcast.net> wrote:
>
> Hi folks,
>
> Is there a way to find out where the SO_REUSEPORT bug reported a year ago in
> August (and apparently has been a bug with kernels later than 4.4) is being
> addressed?
>
> The bug characteristics, simple standalone test code demonstrating the bug,
> and an assessment of the likely location/cause of the bug within the kernel
> are all described here
>
> https://bugzilla.kernel.org/show_bug.cgi?id=200755
>
> I'm really hoping this gets fixed so we can move forward on updating our
> kernels/Ubuntu release from our aging 4.4/16.04 release
>
> Thanks!
>
> Steve
>
>
>
> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Tuesday, July 16, 2019 10:03 AM
> To: Steve Zabele
> Cc: shum@canndrew.org; vladimir116@gmail.com; saifi.khan@DataSynergy.org;
> saifi.khan@strikr.in; daniel@iogearbox.net; on2k16nm@gmail.com
> Subject: Re: Is bug 200755 in anyone's queue??
>
> On Tue, 16 Jul 2019 09:43:24 -0400
> "Steve Zabele" <zabele@comcast.net> wrote:
>
>
> > I came across bug report 200755 trying to figure out why some code I had
> > provided to customers a while ago no longer works with the current Linux
> > kernel. See
> >
> > https://bugzilla.kernel.org/show_bug.cgi?id=200755
> >
> > I've verified that, as reported, 'connect' no longer works for UDP.
> > Moreover, it appears it has been broken since the 4.5 kernel has been
> > released.
> >
> >
> >
> > It does also appear that the intended new feature of doing round robin
> > assignments to different UDP sockets opened with SO_REUSEPORT also does
> not
> > work as described.
> >
> >
> >
> > Since the original bug report was made nearly a year ago for the 4.14
> kernel
> > (and the bug is also still present in the 4.15 kernel) I'm curious if
> anyone
> > is on the hook to get this fixed any time soon.
> >
> >
> >
> > I'd rather not have to do my own demultiplexing using a single socket in
> > user space to work around what is clearly a (maybe not so recently
> > introduced) kernel bug if at all possible. My code had worked just fine on
> > 3.X kernels, and appears to work okay up through 4.4.
> >
>
> Kernel developers do not use bugzilla, I forward bug reports
> to netdev@vger.kernel.org (after filtering).

SO_REUSEPORT was not intended to be used in this way. Opening
multiple connected sockets with the same local port.

But since the interface allowed connect after joining a group, and
that is being used, I guess that point is moot. Still, I'm a bit
surprised that it ever worked as described.

Also note that the default distribution algorithm is not round robin
assignment, but hash based. So multiple consecutive datagrams arriving
at the same socket is not unexpected.

I suspect that this quick hack might "work". It seemed to on the
supplied .c file:

                  score = compute_score(sk, net, saddr, sport,
                                        daddr, hnum, dif, sdif);
                  if (score > badness) {
  -                       if (sk->sk_reuseport) {
  +                       if (sk->sk_reuseport && !sk->sk_state !=
TCP_ESTABLISHED) {

But a more robust approach, that also works on existing kernels, is to
swap the default distribution algorithm with a custom BPF based one (
SO_ATTACH_REUSEPORT_EBPF).

^ permalink raw reply

* linux-next: manual merge of the staging tree with the net-next and usb trees
From: Stephen Rothwell @ 2019-08-30  8:34 UTC (permalink / raw)
  To: Greg KH, David Miller, Networking
  Cc: Linux Next Mailing List, Linux Kernel Mailing List,
	Benjamin Poirier, Valdis Klētnieks, Sasha Levin

[-- Attachment #1: Type: text/plain, Size: 1819 bytes --]

Hi all,

Today's linux-next merge of the staging tree got conflicts in:

  drivers/staging/Kconfig
  drivers/staging/Makefile

between commits:

  955315b0dc8c ("qlge: Move drivers/net/ethernet/qlogic/qlge/ to drivers/staging/qlge/")
  71ed79b0e4be ("USB: Move wusbcore and UWB to staging as it is obsolete")

from the net-next and usb trees and commit:

  c48c9f7ff32b ("staging: exfat: add exfat filesystem code to staging")

from the staging tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/staging/Kconfig
index fc1420f2a949,fbdc33874780..000000000000
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@@ -120,9 -118,6 +118,11 @@@ source "drivers/staging/kpc2000/Kconfig
  
  source "drivers/staging/isdn/Kconfig"
  
 +source "drivers/staging/qlge/Kconfig"
 +
 +source "drivers/staging/wusbcore/Kconfig"
 +source "drivers/staging/uwb/Kconfig"
 +
+ source "drivers/staging/exfat/Kconfig"
+ 
  endif # STAGING
diff --cc drivers/staging/Makefile
index b08ab677e49b,ca13f87b1e1b..000000000000
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@@ -49,7 -49,4 +49,7 @@@ obj-$(CONFIG_XIL_AXIS_FIFO)	+= axis-fif
  obj-$(CONFIG_FIELDBUS_DEV)     += fieldbus/
  obj-$(CONFIG_KPC2000)		+= kpc2000/
  obj-$(CONFIG_ISDN_CAPI)		+= isdn/
 +obj-$(CONFIG_QLGE)		+= qlge/
 +obj-$(CONFIG_UWB)		+= uwb/
 +obj-$(CONFIG_USB_WUSB)		+= wusbcore/
+ obj-$(CONFIG_EXFAT_FS)		+= exfat/

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH 0/1] Fix deadlock problem and make performance better
From: Zhu Yanjun @ 2019-08-30  8:35 UTC (permalink / raw)
  To: yanjun.zhu, netdev, davem, nan.1986san

When running with about 1Gbit/ses for very long time, running ifconfig
and netstat causes dead lock. These symptoms are similar to the
commit 5f6b4e14cada ("net: dsa: User per-cpu 64-bit statistics"). After
replacing network devices statistics with per-cpu 64-bit statistics,
the dead locks disappear even after very long time running with 1Gbit/sec.

Zhu Yanjun (1):
  forcedeth: use per cpu to collect xmit/recv statistics

 drivers/net/ethernet/nvidia/forcedeth.c | 132 +++++++++++++++++++++-----------
 1 file changed, 88 insertions(+), 44 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH 1/1] forcedeth: use per cpu to collect xmit/recv statistics
From: Zhu Yanjun @ 2019-08-30  8:35 UTC (permalink / raw)
  To: yanjun.zhu, netdev, davem, nan.1986san
In-Reply-To: <1567154111-23315-1-git-send-email-yanjun.zhu@oracle.com>

When testing with a background iperf pushing 1Gbit/sec traffic and running
both ifconfig and netstat to collect statistics, some deadlocks occurred.

Ifconfig and netstat will call nv_get_stats64 to get software xmit/recv
statistics. In the commit f5d827aece36 ("forcedeth: implement
ndo_get_stats64() API"), the normal tx/rx variables is to collect tx/rx
statistics. The fix is to replace normal tx/rx variables with per
cpu 64-bit variable to collect xmit/recv statistics. The per cpu variable
will avoid deadlocks and provide fast efficient statistics updates.

In nv_probe, the per cpu variable is initialized. In nv_remove, this
per cpu variable is freed.

In xmit/recv process, this per cpu variable will be updated.

In nv_get_stats64, this per cpu variable on each cpu is added up. Then
the driver can get xmit/recv packets statistics.

A test runs for several days with this commit, the deadlocks disappear
and the performance is better.

Tested:
	- iperf SMP x86_64 ->
	Client connecting to 1.1.1.108, TCP port 5001
	TCP window size: 85.0 KByte (default)
	------------------------------------------------------------
	[  3] local 1.1.1.105 port 38888 connected with 1.1.1.108 port 5001
	[ ID] Interval       Transfer     Bandwidth
	[  3]  0.0-10.0 sec  1.10 GBytes   943 Mbits/sec

	ifconfig results:

	enp0s9    Link encap:Ethernet  HWaddr 00:21:28:6f:de:0f
		  inet addr:1.1.1.105  Bcast:0.0.0.0  Mask:255.255.255.0
		  UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
		  RX packets:5774764531 errors:0 dropped:0 overruns:0 frame:0
		  TX packets:633534193 errors:0 dropped:0 overruns:0 carrier:0
		  collisions:0 txqueuelen:1000
		  RX bytes:7646159340904 (7.6 TB) TX bytes:11425340407722 (11.4 TB)

	netstat results:

	Kernel Interface table
	Iface MTU Met RX-OK RX-ERR RX-DRP RX-OVR TX-OK TX-ERR TX-DRP TX-OVR Flg
	...
	enp0s9 1500 0  5774764531 0    0 0      633534193      0      0  0 BMRU
	...

Fixes: f5d827aece36 ("forcedeth: implement ndo_get_stats64() API")
CC: Joe Jin <joe.jin@oracle.com>
CC: JUNXIAO_BI <junxiao.bi@oracle.com>
Reported-and-tested-by: Nan san <nan.1986san@gmail.com>
Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c | 132 +++++++++++++++++++++-----------
 1 file changed, 88 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index b327b29..ee8bb9d 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -713,6 +713,21 @@ struct nv_skb_map {
 	struct nv_skb_map *next_tx_ctx;
 };
 
+struct nv_txrx_stats {
+	u64 stat_rx_packets;
+	u64 stat_rx_bytes; /* not always available in HW */
+	u64 stat_rx_missed_errors;
+	u64 stat_rx_dropped;
+	u64 stat_tx_packets; /* not always available in HW */
+	u64 stat_tx_bytes;
+	u64 stat_tx_dropped;
+};
+
+#define nv_txrx_stats_inc(member) \
+		__this_cpu_inc(np->txrx_stats->member)
+#define nv_txrx_stats_add(member, count) \
+		__this_cpu_add(np->txrx_stats->member, (count))
+
 /*
  * SMP locking:
  * All hardware access under netdev_priv(dev)->lock, except the performance
@@ -797,10 +812,7 @@ struct fe_priv {
 
 	/* RX software stats */
 	struct u64_stats_sync swstats_rx_syncp;
-	u64 stat_rx_packets;
-	u64 stat_rx_bytes; /* not always available in HW */
-	u64 stat_rx_missed_errors;
-	u64 stat_rx_dropped;
+	struct nv_txrx_stats __percpu *txrx_stats;
 
 	/* media detection workaround.
 	 * Locking: Within irq hander or disable_irq+spin_lock(&np->lock);
@@ -826,9 +838,6 @@ struct fe_priv {
 
 	/* TX software stats */
 	struct u64_stats_sync swstats_tx_syncp;
-	u64 stat_tx_packets; /* not always available in HW */
-	u64 stat_tx_bytes;
-	u64 stat_tx_dropped;
 
 	/* msi/msi-x fields */
 	u32 msi_flags;
@@ -1721,6 +1730,28 @@ static void nv_update_stats(struct net_device *dev)
 	}
 }
 
+static inline void nv_get_stats(int cpu, struct fe_priv *np,
+				struct rtnl_link_stats64 *storage)
+{
+	struct nv_txrx_stats *src = per_cpu_ptr(np->txrx_stats, cpu);
+	unsigned int syncp_start;
+
+	do {
+		syncp_start = u64_stats_fetch_begin_irq(&np->swstats_rx_syncp);
+		storage->rx_packets       += src->stat_rx_packets;
+		storage->rx_bytes         += src->stat_rx_bytes;
+		storage->rx_dropped       += src->stat_rx_dropped;
+		storage->rx_missed_errors += src->stat_rx_missed_errors;
+	} while (u64_stats_fetch_retry_irq(&np->swstats_rx_syncp, syncp_start));
+
+	do {
+		syncp_start = u64_stats_fetch_begin_irq(&np->swstats_tx_syncp);
+		storage->tx_packets += src->stat_tx_packets;
+		storage->tx_bytes   += src->stat_tx_bytes;
+		storage->tx_dropped += src->stat_tx_dropped;
+	} while (u64_stats_fetch_retry_irq(&np->swstats_tx_syncp, syncp_start));
+}
+
 /*
  * nv_get_stats64: dev->ndo_get_stats64 function
  * Get latest stats value from the nic.
@@ -1733,7 +1764,7 @@ nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
 	__releases(&netdev_priv(dev)->hwstats_lock)
 {
 	struct fe_priv *np = netdev_priv(dev);
-	unsigned int syncp_start;
+	int cpu;
 
 	/*
 	 * Note: because HW stats are not always available and for
@@ -1746,20 +1777,8 @@ nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
 	 */
 
 	/* software stats */
-	do {
-		syncp_start = u64_stats_fetch_begin_irq(&np->swstats_rx_syncp);
-		storage->rx_packets       = np->stat_rx_packets;
-		storage->rx_bytes         = np->stat_rx_bytes;
-		storage->rx_dropped       = np->stat_rx_dropped;
-		storage->rx_missed_errors = np->stat_rx_missed_errors;
-	} while (u64_stats_fetch_retry_irq(&np->swstats_rx_syncp, syncp_start));
-
-	do {
-		syncp_start = u64_stats_fetch_begin_irq(&np->swstats_tx_syncp);
-		storage->tx_packets = np->stat_tx_packets;
-		storage->tx_bytes   = np->stat_tx_bytes;
-		storage->tx_dropped = np->stat_tx_dropped;
-	} while (u64_stats_fetch_retry_irq(&np->swstats_tx_syncp, syncp_start));
+	for_each_online_cpu(cpu)
+		nv_get_stats(cpu, np, storage);
 
 	/* If the nic supports hw counters then retrieve latest values */
 	if (np->driver_data & DEV_HAS_STATISTICS_V123) {
@@ -1827,7 +1846,7 @@ static int nv_alloc_rx(struct net_device *dev)
 		} else {
 packet_dropped:
 			u64_stats_update_begin(&np->swstats_rx_syncp);
-			np->stat_rx_dropped++;
+			nv_txrx_stats_inc(stat_rx_dropped);
 			u64_stats_update_end(&np->swstats_rx_syncp);
 			return 1;
 		}
@@ -1869,7 +1888,7 @@ static int nv_alloc_rx_optimized(struct net_device *dev)
 		} else {
 packet_dropped:
 			u64_stats_update_begin(&np->swstats_rx_syncp);
-			np->stat_rx_dropped++;
+			nv_txrx_stats_inc(stat_rx_dropped);
 			u64_stats_update_end(&np->swstats_rx_syncp);
 			return 1;
 		}
@@ -2013,7 +2032,7 @@ static void nv_drain_tx(struct net_device *dev)
 		}
 		if (nv_release_txskb(np, &np->tx_skb[i])) {
 			u64_stats_update_begin(&np->swstats_tx_syncp);
-			np->stat_tx_dropped++;
+			nv_txrx_stats_inc(stat_tx_dropped);
 			u64_stats_update_end(&np->swstats_tx_syncp);
 		}
 		np->tx_skb[i].dma = 0;
@@ -2227,7 +2246,7 @@ static netdev_tx_t nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			/* on DMA mapping error - drop the packet */
 			dev_kfree_skb_any(skb);
 			u64_stats_update_begin(&np->swstats_tx_syncp);
-			np->stat_tx_dropped++;
+			nv_txrx_stats_inc(stat_tx_dropped);
 			u64_stats_update_end(&np->swstats_tx_syncp);
 			return NETDEV_TX_OK;
 		}
@@ -2273,7 +2292,7 @@ static netdev_tx_t nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 				dev_kfree_skb_any(skb);
 				np->put_tx_ctx = start_tx_ctx;
 				u64_stats_update_begin(&np->swstats_tx_syncp);
-				np->stat_tx_dropped++;
+				nv_txrx_stats_inc(stat_tx_dropped);
 				u64_stats_update_end(&np->swstats_tx_syncp);
 				return NETDEV_TX_OK;
 			}
@@ -2384,7 +2403,7 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb,
 			/* on DMA mapping error - drop the packet */
 			dev_kfree_skb_any(skb);
 			u64_stats_update_begin(&np->swstats_tx_syncp);
-			np->stat_tx_dropped++;
+			nv_txrx_stats_inc(stat_tx_dropped);
 			u64_stats_update_end(&np->swstats_tx_syncp);
 			return NETDEV_TX_OK;
 		}
@@ -2431,7 +2450,7 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb,
 				dev_kfree_skb_any(skb);
 				np->put_tx_ctx = start_tx_ctx;
 				u64_stats_update_begin(&np->swstats_tx_syncp);
-				np->stat_tx_dropped++;
+				nv_txrx_stats_inc(stat_tx_dropped);
 				u64_stats_update_end(&np->swstats_tx_syncp);
 				return NETDEV_TX_OK;
 			}
@@ -2560,9 +2579,12 @@ static int nv_tx_done(struct net_device *dev, int limit)
 					    && !(flags & NV_TX_RETRYCOUNT_MASK))
 						nv_legacybackoff_reseed(dev);
 				} else {
+					unsigned int len;
+
 					u64_stats_update_begin(&np->swstats_tx_syncp);
-					np->stat_tx_packets++;
-					np->stat_tx_bytes += np->get_tx_ctx->skb->len;
+					nv_txrx_stats_inc(stat_tx_packets);
+					len = np->get_tx_ctx->skb->len;
+					nv_txrx_stats_add(stat_tx_bytes, len);
 					u64_stats_update_end(&np->swstats_tx_syncp);
 				}
 				bytes_compl += np->get_tx_ctx->skb->len;
@@ -2577,9 +2599,12 @@ static int nv_tx_done(struct net_device *dev, int limit)
 					    && !(flags & NV_TX2_RETRYCOUNT_MASK))
 						nv_legacybackoff_reseed(dev);
 				} else {
+					unsigned int len;
+
 					u64_stats_update_begin(&np->swstats_tx_syncp);
-					np->stat_tx_packets++;
-					np->stat_tx_bytes += np->get_tx_ctx->skb->len;
+					nv_txrx_stats_inc(stat_tx_packets);
+					len = np->get_tx_ctx->skb->len;
+					nv_txrx_stats_add(stat_tx_bytes, len);
 					u64_stats_update_end(&np->swstats_tx_syncp);
 				}
 				bytes_compl += np->get_tx_ctx->skb->len;
@@ -2627,9 +2652,12 @@ static int nv_tx_done_optimized(struct net_device *dev, int limit)
 						nv_legacybackoff_reseed(dev);
 				}
 			} else {
+				unsigned int len;
+
 				u64_stats_update_begin(&np->swstats_tx_syncp);
-				np->stat_tx_packets++;
-				np->stat_tx_bytes += np->get_tx_ctx->skb->len;
+				nv_txrx_stats_inc(stat_tx_packets);
+				len = np->get_tx_ctx->skb->len;
+				nv_txrx_stats_add(stat_tx_bytes, len);
 				u64_stats_update_end(&np->swstats_tx_syncp);
 			}
 
@@ -2806,6 +2834,15 @@ static int nv_getlen(struct net_device *dev, void *packet, int datalen)
 	}
 }
 
+static inline void rx_missing_handler(u32 flags, struct fe_priv *np)
+{
+	if (flags & NV_RX_MISSEDFRAME) {
+		u64_stats_update_begin(&np->swstats_rx_syncp);
+		nv_txrx_stats_inc(stat_rx_missed_errors);
+		u64_stats_update_end(&np->swstats_rx_syncp);
+	}
+}
+
 static int nv_rx_process(struct net_device *dev, int limit)
 {
 	struct fe_priv *np = netdev_priv(dev);
@@ -2848,11 +2885,7 @@ static int nv_rx_process(struct net_device *dev, int limit)
 					}
 					/* the rest are hard errors */
 					else {
-						if (flags & NV_RX_MISSEDFRAME) {
-							u64_stats_update_begin(&np->swstats_rx_syncp);
-							np->stat_rx_missed_errors++;
-							u64_stats_update_end(&np->swstats_rx_syncp);
-						}
+						rx_missing_handler(flags, np);
 						dev_kfree_skb(skb);
 						goto next_pkt;
 					}
@@ -2896,8 +2929,8 @@ static int nv_rx_process(struct net_device *dev, int limit)
 		skb->protocol = eth_type_trans(skb, dev);
 		napi_gro_receive(&np->napi, skb);
 		u64_stats_update_begin(&np->swstats_rx_syncp);
-		np->stat_rx_packets++;
-		np->stat_rx_bytes += len;
+		nv_txrx_stats_inc(stat_rx_packets);
+		nv_txrx_stats_add(stat_rx_bytes, len);
 		u64_stats_update_end(&np->swstats_rx_syncp);
 next_pkt:
 		if (unlikely(np->get_rx.orig++ == np->last_rx.orig))
@@ -2982,8 +3015,8 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
 			}
 			napi_gro_receive(&np->napi, skb);
 			u64_stats_update_begin(&np->swstats_rx_syncp);
-			np->stat_rx_packets++;
-			np->stat_rx_bytes += len;
+			nv_txrx_stats_inc(stat_rx_packets);
+			nv_txrx_stats_add(stat_rx_bytes, len);
 			u64_stats_update_end(&np->swstats_rx_syncp);
 		} else {
 			dev_kfree_skb(skb);
@@ -5651,6 +5684,12 @@ static int nv_probe(struct pci_dev *pci_dev, const struct pci_device_id *id)
 	SET_NETDEV_DEV(dev, &pci_dev->dev);
 	u64_stats_init(&np->swstats_rx_syncp);
 	u64_stats_init(&np->swstats_tx_syncp);
+	np->txrx_stats = alloc_percpu(struct nv_txrx_stats);
+	if (!np->txrx_stats) {
+		pr_err("np->txrx_stats, alloc memory error.\n");
+		err = -ENOMEM;
+		goto out_alloc_percpu;
+	}
 
 	timer_setup(&np->oom_kick, nv_do_rx_refill, 0);
 	timer_setup(&np->nic_poll, nv_do_nic_poll, 0);
@@ -6060,6 +6099,8 @@ static int nv_probe(struct pci_dev *pci_dev, const struct pci_device_id *id)
 out_disable:
 	pci_disable_device(pci_dev);
 out_free:
+	free_percpu(np->txrx_stats);
+out_alloc_percpu:
 	free_netdev(dev);
 out:
 	return err;
@@ -6105,6 +6146,9 @@ static void nv_restore_mac_addr(struct pci_dev *pci_dev)
 static void nv_remove(struct pci_dev *pci_dev)
 {
 	struct net_device *dev = pci_get_drvdata(pci_dev);
+	struct fe_priv *np = netdev_priv(dev);
+
+	free_percpu(np->txrx_stats);
 
 	unregister_netdev(dev);
 
-- 
2.7.4


^ permalink raw reply related

* [patch net-next v2] mlx5: Add missing init_net check in FIB notifier
From: Jiri Pirko @ 2019-08-30  8:25 UTC (permalink / raw)
  To: netdev; +Cc: davem, saeedm, leon, roid, mlxsw

From: Jiri Pirko <jiri@mellanox.com>

Take only FIB events that are happening in init_net into account. No other
namespaces are supported.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2:
- no change, just cced maintainers (fat finger made me avoid them in v1)
---
 drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
index e69766393990..5d20d615663e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
@@ -248,6 +248,9 @@ static int mlx5_lag_fib_event(struct notifier_block *nb,
 	struct net_device *fib_dev;
 	struct fib_info *fi;
 
+	if (!net_eq(info->net, &init_net))
+		return NOTIFY_DONE;
+
 	if (info->family != AF_INET)
 		return NOTIFY_DONE;
 
-- 
2.21.0


^ permalink raw reply related

* [patch net-next] mlx5: Add missing init_net check in FIB notifier
From: Jiri Pirko @ 2019-08-30  8:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, mlxsw

From: Jiri Pirko <jiri@mellanox.com>

Take only FIB events that are happening in init_net into account. No other
namespaces are supported.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
index e69766393990..5d20d615663e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
@@ -248,6 +248,9 @@ static int mlx5_lag_fib_event(struct notifier_block *nb,
 	struct net_device *fib_dev;
 	struct fib_info *fi;
 
+	if (!net_eq(info->net, &init_net))
+		return NOTIFY_DONE;
+
 	if (info->family != AF_INET)
 		return NOTIFY_DONE;
 
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v2 net-next 05/15] net: sgi: ioc3-eth: allocate space for desc rings only once
From: Thomas Bogendoerfer @ 2019-08-30  8:09 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190829150504.68a04fe4@cakuba.netronome.com>

On Thu, 29 Aug 2019 15:05:04 -0700
Jakub Kicinski <jakub.kicinski@netronome.com> wrote:

> On Fri, 30 Aug 2019 00:00:58 +0200, Thomas Bogendoerfer wrote:

> > Out of curiosity does kcalloc/kmalloc_array give me the same guarantees about
> > alignment ? rx ring needs to be 4KB aligned, tx ring 16KB aligned.
> 
> I don't think so, actually, I was mostly worried you are passing
> address from get_page() into kfree() here ;) But patch 11 cures that,
> so that's good, too.

I realized that after sending my last mail. I'll fix that in v3 even
it's just a transient bug.

Thomas.

-- 
SUSE Software Solutions Germany GmbH
HRB 247165 (AG München)
Geschäftsführer: Felix Imendörffer

^ permalink raw reply

* Re: [PATCH v3 1/2] net: core: Notify on changes to dev->promiscuity.
From: Jiri Pirko @ 2019-08-30  8:01 UTC (permalink / raw)
  To: David Miller
  Cc: idosch, andrew, horatiu.vultur, alexandre.belloni, UNGLinuxDriver,
	allan.nielsen, ivecera, f.fainelli, netdev, linux-kernel
In-Reply-To: <20190830.003225.292019185488425085.davem@davemloft.net>

Fri, Aug 30, 2019 at 09:32:25AM CEST, davem@davemloft.net wrote:
>From: Jiri Pirko <jiri@resnulli.us>
>Date: Fri, 30 Aug 2019 09:21:33 +0200
>
>> Fri, Aug 30, 2019 at 09:12:23AM CEST, davem@davemloft.net wrote:
>>>From: Jiri Pirko <jiri@resnulli.us>
>>>Date: Fri, 30 Aug 2019 08:36:24 +0200
>>>
>>>> The promiscuity is a way to setup the rx filter. So promics == rx filter
>>>> off. For normal nics, where there is no hw fwd datapath,
>>>> this coincidentally means all received packets go to cpu.
>>>
>>>You cannot convince me that the HW datapath isn't a "rx filter" too, sorry.
>> 
>> If you look at it that way, then we have 2: rx_filter and hw_rx_filter.
>> The point is, those 2 are not one item, that is the point I'm trying to
>> make :/
>
>And you can turn both of them off when I ask for promiscuous mode, that's
>a detail of the device not a semantic issue.

Well, bridge asks for promiscuous mode during enslave -> hw_rx_filter off
When you, want to see all traffic in tcpdump -> rx_filter off

So basically there are 2 flavours of promiscuous mode we have to somehow
distinguish between, so the driver knows what to do.

Nothe that the hw_rx_filter off is not something special to bridge.
There is a usecase for this when no bridge is there, only TC filters for
example.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox