Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next-2.6 1/2] 3c59x: Specify window explicitly for access to windowed registers
From: Ben Hutchings @ 2010-06-23 23:54 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Chase Douglas, Arne Nordmark
In-Reply-To: <1277337161.26161.14.camel@localhost>

Currently much of the code assumes that a specific window has been
selected, while a few functions save and restore the window.  This
makes it impossible to introduce fine-grained locking.

Make those assumptions explicit by introducing wrapper functions
to set the window and read/write a register.  Use these everywhere
except vortex_interrupt(), vortex_start_xmit() and vortex_rx().
Those three functions set the window just once, or not at all in the
case of vortex_rx(), as it should always be called from
vortex_interrupt().

Cache the current window in struct vortex_private to avoid
unnecessary hardware writes.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Tested-by: Arne Nordmark <nordmark@mech.kth.se> [against 2.6.32]
---
 drivers/net/3c59x.c |  288 +++++++++++++++++++++++++--------------------------
 1 files changed, 140 insertions(+), 148 deletions(-)

diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
index d75803e..beddef9 100644
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -435,7 +435,6 @@ MODULE_DEVICE_TABLE(pci, vortex_pci_tbl);
    First the windows.  There are eight register windows, with the command
    and status registers available in each.
    */
-#define EL3WINDOW(win_num) iowrite16(SelectWindow + (win_num), ioaddr + EL3_CMD)
 #define EL3_CMD 0x0e
 #define EL3_STATUS 0x0e
 
@@ -647,8 +646,35 @@ struct vortex_private {
 	u16 io_size;						/* Size of PCI region (for release_region) */
 	spinlock_t lock;					/* Serialise access to device & its vortex_private */
 	struct mii_if_info mii;				/* MII lib hooks/info */
+	int window;					/* Register window */
 };
 
+static void window_set(struct vortex_private *vp, int window)
+{
+	if (window != vp->window) {
+		iowrite16(SelectWindow + window, vp->ioaddr + EL3_CMD);
+		vp->window = window;
+	}
+}
+
+#define DEFINE_WINDOW_IO(size)						\
+static u ## size							\
+window_read ## size(struct vortex_private *vp, int window, int addr)	\
+{									\
+	window_set(vp, window);						\
+	return ioread ## size(vp->ioaddr + addr);			\
+}									\
+static void								\
+window_write ## size(struct vortex_private *vp, u ## size value,	\
+		     int window, int addr)				\
+{									\
+	window_set(vp, window);						\
+	iowrite ## size(value, vp->ioaddr + addr);			\
+}
+DEFINE_WINDOW_IO(8)
+DEFINE_WINDOW_IO(16)
+DEFINE_WINDOW_IO(32)
+
 #ifdef CONFIG_PCI
 #define DEVICE_PCI(dev) (((dev)->bus == &pci_bus_type) ? to_pci_dev((dev)) : NULL)
 #else
@@ -711,7 +737,7 @@ static int vortex_probe1(struct device *gendev, void __iomem *ioaddr, int irq,
 static int vortex_up(struct net_device *dev);
 static void vortex_down(struct net_device *dev, int final);
 static int vortex_open(struct net_device *dev);
-static void mdio_sync(void __iomem *ioaddr, int bits);
+static void mdio_sync(struct vortex_private *vp, int bits);
 static int mdio_read(struct net_device *dev, int phy_id, int location);
 static void mdio_write(struct net_device *vp, int phy_id, int location, int value);
 static void vortex_timer(unsigned long arg);
@@ -1119,6 +1145,7 @@ static int __devinit vortex_probe1(struct device *gendev,
 	vp->has_nway = (vci->drv_flags & HAS_NWAY) ? 1 : 0;
 	vp->io_size = vci->io_size;
 	vp->card_idx = card_idx;
+	vp->window = -1;
 
 	/* module list only for Compaq device */
 	if (gendev == NULL) {
@@ -1205,7 +1232,6 @@ static int __devinit vortex_probe1(struct device *gendev,
 	vp->mii.force_media = vp->full_duplex;
 	vp->options = option;
 	/* Read the station address from the EEPROM. */
-	EL3WINDOW(0);
 	{
 		int base;
 
@@ -1218,14 +1244,15 @@ static int __devinit vortex_probe1(struct device *gendev,
 
 		for (i = 0; i < 0x40; i++) {
 			int timer;
-			iowrite16(base + i, ioaddr + Wn0EepromCmd);
+			window_write16(vp, base + i, 0, Wn0EepromCmd);
 			/* Pause for at least 162 us. for the read to take place. */
 			for (timer = 10; timer >= 0; timer--) {
 				udelay(162);
-				if ((ioread16(ioaddr + Wn0EepromCmd) & 0x8000) == 0)
+				if ((window_read16(vp, 0, Wn0EepromCmd) &
+				     0x8000) == 0)
 					break;
 			}
-			eeprom[i] = ioread16(ioaddr + Wn0EepromData);
+			eeprom[i] = window_read16(vp, 0, Wn0EepromData);
 		}
 	}
 	for (i = 0; i < 0x18; i++)
@@ -1250,9 +1277,8 @@ static int __devinit vortex_probe1(struct device *gendev,
 		pr_err("*** EEPROM MAC address is invalid.\n");
 		goto free_ring;	/* With every pack */
 	}
-	EL3WINDOW(2);
 	for (i = 0; i < 6; i++)
-		iowrite8(dev->dev_addr[i], ioaddr + i);
+		window_write8(vp, dev->dev_addr[i], 2, i);
 
 	if (print_info)
 		pr_cont(", IRQ %d\n", dev->irq);
@@ -1261,8 +1287,7 @@ static int __devinit vortex_probe1(struct device *gendev,
 		pr_warning(" *** Warning: IRQ %d is unlikely to work! ***\n",
 			   dev->irq);
 
-	EL3WINDOW(4);
-	step = (ioread8(ioaddr + Wn4_NetDiag) & 0x1e) >> 1;
+	step = (window_read8(vp, 4, Wn4_NetDiag) & 0x1e) >> 1;
 	if (print_info) {
 		pr_info("  product code %02x%02x rev %02x.%d date %02d-%02d-%02d\n",
 			eeprom[6]&0xff, eeprom[6]>>8, eeprom[0x14],
@@ -1285,17 +1310,15 @@ static int __devinit vortex_probe1(struct device *gendev,
 				(unsigned long long)pci_resource_start(pdev, 2),
 				vp->cb_fn_base);
 		}
-		EL3WINDOW(2);
 
-		n = ioread16(ioaddr + Wn2_ResetOptions) & ~0x4010;
+		n = window_read16(vp, 2, Wn2_ResetOptions) & ~0x4010;
 		if (vp->drv_flags & INVERT_LED_PWR)
 			n |= 0x10;
 		if (vp->drv_flags & INVERT_MII_PWR)
 			n |= 0x4000;
-		iowrite16(n, ioaddr + Wn2_ResetOptions);
+		window_write16(vp, n, 2, Wn2_ResetOptions);
 		if (vp->drv_flags & WNO_XCVR_PWR) {
-			EL3WINDOW(0);
-			iowrite16(0x0800, ioaddr);
+			window_write16(vp, 0x0800, 0, 0);
 		}
 	}
 
@@ -1313,14 +1336,13 @@ static int __devinit vortex_probe1(struct device *gendev,
 	{
 		static const char * const ram_split[] = {"5:3", "3:1", "1:1", "3:5"};
 		unsigned int config;
-		EL3WINDOW(3);
-		vp->available_media = ioread16(ioaddr + Wn3_Options);
+		vp->available_media = window_read16(vp, 3, Wn3_Options);
 		if ((vp->available_media & 0xff) == 0)		/* Broken 3c916 */
 			vp->available_media = 0x40;
-		config = ioread32(ioaddr + Wn3_Config);
+		config = window_read32(vp, 3, Wn3_Config);
 		if (print_info) {
 			pr_debug("  Internal config register is %4.4x, transceivers %#x.\n",
-				config, ioread16(ioaddr + Wn3_Options));
+				config, window_read16(vp, 3, Wn3_Options));
 			pr_info("  %dK %s-wide RAM %s Rx:Tx split, %s%s interface.\n",
 				   8 << RAM_SIZE(config),
 				   RAM_WIDTH(config) ? "word" : "byte",
@@ -1346,7 +1368,6 @@ static int __devinit vortex_probe1(struct device *gendev,
 	if ((vp->available_media & 0x40) || (vci->drv_flags & HAS_NWAY) ||
 		dev->if_port == XCVR_MII || dev->if_port == XCVR_NWAY) {
 		int phy, phy_idx = 0;
-		EL3WINDOW(4);
 		mii_preamble_required++;
 		if (vp->drv_flags & EXTRA_PREAMBLE)
 			mii_preamble_required++;
@@ -1478,18 +1499,17 @@ static void
 vortex_set_duplex(struct net_device *dev)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	void __iomem *ioaddr = vp->ioaddr;
 
 	pr_info("%s:  setting %s-duplex.\n",
 		dev->name, (vp->full_duplex) ? "full" : "half");
 
-	EL3WINDOW(3);
 	/* Set the full-duplex bit. */
-	iowrite16(((vp->info1 & 0x8000) || vp->full_duplex ? 0x20 : 0) |
-		 	(vp->large_frames ? 0x40 : 0) |
-			((vp->full_duplex && vp->flow_ctrl && vp->partner_flow_ctrl) ?
-					0x100 : 0),
-			ioaddr + Wn3_MAC_Ctrl);
+	window_write16(vp,
+		       ((vp->info1 & 0x8000) || vp->full_duplex ? 0x20 : 0) |
+		       (vp->large_frames ? 0x40 : 0) |
+		       ((vp->full_duplex && vp->flow_ctrl && vp->partner_flow_ctrl) ?
+			0x100 : 0),
+		       3, Wn3_MAC_Ctrl);
 }
 
 static void vortex_check_media(struct net_device *dev, unsigned int init)
@@ -1529,8 +1549,7 @@ vortex_up(struct net_device *dev)
 	}
 
 	/* Before initializing select the active media port. */
-	EL3WINDOW(3);
-	config = ioread32(ioaddr + Wn3_Config);
+	config = window_read32(vp, 3, Wn3_Config);
 
 	if (vp->media_override != 7) {
 		pr_info("%s: Media override to transceiver %d (%s).\n",
@@ -1577,10 +1596,9 @@ vortex_up(struct net_device *dev)
 	config = BFINS(config, dev->if_port, 20, 4);
 	if (vortex_debug > 6)
 		pr_debug("vortex_up(): writing 0x%x to InternalConfig\n", config);
-	iowrite32(config, ioaddr + Wn3_Config);
+	window_write32(vp, config, 3, Wn3_Config);
 
 	if (dev->if_port == XCVR_MII || dev->if_port == XCVR_NWAY) {
-		EL3WINDOW(4);
 		mii_reg1 = mdio_read(dev, vp->phys[0], MII_BMSR);
 		mii_reg5 = mdio_read(dev, vp->phys[0], MII_LPA);
 		vp->partner_flow_ctrl = ((mii_reg5 & 0x0400) != 0);
@@ -1601,51 +1619,46 @@ vortex_up(struct net_device *dev)
 	iowrite16(SetStatusEnb | 0x00, ioaddr + EL3_CMD);
 
 	if (vortex_debug > 1) {
-		EL3WINDOW(4);
 		pr_debug("%s: vortex_up() irq %d media status %4.4x.\n",
-			   dev->name, dev->irq, ioread16(ioaddr + Wn4_Media));
+			   dev->name, dev->irq, window_read16(vp, 4, Wn4_Media));
 	}
 
 	/* Set the station address and mask in window 2 each time opened. */
-	EL3WINDOW(2);
 	for (i = 0; i < 6; i++)
-		iowrite8(dev->dev_addr[i], ioaddr + i);
+		window_write8(vp, dev->dev_addr[i], 2, i);
 	for (; i < 12; i+=2)
-		iowrite16(0, ioaddr + i);
+		window_write16(vp, 0, 2, i);
 
 	if (vp->cb_fn_base) {
-		unsigned short n = ioread16(ioaddr + Wn2_ResetOptions) & ~0x4010;
+		unsigned short n = window_read16(vp, 2, Wn2_ResetOptions) & ~0x4010;
 		if (vp->drv_flags & INVERT_LED_PWR)
 			n |= 0x10;
 		if (vp->drv_flags & INVERT_MII_PWR)
 			n |= 0x4000;
-		iowrite16(n, ioaddr + Wn2_ResetOptions);
+		window_write16(vp, n, 2, Wn2_ResetOptions);
 	}
 
 	if (dev->if_port == XCVR_10base2)
 		/* Start the thinnet transceiver. We should really wait 50ms...*/
 		iowrite16(StartCoax, ioaddr + EL3_CMD);
 	if (dev->if_port != XCVR_NWAY) {
-		EL3WINDOW(4);
-		iowrite16((ioread16(ioaddr + Wn4_Media) & ~(Media_10TP|Media_SQE)) |
-			 media_tbl[dev->if_port].media_bits, ioaddr + Wn4_Media);
+		window_write16(vp,
+			       (window_read16(vp, 4, Wn4_Media) &
+				~(Media_10TP|Media_SQE)) |
+			       media_tbl[dev->if_port].media_bits,
+			       4, Wn4_Media);
 	}
 
 	/* Switch to the stats window, and clear all stats by reading. */
 	iowrite16(StatsDisable, ioaddr + EL3_CMD);
-	EL3WINDOW(6);
 	for (i = 0; i < 10; i++)
-		ioread8(ioaddr + i);
-	ioread16(ioaddr + 10);
-	ioread16(ioaddr + 12);
+		window_read8(vp, 6, i);
+	window_read16(vp, 6, 10);
+	window_read16(vp, 6, 12);
 	/* New: On the Vortex we must also clear the BadSSD counter. */
-	EL3WINDOW(4);
-	ioread8(ioaddr + 12);
+	window_read8(vp, 4, 12);
 	/* ..and on the Boomerang we enable the extra statistics bits. */
-	iowrite16(0x0040, ioaddr + Wn4_NetDiag);
-
-	/* Switch to register set 7 for normal use. */
-	EL3WINDOW(7);
+	window_write16(vp, 0x0040, 4, Wn4_NetDiag);
 
 	if (vp->full_bus_master_rx) { /* Boomerang bus master. */
 		vp->cur_rx = vp->dirty_rx = 0;
@@ -1763,7 +1776,7 @@ vortex_timer(unsigned long data)
 	void __iomem *ioaddr = vp->ioaddr;
 	int next_tick = 60*HZ;
 	int ok = 0;
-	int media_status, old_window;
+	int media_status;
 
 	if (vortex_debug > 2) {
 		pr_debug("%s: Media selection timer tick happened, %s.\n",
@@ -1772,9 +1785,7 @@ vortex_timer(unsigned long data)
 	}
 
 	disable_irq_lockdep(dev->irq);
-	old_window = ioread16(ioaddr + EL3_CMD) >> 13;
-	EL3WINDOW(4);
-	media_status = ioread16(ioaddr + Wn4_Media);
+	media_status = window_read16(vp, 4, Wn4_Media);
 	switch (dev->if_port) {
 	case XCVR_10baseT:  case XCVR_100baseTx:  case XCVR_100baseFx:
 		if (media_status & Media_LnkBeat) {
@@ -1830,13 +1841,14 @@ vortex_timer(unsigned long data)
 					   dev->name, media_tbl[dev->if_port].name);
 			next_tick = media_tbl[dev->if_port].wait;
 		}
-		iowrite16((media_status & ~(Media_10TP|Media_SQE)) |
-			 media_tbl[dev->if_port].media_bits, ioaddr + Wn4_Media);
+		window_write16(vp,
+			       (media_status & ~(Media_10TP|Media_SQE)) |
+			       media_tbl[dev->if_port].media_bits,
+			       4, Wn4_Media);
 
-		EL3WINDOW(3);
-		config = ioread32(ioaddr + Wn3_Config);
+		config = window_read32(vp, 3, Wn3_Config);
 		config = BFINS(config, dev->if_port, 20, 4);
-		iowrite32(config, ioaddr + Wn3_Config);
+		window_write32(vp, config, 3, Wn3_Config);
 
 		iowrite16(dev->if_port == XCVR_10base2 ? StartCoax : StopCoax,
 			 ioaddr + EL3_CMD);
@@ -1850,7 +1862,6 @@ leave_media_alone:
 	  pr_debug("%s: Media selection timer finished, %s.\n",
 			 dev->name, media_tbl[dev->if_port].name);
 
-	EL3WINDOW(old_window);
 	enable_irq_lockdep(dev->irq);
 	mod_timer(&vp->timer, RUN_AT(next_tick));
 	if (vp->deferred)
@@ -1865,12 +1876,11 @@ static void vortex_tx_timeout(struct net_device *dev)
 	pr_err("%s: transmit timed out, tx_status %2.2x status %4.4x.\n",
 		   dev->name, ioread8(ioaddr + TxStatus),
 		   ioread16(ioaddr + EL3_STATUS));
-	EL3WINDOW(4);
 	pr_err("  diagnostics: net %04x media %04x dma %08x fifo %04x\n",
-			ioread16(ioaddr + Wn4_NetDiag),
-			ioread16(ioaddr + Wn4_Media),
+			window_read16(vp, 4, Wn4_NetDiag),
+			window_read16(vp, 4, Wn4_Media),
 			ioread32(ioaddr + PktStatus),
-			ioread16(ioaddr + Wn4_FIFODiag));
+			window_read16(vp, 4, Wn4_FIFODiag));
 	/* Slight code bloat to be user friendly. */
 	if ((ioread8(ioaddr + TxStatus) & 0x88) == 0x88)
 		pr_err("%s: Transmitter encountered 16 collisions --"
@@ -1917,9 +1927,6 @@ static void vortex_tx_timeout(struct net_device *dev)
 	/* Issue Tx Enable */
 	iowrite16(TxEnable, ioaddr + EL3_CMD);
 	dev->trans_start = jiffies; /* prevent tx timeout */
-
-	/* Switch to register set 7 for normal use. */
-	EL3WINDOW(7);
 }
 
 /*
@@ -1980,10 +1987,10 @@ vortex_error(struct net_device *dev, int status)
 			ioread16(ioaddr + EL3_STATUS) & StatsFull) {
 			pr_warning("%s: Updating statistics failed, disabling "
 				   "stats as an interrupt source.\n", dev->name);
-			EL3WINDOW(5);
-			iowrite16(SetIntrEnb | (ioread16(ioaddr + 10) & ~StatsFull), ioaddr + EL3_CMD);
+			iowrite16(SetIntrEnb |
+				  (window_read16(vp, 5, 10) & ~StatsFull),
+				  ioaddr + EL3_CMD);
 			vp->intr_enable &= ~StatsFull;
-			EL3WINDOW(7);
 			DoneDidThat++;
 		}
 	}
@@ -1993,8 +2000,7 @@ vortex_error(struct net_device *dev, int status)
 	}
 	if (status & HostError) {
 		u16 fifo_diag;
-		EL3WINDOW(4);
-		fifo_diag = ioread16(ioaddr + Wn4_FIFODiag);
+		fifo_diag = window_read16(vp, 4, Wn4_FIFODiag);
 		pr_err("%s: Host error, FIFO diagnostic register %4.4x.\n",
 			   dev->name, fifo_diag);
 		/* Adapter failure requires Tx/Rx reset and reinit. */
@@ -2043,8 +2049,10 @@ vortex_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (vp->bus_master) {
 		/* Set the bus-master controller to transfer the packet. */
 		int len = (skb->len + 3) & ~3;
-		iowrite32(vp->tx_skb_dma = pci_map_single(VORTEX_PCI(vp), skb->data, len, PCI_DMA_TODEVICE),
-				ioaddr + Wn7_MasterAddr);
+		vp->tx_skb_dma = pci_map_single(VORTEX_PCI(vp), skb->data, len,
+						PCI_DMA_TODEVICE);
+		window_set(vp, 7);
+		iowrite32(vp->tx_skb_dma, ioaddr + Wn7_MasterAddr);
 		iowrite16(len, ioaddr + Wn7_MasterLen);
 		vp->tx_skb = skb;
 		iowrite16(StartDMADown, ioaddr + EL3_CMD);
@@ -2217,6 +2225,8 @@ vortex_interrupt(int irq, void *dev_id)
 		pr_debug("%s: interrupt, status %4.4x, latency %d ticks.\n",
 			   dev->name, status, ioread8(ioaddr + Timer));
 
+	window_set(vp, 7);
+
 	do {
 		if (vortex_debug > 5)
 				pr_debug("%s: In interrupt loop, status %4.4x.\n",
@@ -2760,54 +2770,46 @@ static struct net_device_stats *vortex_get_stats(struct net_device *dev)
 static void update_stats(void __iomem *ioaddr, struct net_device *dev)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	int old_window = ioread16(ioaddr + EL3_CMD);
 
-	if (old_window == 0xffff)	/* Chip suspended or ejected. */
-		return;
 	/* Unlike the 3c5x9 we need not turn off stats updates while reading. */
 	/* Switch to the stats window, and read everything. */
-	EL3WINDOW(6);
-	dev->stats.tx_carrier_errors		+= ioread8(ioaddr + 0);
-	dev->stats.tx_heartbeat_errors		+= ioread8(ioaddr + 1);
-	dev->stats.tx_window_errors		+= ioread8(ioaddr + 4);
-	dev->stats.rx_fifo_errors		+= ioread8(ioaddr + 5);
-	dev->stats.tx_packets			+= ioread8(ioaddr + 6);
-	dev->stats.tx_packets			+= (ioread8(ioaddr + 9)&0x30) << 4;
-	/* Rx packets	*/			ioread8(ioaddr + 7);   /* Must read to clear */
+	dev->stats.tx_carrier_errors		+= window_read8(vp, 6, 0);
+	dev->stats.tx_heartbeat_errors		+= window_read8(vp, 6, 1);
+	dev->stats.tx_window_errors		+= window_read8(vp, 6, 4);
+	dev->stats.rx_fifo_errors		+= window_read8(vp, 6, 5);
+	dev->stats.tx_packets			+= window_read8(vp, 6, 6);
+	dev->stats.tx_packets			+= (window_read8(vp, 6, 9) &
+						    0x30) << 4;
+	/* Rx packets	*/			window_read8(vp, 6, 7);   /* Must read to clear */
 	/* Don't bother with register 9, an extension of registers 6&7.
 	   If we do use the 6&7 values the atomic update assumption above
 	   is invalid. */
-	dev->stats.rx_bytes 			+= ioread16(ioaddr + 10);
-	dev->stats.tx_bytes 			+= ioread16(ioaddr + 12);
+	dev->stats.rx_bytes 			+= window_read16(vp, 6, 10);
+	dev->stats.tx_bytes 			+= window_read16(vp, 6, 12);
 	/* Extra stats for get_ethtool_stats() */
-	vp->xstats.tx_multiple_collisions	+= ioread8(ioaddr + 2);
-	vp->xstats.tx_single_collisions         += ioread8(ioaddr + 3);
-	vp->xstats.tx_deferred			+= ioread8(ioaddr + 8);
-	EL3WINDOW(4);
-	vp->xstats.rx_bad_ssd			+= ioread8(ioaddr + 12);
+	vp->xstats.tx_multiple_collisions	+= window_read8(vp, 6, 2);
+	vp->xstats.tx_single_collisions         += window_read8(vp, 6, 3);
+	vp->xstats.tx_deferred			+= window_read8(vp, 6, 8);
+	vp->xstats.rx_bad_ssd			+= window_read8(vp, 4, 12);
 
 	dev->stats.collisions = vp->xstats.tx_multiple_collisions
 		+ vp->xstats.tx_single_collisions
 		+ vp->xstats.tx_max_collisions;
 
 	{
-		u8 up = ioread8(ioaddr + 13);
+		u8 up = window_read8(vp, 4, 13);
 		dev->stats.rx_bytes += (up & 0x0f) << 16;
 		dev->stats.tx_bytes += (up & 0xf0) << 12;
 	}
-
-	EL3WINDOW(old_window >> 13);
 }
 
 static int vortex_nway_reset(struct net_device *dev)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	void __iomem *ioaddr = vp->ioaddr;
 	unsigned long flags;
 	int rc;
 
 	spin_lock_irqsave(&vp->lock, flags);
-	EL3WINDOW(4);
 	rc = mii_nway_restart(&vp->mii);
 	spin_unlock_irqrestore(&vp->lock, flags);
 	return rc;
@@ -2816,12 +2818,10 @@ static int vortex_nway_reset(struct net_device *dev)
 static int vortex_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	void __iomem *ioaddr = vp->ioaddr;
 	unsigned long flags;
 	int rc;
 
 	spin_lock_irqsave(&vp->lock, flags);
-	EL3WINDOW(4);
 	rc = mii_ethtool_gset(&vp->mii, cmd);
 	spin_unlock_irqrestore(&vp->lock, flags);
 	return rc;
@@ -2830,12 +2830,10 @@ static int vortex_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 static int vortex_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	void __iomem *ioaddr = vp->ioaddr;
 	unsigned long flags;
 	int rc;
 
 	spin_lock_irqsave(&vp->lock, flags);
-	EL3WINDOW(4);
 	rc = mii_ethtool_sset(&vp->mii, cmd);
 	spin_unlock_irqrestore(&vp->lock, flags);
 	return rc;
@@ -2930,7 +2928,6 @@ static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	int err;
 	struct vortex_private *vp = netdev_priv(dev);
-	void __iomem *ioaddr = vp->ioaddr;
 	unsigned long flags;
 	pci_power_t state = 0;
 
@@ -2942,7 +2939,6 @@ static int vortex_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if(state != 0)
 		pci_set_power_state(VORTEX_PCI(vp), PCI_D0);
 	spin_lock_irqsave(&vp->lock, flags);
-	EL3WINDOW(4);
 	err = generic_mii_ioctl(&vp->mii, if_mii(rq), cmd, NULL);
 	spin_unlock_irqrestore(&vp->lock, flags);
 	if(state != 0)
@@ -2985,8 +2981,6 @@ static void set_rx_mode(struct net_device *dev)
 static void set_8021q_mode(struct net_device *dev, int enable)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	void __iomem *ioaddr = vp->ioaddr;
-	int old_window = ioread16(ioaddr + EL3_CMD);
 	int mac_ctrl;
 
 	if ((vp->drv_flags&IS_CYCLONE) || (vp->drv_flags&IS_TORNADO)) {
@@ -2997,28 +2991,23 @@ static void set_8021q_mode(struct net_device *dev, int enable)
 		if (enable)
 			max_pkt_size += 4;	/* 802.1Q VLAN tag */
 
-		EL3WINDOW(3);
-		iowrite16(max_pkt_size, ioaddr+Wn3_MaxPktSize);
+		window_write16(vp, max_pkt_size, 3, Wn3_MaxPktSize);
 
 		/* set VlanEtherType to let the hardware checksumming
 		   treat tagged frames correctly */
-		EL3WINDOW(7);
-		iowrite16(VLAN_ETHER_TYPE, ioaddr+Wn7_VlanEtherType);
+		window_write16(vp, VLAN_ETHER_TYPE, 7, Wn7_VlanEtherType);
 	} else {
 		/* on older cards we have to enable large frames */
 
 		vp->large_frames = dev->mtu > 1500 || enable;
 
-		EL3WINDOW(3);
-		mac_ctrl = ioread16(ioaddr+Wn3_MAC_Ctrl);
+		mac_ctrl = window_read16(vp, 3, Wn3_MAC_Ctrl);
 		if (vp->large_frames)
 			mac_ctrl |= 0x40;
 		else
 			mac_ctrl &= ~0x40;
-		iowrite16(mac_ctrl, ioaddr+Wn3_MAC_Ctrl);
+		window_write16(vp, mac_ctrl, 3, Wn3_MAC_Ctrl);
 	}
-
-	EL3WINDOW(old_window);
 }
 #else
 
@@ -3037,7 +3026,10 @@ static void set_8021q_mode(struct net_device *dev, int enable)
 /* The maximum data clock rate is 2.5 Mhz.  The minimum timing is usually
    met by back-to-back PCI I/O cycles, but we insert a delay to avoid
    "overclocking" issues. */
-#define mdio_delay() ioread32(mdio_addr)
+static void mdio_delay(struct vortex_private *vp)
+{
+	window_read32(vp, 4, Wn4_PhysicalMgmt);
+}
 
 #define MDIO_SHIFT_CLK	0x01
 #define MDIO_DIR_WRITE	0x04
@@ -3048,16 +3040,15 @@ static void set_8021q_mode(struct net_device *dev, int enable)
 
 /* Generate the preamble required for initial synchronization and
    a few older transceivers. */
-static void mdio_sync(void __iomem *ioaddr, int bits)
+static void mdio_sync(struct vortex_private *vp, int bits)
 {
-	void __iomem *mdio_addr = ioaddr + Wn4_PhysicalMgmt;
-
 	/* Establish sync by sending at least 32 logic ones. */
 	while (-- bits >= 0) {
-		iowrite16(MDIO_DATA_WRITE1, mdio_addr);
-		mdio_delay();
-		iowrite16(MDIO_DATA_WRITE1 | MDIO_SHIFT_CLK, mdio_addr);
-		mdio_delay();
+		window_write16(vp, MDIO_DATA_WRITE1, 4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
+		window_write16(vp, MDIO_DATA_WRITE1 | MDIO_SHIFT_CLK,
+			       4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
 	}
 }
 
@@ -3065,29 +3056,31 @@ static int mdio_read(struct net_device *dev, int phy_id, int location)
 {
 	int i;
 	struct vortex_private *vp = netdev_priv(dev);
-	void __iomem *ioaddr = vp->ioaddr;
 	int read_cmd = (0xf6 << 10) | (phy_id << 5) | location;
 	unsigned int retval = 0;
-	void __iomem *mdio_addr = ioaddr + Wn4_PhysicalMgmt;
 
 	if (mii_preamble_required)
-		mdio_sync(ioaddr, 32);
+		mdio_sync(vp, 32);
 
 	/* Shift the read command bits out. */
 	for (i = 14; i >= 0; i--) {
 		int dataval = (read_cmd&(1<<i)) ? MDIO_DATA_WRITE1 : MDIO_DATA_WRITE0;
-		iowrite16(dataval, mdio_addr);
-		mdio_delay();
-		iowrite16(dataval | MDIO_SHIFT_CLK, mdio_addr);
-		mdio_delay();
+		window_write16(vp, dataval, 4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
+		window_write16(vp, dataval | MDIO_SHIFT_CLK,
+			       4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
 	}
 	/* Read the two transition, 16 data, and wire-idle bits. */
 	for (i = 19; i > 0; i--) {
-		iowrite16(MDIO_ENB_IN, mdio_addr);
-		mdio_delay();
-		retval = (retval << 1) | ((ioread16(mdio_addr) & MDIO_DATA_READ) ? 1 : 0);
-		iowrite16(MDIO_ENB_IN | MDIO_SHIFT_CLK, mdio_addr);
-		mdio_delay();
+		window_write16(vp, MDIO_ENB_IN, 4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
+		retval = (retval << 1) |
+			((window_read16(vp, 4, Wn4_PhysicalMgmt) &
+			  MDIO_DATA_READ) ? 1 : 0);
+		window_write16(vp, MDIO_ENB_IN | MDIO_SHIFT_CLK,
+			       4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
 	}
 	return retval & 0x20000 ? 0xffff : retval>>1 & 0xffff;
 }
@@ -3095,28 +3088,28 @@ static int mdio_read(struct net_device *dev, int phy_id, int location)
 static void mdio_write(struct net_device *dev, int phy_id, int location, int value)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	void __iomem *ioaddr = vp->ioaddr;
 	int write_cmd = 0x50020000 | (phy_id << 23) | (location << 18) | value;
-	void __iomem *mdio_addr = ioaddr + Wn4_PhysicalMgmt;
 	int i;
 
 	if (mii_preamble_required)
-		mdio_sync(ioaddr, 32);
+		mdio_sync(vp, 32);
 
 	/* Shift the command bits out. */
 	for (i = 31; i >= 0; i--) {
 		int dataval = (write_cmd&(1<<i)) ? MDIO_DATA_WRITE1 : MDIO_DATA_WRITE0;
-		iowrite16(dataval, mdio_addr);
-		mdio_delay();
-		iowrite16(dataval | MDIO_SHIFT_CLK, mdio_addr);
-		mdio_delay();
+		window_write16(vp, dataval, 4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
+		window_write16(vp, dataval | MDIO_SHIFT_CLK,
+			       4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
 	}
 	/* Leave the interface idle. */
 	for (i = 1; i >= 0; i--) {
-		iowrite16(MDIO_ENB_IN, mdio_addr);
-		mdio_delay();
-		iowrite16(MDIO_ENB_IN | MDIO_SHIFT_CLK, mdio_addr);
-		mdio_delay();
+		window_write16(vp, MDIO_ENB_IN, 4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
+		window_write16(vp, MDIO_ENB_IN | MDIO_SHIFT_CLK,
+			       4, Wn4_PhysicalMgmt);
+		mdio_delay(vp);
 	}
 }
 
@@ -3131,8 +3124,7 @@ static void acpi_set_WOL(struct net_device *dev)
 
 	if (vp->enable_wol) {
 		/* Power up on: 1==Downloaded Filter, 2==Magic Packets, 4==Link Status. */
-		EL3WINDOW(7);
-		iowrite16(2, ioaddr + 0x0c);
+		window_write16(vp, 2, 7, 0x0c);
 		/* The RxFilter must accept the WOL frames. */
 		iowrite16(SetRxFilter|RxStation|RxMulticast|RxBroadcast, ioaddr + EL3_CMD);
 		iowrite16(RxEnable, ioaddr + EL3_CMD);
-- 
1.7.1




^ permalink raw reply related

* [PATCH net-next-2.6 2/2] 3c59x: Use fine-grained locks for MII and windowed register access
From: Ben Hutchings @ 2010-06-23 23:55 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Chase Douglas, Arne Nordmark
In-Reply-To: <1277337161.26161.14.camel@localhost>

This avoids scheduling in atomic context and also means that IRQs
will only be deferred for relatively short periods of time.

Previously discussed in:
http://article.gmane.org/gmane.linux.network/155024

Reported-by: Arne Nordmark <nordmark@mech.kth.se>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Tested-by: Arne Nordmark <nordmark@mech.kth.se> [against 2.6.32]
---
 drivers/net/3c59x.c |   66 ++++++++++++++++++++++++++++++---------------------
 1 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
index beddef9..f4a3fb1 100644
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -644,9 +644,15 @@ struct vortex_private {
 	u16 deferred;						/* Resend these interrupts when we
 										 * bale from the ISR */
 	u16 io_size;						/* Size of PCI region (for release_region) */
-	spinlock_t lock;					/* Serialise access to device & its vortex_private */
-	struct mii_if_info mii;				/* MII lib hooks/info */
-	int window;					/* Register window */
+
+	/* Serialises access to hardware other than MII and variables below.
+	 * The lock hierarchy is rtnl_lock > lock > mii_lock > window_lock. */
+	spinlock_t lock;
+
+	spinlock_t mii_lock;		/* Serialises access to MII */
+	struct mii_if_info mii;		/* MII lib hooks/info */
+	spinlock_t window_lock;		/* Serialises access to windowed regs */
+	int window;			/* Register window */
 };
 
 static void window_set(struct vortex_private *vp, int window)
@@ -661,15 +667,23 @@ static void window_set(struct vortex_private *vp, int window)
 static u ## size							\
 window_read ## size(struct vortex_private *vp, int window, int addr)	\
 {									\
+	unsigned long flags;						\
+	u ## size ret;							\
+	spin_lock_irqsave(&vp->window_lock, flags);			\
 	window_set(vp, window);						\
-	return ioread ## size(vp->ioaddr + addr);			\
+	ret = ioread ## size(vp->ioaddr + addr);			\
+	spin_unlock_irqrestore(&vp->window_lock, flags);		\
+	return ret;							\
 }									\
 static void								\
 window_write ## size(struct vortex_private *vp, u ## size value,	\
 		     int window, int addr)				\
 {									\
+	unsigned long flags;						\
+	spin_lock_irqsave(&vp->window_lock, flags);			\
 	window_set(vp, window);						\
 	iowrite ## size(value, vp->ioaddr + addr);			\
+	spin_unlock_irqrestore(&vp->window_lock, flags);		\
 }
 DEFINE_WINDOW_IO(8)
 DEFINE_WINDOW_IO(16)
@@ -1784,7 +1798,6 @@ vortex_timer(unsigned long data)
 		pr_debug("dev->watchdog_timeo=%d\n", dev->watchdog_timeo);
 	}
 
-	disable_irq_lockdep(dev->irq);
 	media_status = window_read16(vp, 4, Wn4_Media);
 	switch (dev->if_port) {
 	case XCVR_10baseT:  case XCVR_100baseTx:  case XCVR_100baseFx:
@@ -1805,10 +1818,7 @@ vortex_timer(unsigned long data)
 	case XCVR_MII: case XCVR_NWAY:
 		{
 			ok = 1;
-			/* Interrupts are already disabled */
-			spin_lock(&vp->lock);
 			vortex_check_media(dev, 0);
-			spin_unlock(&vp->lock);
 		}
 		break;
 	  default:					/* Other media types handled by Tx timeouts. */
@@ -1827,6 +1837,8 @@ vortex_timer(unsigned long data)
 	if (!ok) {
 		unsigned int config;
 
+		spin_lock_irq(&vp->lock);
+
 		do {
 			dev->if_port = media_tbl[dev->if_port].next;
 		} while ( ! (vp->available_media & media_tbl[dev->if_port].mask));
@@ -1855,6 +1867,8 @@ vortex_timer(unsigned long data)
 		if (vortex_debug > 1)
 			pr_debug("wrote 0x%08x to Wn3_Config\n", config);
 		/* AKPM: FIXME: Should reset Rx & Tx here.  P60 of 3c90xc.pdf */
+
+		spin_unlock_irq(&vp->lock);
 	}
 
 leave_media_alone:
@@ -1862,7 +1876,6 @@ leave_media_alone:
 	  pr_debug("%s: Media selection timer finished, %s.\n",
 			 dev->name, media_tbl[dev->if_port].name);
 
-	enable_irq_lockdep(dev->irq);
 	mod_timer(&vp->timer, RUN_AT(next_tick));
 	if (vp->deferred)
 		iowrite16(FakeIntr, ioaddr + EL3_CMD);
@@ -2051,9 +2064,11 @@ vortex_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		int len = (skb->len + 3) & ~3;
 		vp->tx_skb_dma = pci_map_single(VORTEX_PCI(vp), skb->data, len,
 						PCI_DMA_TODEVICE);
+		spin_lock_irq(&vp->window_lock);
 		window_set(vp, 7);
 		iowrite32(vp->tx_skb_dma, ioaddr + Wn7_MasterAddr);
 		iowrite16(len, ioaddr + Wn7_MasterLen);
+		spin_unlock_irq(&vp->window_lock);
 		vp->tx_skb = skb;
 		iowrite16(StartDMADown, ioaddr + EL3_CMD);
 		/* netif_wake_queue() will be called at the DMADone interrupt. */
@@ -2225,6 +2240,7 @@ vortex_interrupt(int irq, void *dev_id)
 		pr_debug("%s: interrupt, status %4.4x, latency %d ticks.\n",
 			   dev->name, status, ioread8(ioaddr + Timer));
 
+	spin_lock(&vp->window_lock);
 	window_set(vp, 7);
 
 	do {
@@ -2285,6 +2301,8 @@ vortex_interrupt(int irq, void *dev_id)
 		iowrite16(AckIntr | IntReq | IntLatch, ioaddr + EL3_CMD);
 	} while ((status = ioread16(ioaddr + EL3_STATUS)) & (IntLatch | RxComplete));
 
+	spin_unlock(&vp->window_lock);
+
 	if (vortex_debug > 4)
 		pr_debug("%s: exiting interrupt, status %4.4x.\n",
 			   dev->name, status);
@@ -2806,37 +2824,22 @@ static void update_stats(void __iomem *ioaddr, struct net_device *dev)
 static int vortex_nway_reset(struct net_device *dev)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	unsigned long flags;
-	int rc;
 
-	spin_lock_irqsave(&vp->lock, flags);
-	rc = mii_nway_restart(&vp->mii);
-	spin_unlock_irqrestore(&vp->lock, flags);
-	return rc;
+	return mii_nway_restart(&vp->mii);
 }
 
 static int vortex_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	unsigned long flags;
-	int rc;
 
-	spin_lock_irqsave(&vp->lock, flags);
-	rc = mii_ethtool_gset(&vp->mii, cmd);
-	spin_unlock_irqrestore(&vp->lock, flags);
-	return rc;
+	return mii_ethtool_gset(&vp->mii, cmd);
 }
 
 static int vortex_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-	unsigned long flags;
-	int rc;
 
-	spin_lock_irqsave(&vp->lock, flags);
-	rc = mii_ethtool_sset(&vp->mii, cmd);
-	spin_unlock_irqrestore(&vp->lock, flags);
-	return rc;
+	return mii_ethtool_sset(&vp->mii, cmd);
 }
 
 static u32 vortex_get_msglevel(struct net_device *dev)
@@ -3059,6 +3062,8 @@ static int mdio_read(struct net_device *dev, int phy_id, int location)
 	int read_cmd = (0xf6 << 10) | (phy_id << 5) | location;
 	unsigned int retval = 0;
 
+	spin_lock_bh(&vp->mii_lock);
+
 	if (mii_preamble_required)
 		mdio_sync(vp, 32);
 
@@ -3082,6 +3087,9 @@ static int mdio_read(struct net_device *dev, int phy_id, int location)
 			       4, Wn4_PhysicalMgmt);
 		mdio_delay(vp);
 	}
+
+	spin_unlock_bh(&vp->mii_lock);
+
 	return retval & 0x20000 ? 0xffff : retval>>1 & 0xffff;
 }
 
@@ -3091,6 +3099,8 @@ static void mdio_write(struct net_device *dev, int phy_id, int location, int val
 	int write_cmd = 0x50020000 | (phy_id << 23) | (location << 18) | value;
 	int i;
 
+	spin_lock_bh(&vp->mii_lock);
+
 	if (mii_preamble_required)
 		mdio_sync(vp, 32);
 
@@ -3111,6 +3121,8 @@ static void mdio_write(struct net_device *dev, int phy_id, int location, int val
 			       4, Wn4_PhysicalMgmt);
 		mdio_delay(vp);
 	}
+
+	spin_unlock_bh(&vp->mii_lock);
 }
 
 /* ACPI: Advanced Configuration and Power Interface. */
-- 
1.7.1



^ permalink raw reply related

* Generic receive offload with igb driver breaks IPv6
From: Sam Cannell @ 2010-06-24  0:24 UTC (permalink / raw)
  To: netdev, e1000g-devel

[-- Attachment #1: Type: text/plain, Size: 3178 bytes --]

Hi,

We've recently been having some speed issues through one of our routers
(running Ubuntu's 2.6.32-22-server kernel)

It seems that fragmented TCPv6 packets entering the router through an
Intel 82575EB interface (igb driver) are reassembled by GRO.  The
resulting packet, being larger than 1500 bytes, is unable to be
forwarded out the outgoing interface.  The router sends back a 'packet
too big' error, which means nothing to the sender because as far as it's
concerned, every packet is smaller than the MTU.

This slows down HTTP across a 100mbps network to around 10kbps -- the
speeds return to normal when gro is disabled on the router's receiving
interface.



For instance:


[Capture from sender]
No.     Time        Source                Destination           Protocol
Info
      1 0.000000    2404:130:80:10::10
2404:130:0:1000:227:eff:fe0b:4918 TCP      [TCP segment of a reassembled
PDU]

Frame 1 (1514 bytes on wire, 1514 bytes captured)
Ethernet II, Src: Xensourc_65:30:fe (00:16:3e:65:30:fe), Dst:
Xensourc_f6:80:47 (00:16:3e:f6:80:47)
Internet Protocol Version 6
Transmission Control Protocol, Src Port: http (80), Dst Port: 51793
(51793), Seq: 1, Ack: 1, Len: 1428

No.     Time        Source                Destination           Protocol
Info
      2 0.000000    2404:130:80:10::10
2404:130:0:1000:227:eff:fe0b:4918 TCP      [TCP segment of a reassembled
PDU]

Frame 2 (1514 bytes on wire, 1514 bytes captured)
Ethernet II, Src: Xensourc_65:30:fe (00:16:3e:65:30:fe), Dst:
Xensourc_f6:80:47 (00:16:3e:f6:80:47)
Internet Protocol Version 6
Transmission Control Protocol, Src Port: http (80), Dst Port: 51793
(51793), Seq: 1429, Ack: 1, Len: 1428

No.     Time        Source                Destination           Protocol
Info
      3 0.000000    2404:130:0:10::1      2404:130:80:10::10    ICMPv6
Too big

Frame 3 (1294 bytes on wire, 1294 bytes captured)
Ethernet II, Src: Xensourc_f6:80:47 (00:16:3e:f6:80:47), Dst:
Xensourc_65:30:fe (00:16:3e:65:30:fe)
Internet Protocol Version 6
Internet Control Message Protocol v6



[Capture from router]
No.     Time        Source                Destination           Protocol
Info
      1 0.000000    2404:130:80:10::10
2404:130:0:1000:227:eff:fe0b:4918 TCP      [TCP segment of a reassembled
PDU]

Frame 1 (2942 bytes on wire, 2942 bytes captured)
Ethernet II, Src: Xensourc_f6:80:46 (00:16:3e:f6:80:46), Dst:
SunMicro_40:43:33 (00:21:28:40:43:33)
Internet Protocol Version 6
Transmission Control Protocol, Src Port: http (80), Dst Port: 51793
(51793), Seq: 1, Ack: 1, Len: 2856

No.     Time        Source                Destination           Protocol
Info
      2 0.000035    2404:130:0:10::1      2404:130:80:10::10    ICMPv6
Too big

Frame 2 (1294 bytes on wire, 1294 bytes captured)
Ethernet II, Src: SunMicro_40:43:33 (00:21:28:40:43:33), Dst:
Xensourc_f6:80:46 (00:16:3e:f6:80:46)
Internet Protocol Version 6
Internet Control Message Protocol v6



Any thoughts?  I've also posted this to both net-dev and e1000-devel as
I'm not sure whether it's driver-specific or something to do with gro
itself.

Thanks,

Sam

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 194 bytes --]

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/5] sfc: Implement message level control
From: David Miller @ 2010-06-24  1:29 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev, linux-net-drivers
In-Reply-To: <1277328607.2101.9.camel@achroite.uk.solarflarecom.com>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 23 Jun 2010 22:30:07 +0100

> @@ -850,6 +866,8 @@ const struct ethtool_ops efx_ethtool_ops = {
>  	.get_drvinfo		= efx_ethtool_get_drvinfo,
>  	.get_regs_len		= efx_ethtool_get_regs_len,
>  	.get_regs		= efx_ethtool_get_regs,
> +	.get_msglevel		= efx_ethtool_get_msglevel,
> +	.set_msglevel		= efx_ethtool_set_msglevel,
>  	.nway_reset		= efx_ethtool_nway_reset,
>  	.get_link		= efx_ethtool_get_link,
>  	.get_eeprom_len		= efx_ethtool_get_eeprom_len,

davem@sunset:~/src/GIT/net-next-2.6$ egrep get_regs_len drivers/net/sfc/ethtool.c
davem@sunset:~/src/GIT/net-next-2.6$ 

I don't know what tree you're patching this against, but neither net-2.6 nor
net-next-2.6 have the context you have here in your patch so these patches
do not apply at all.

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/5] sfc: Implement message level control
From: Ben Hutchings @ 2010-06-24  3:12 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <20100623.182920.28799851.davem@davemloft.net>

On Wed, 2010-06-23 at 18:29 -0700, David Miller wrote:
> From: Ben Hutchings <bhutchings@solarflare.com>
> Date: Wed, 23 Jun 2010 22:30:07 +0100
> 
> > @@ -850,6 +866,8 @@ const struct ethtool_ops efx_ethtool_ops = {
> >  	.get_drvinfo		= efx_ethtool_get_drvinfo,
> >  	.get_regs_len		= efx_ethtool_get_regs_len,
> >  	.get_regs		= efx_ethtool_get_regs,
> > +	.get_msglevel		= efx_ethtool_get_msglevel,
> > +	.set_msglevel		= efx_ethtool_set_msglevel,
> >  	.nway_reset		= efx_ethtool_nway_reset,
> >  	.get_link		= efx_ethtool_get_link,
> >  	.get_eeprom_len		= efx_ethtool_get_eeprom_len,
> 
> davem@sunset:~/src/GIT/net-next-2.6$ egrep get_regs_len drivers/net/sfc/ethtool.c
> davem@sunset:~/src/GIT/net-next-2.6$ 
> 
> I don't know what tree you're patching this against, but neither net-2.6 nor
> net-next-2.6 have the context you have here in your patch so these patches
> do not apply at all.

I'm hoping you're going to apply
<http://patchwork.ozlabs.org/patch/56308> first.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* [PATCH] xfrm: check bundle policy existance before dereferencing it
From: Timo Teräs @ 2010-06-24  5:45 UTC (permalink / raw)
  To: netdev, Justin P. Mattock, Eric Dumazet, John W.Linville,
	Linux Kernel Mailing List <l
  Cc: Timo Teräs
In-Reply-To: <4C22805A.3080307@gmail.com>

Fix the bundle validation code to not assume having a valid policy.
When we have multiple transformations for a xfrm policy, the bundle
instance will be a chain of bundles with only the first one having
the policy reference. When policy_genid is bumped it will expire the
first bundle in the chain which is equivalent of expiring the whole
chain.

Reported-bisected-and-tested-by: Justin P. Mattock <justinmattock@gmail.com>
Signed-off-by: Timo Teräs <timo.teras@iki.fi>
---
 net/xfrm/xfrm_policy.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 4bf27d9..af1c173 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2300,7 +2300,8 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 			return 0;
 		if (xdst->xfrm_genid != dst->xfrm->genid)
 			return 0;
-		if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
+		if (xdst->num_pols > 0 &&
+		    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
 			return 0;
 
 		if (strict && fl &&
-- 
1.7.0.4

^ permalink raw reply related

* Re: [PATCH 1/2] syncookies: do not store rcv_wscale in tcp timestamp
From: Florian Westphal @ 2010-06-24  7:53 UTC (permalink / raw)
  To: Hagen Paul Pfeifer; +Cc: Florian Westphal, netdev, Ilpo Järvinen
In-Reply-To: <20100623202759.GA3581@nuttenaction>

Hagen Paul Pfeifer <hagen@jauu.net> wrote:
> * Florian Westphal | 2010-06-21 23:48:44 [+0200]:
> 
> >As pointed out by Fernando Gont there is no need to encode rcv_wscale
> >into the cookie.
> >
> >We did not use the restored rcv_wscale anyway; it is recomputed
> >via tcp_select_initial_window().
> 
> I speculate that this behavior was and is not correct. I suppose that there is
> a race between the SYN/ACK where we initial force a particular window scale
> and the next time where we recalculate the window via tcp_select_initial_window().

Yes, but it is not the only one. We also do a route lookup to get the
current window size.

> If the user change net.core.rmem_max or net.ipv4.tcp_rmem in between this
> time, the recalculated window scale (rcv_wscale) can be smaller. But the
> receiver still operates with the initial window scale and can overshoot the
> granted window - and bang.

Why "bang"?
Sure, it's not nice, but is it such a severe problem that we have to keep
rcv_wscale around in the timestamp?

> There are several solutions: encode rcv_wscale into the syn cookie and don't
> recalculate

I would prefer to avoid having to keep rcv_wscale in the timestamp for the sake of
a sysctl change. At the moment the state eats up 9 bits in the timestamp and I don't
like to grow it (which has to be done iff we want to support more options in the future).

>  or disable window scaling and don't transmit any scaling option
> when SYN cookies are active.

No, I would rather see this patch rejected than that.

^ permalink raw reply

* [RFC PATCH v2 0/5] netdev: show a process of packets
From: Koki Sanagi @ 2010-06-24  8:09 UTC (permalink / raw)
  To: netdev; +Cc: davem, scott.a.mcmillan, kaneshige.kenji, izumi.taku

CHANGE-LOG since v1:
    1) fix to enable "perf trace record netdev-times" run with a script.
    2) add some options to "perf trace report netdev-times". A detail is in
       PATCH 5/5.

This patch set adds tracepoints to show us how packets are processed.
Using these tracepoints and existing points, we can get the time when
packet passes through some points in transmit or receive sequence.
For example, this is an output of perf script which is attached by patch 5/5.

79074.756672832sec cpu=1
irq_entry(+0.000000msec,irq=77:eth3)
         |------------softirq_raise(+0.001277msec)
irq_exit (+0.002278msec)     |
                             |
                      softirq_entry(+0.003562msec)
                             |
                             |---netif_receive_skb(+0.006279msec,len=100)
                             |            |
                             |   skb_copy_datagram_iovec(+0.038778msec, 2285:sshd)
                             |
                      napi_poll_exit(+0.017160msec, eth3)
                             |
                      softirq_exit(+0.018248msec)

The above is a receive side. Like this, it can show receive sequence from
interrupt(irq_entry) to application(skb_copy_datagram_iovec). There are 8
tracepoints in this side. All events except for skb_copy_datagram_iovec can be
associated with each other by CPU number. skb_copy_datagram_iovec can be
associated with netif_receive_skb by skbaddr.
This script shows one NET_RX softirq and the events related to it. All relative
times are based on the first irq_entry that raises the NET_RX softirq.

   dev    len      dev_queue_xmit|----------|dev_hard_start_xmit|-----|free_skb
                         |             |                           |
   eth3   114  79044.417123332sec     0.005242msec          0.103843msec
   eth3   114  79044.580090422sec     0.002306msec          0.103632msec
   eth3   114  79044.719078251sec     0.002288msec          0.104093msec

The above is a transmit side. There are three tracepoints in this side.
Point1 is before putting a packet to Qdisc. point2 is after ndo_start_xmit in
dev_hard_start_xmit. It indicates finishing putting a packet to driver.
point3 is in consume_skb and dev_kfree_skb_irq. It indicates freeing a
transmitted packet.
Values of this script are, from left, device name, length of a packet, a time of
point1, an interval time between point1 and point2 and an interval time between
point2 and point3.

These times are useful for analyzing performance or for detecting the point
where a packet is delayed. For example,
- NET_RX softirq calling is late.
- Application is late to take a packet.
- It takes much time to put a transmitting packet to driver
  (It may be caused by packed queue)

These tracepoints also help us investigate a network driver's problems from
a memory dump, because ftrace records events to memory. ftrace is also light
enough to leave tracing always on, so it is useful when investigating a
problem that cannot be reproduced.

Thanks,
Koki Sanagi.


^ permalink raw reply

* Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers
From: Michael S. Tsirkin @ 2010-06-24  8:11 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <4C02C99D.9070204@kernel.org>

On Sun, May 30, 2010 at 10:25:01PM +0200, Tejun Heo wrote:
> Apply the cpumask and cgroup of the initializing task to the created
> vhost poller.
> 
> Based on Sridhar Samudrala's patch.
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>


I wanted to apply this, but modpost fails:
ERROR: "sched_setaffinity" [drivers/vhost/vhost_net.ko] undefined!
ERROR: "sched_getaffinity" [drivers/vhost/vhost_net.ko] undefined!

Did you try building as a module?

> ---
>  drivers/vhost/vhost.c |   36 +++++++++++++++++++++++++++++++-----
>  1 file changed, 31 insertions(+), 5 deletions(-)
> 
> Index: work/drivers/vhost/vhost.c
> ===================================================================
> --- work.orig/drivers/vhost/vhost.c
> +++ work/drivers/vhost/vhost.c
> @@ -23,6 +23,7 @@
>  #include <linux/highmem.h>
>  #include <linux/slab.h>
>  #include <linux/kthread.h>
> +#include <linux/cgroup.h>
> 
>  #include <linux/net.h>
>  #include <linux/if_packet.h>
> @@ -176,12 +177,30 @@ repeat:
>  long vhost_dev_init(struct vhost_dev *dev,
>  		    struct vhost_virtqueue *vqs, int nvqs)
>  {
> -	struct task_struct *poller;
> -	int i;
> +	struct task_struct *poller = NULL;
> +	cpumask_var_t mask;
> +	int i, ret = -ENOMEM;
> +
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		goto out;
> 
>  	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> -	if (IS_ERR(poller))
> -		return PTR_ERR(poller);
> +	if (IS_ERR(poller)) {
> +		ret = PTR_ERR(poller);
> +		goto out;
> +	}
> +
> +	ret = sched_getaffinity(current->pid, mask);
> +	if (ret)
> +		goto out;
> +
> +	ret = sched_setaffinity(poller->pid, mask);
> +	if (ret)
> +		goto out;
> +
> +	ret = cgroup_attach_task_current_cg(poller);
> +	if (ret)
> +		goto out;
> 
>  	dev->vqs = vqs;
>  	dev->nvqs = nvqs;
> @@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
>  			vhost_poll_init(&dev->vqs[i].poll,
>  					dev->vqs[i].handle_kick, POLLIN, dev);
>  	}
> -	return 0;
> +
> +	wake_up_process(poller);	/* avoid contributing to loadavg */
> +	ret = 0;
> +out:
> +	if (ret)
> +		kthread_stop(poller);
> +	free_cpumask_var(mask);
> +	return ret;
>  }
> 
>  /* Caller should have device mutex */

^ permalink raw reply

* [PATCH] smsc95xx: Add module parameter to override MAC address
From: Sebastien Jan @ 2010-06-24  8:14 UTC (permalink / raw)
  To: Steve Glendinning, netdev; +Cc: linux-omap, Sebastien Jan

Define a new module parameter 'macaddr' to override the MAC address
fetched either from eeprom, or randomly generated.

The expected MAC address shall be in the 01:23:45:67:89:AB format.

Signed-off-by: Sebastien Jan <s-jan@ti.com>
---
 drivers/net/usb/smsc95xx.c |   56 ++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 56 insertions(+), 0 deletions(-)

diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index 3135af6..0ba06d9 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -46,6 +46,7 @@
 #define SMSC95XX_INTERNAL_PHY_ID	(1)
 #define SMSC95XX_TX_OVERHEAD		(8)
 #define SMSC95XX_TX_OVERHEAD_CSUM	(12)
+#define MAC_ADDR_LEN			(6)
 
 struct smsc95xx_priv {
 	u32 mac_cr;
@@ -63,6 +64,10 @@ static int turbo_mode = true;
 module_param(turbo_mode, bool, 0644);
 MODULE_PARM_DESC(turbo_mode, "Enable multiple frames per Rx transaction");
 
+static char *macaddr = ":";
+module_param(macaddr, charp, 0);
+MODULE_PARM_DESC(macaddr, "MAC address");
+
 static int smsc95xx_read_reg(struct usbnet *dev, u32 index, u32 *data)
 {
 	u32 *buf = kmalloc(4, GFP_KERNEL);
@@ -637,8 +642,59 @@ static int smsc95xx_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd)
 	return generic_mii_ioctl(&dev->mii, if_mii(rq), cmd, NULL);
 }
 
+/* Check the macaddr module parameter for a MAC address */
+static int smsc95xx_is_macaddr_param(struct usbnet *dev, u8 *dev_mac)
+{
+	int i, j, got_num, num;
+	u8 mtbl[MAC_ADDR_LEN];
+
+	if (macaddr[0] == ':')
+		return 0;
+
+	i = 0;
+	j = 0;
+	num = 0;
+	got_num = 0;
+	while (j < MAC_ADDR_LEN) {
+		if (macaddr[i] && macaddr[i] != ':') {
+			got_num++;
+			if ('0' <= macaddr[i] && macaddr[i] <= '9')
+				num = num * 16 + macaddr[i] - '0';
+			else if ('A' <= macaddr[i] && macaddr[i] <= 'F')
+				num = num * 16 + 10 + macaddr[i] - 'A';
+			else if ('a' <= macaddr[i] && macaddr[i] <= 'f')
+				num = num * 16 + 10 + macaddr[i] - 'a';
+			else
+				break;
+			i++;
+		} else if (got_num == 2) {
+			mtbl[j++] = (u8) num;
+			num = 0;
+			got_num = 0;
+			i++;
+		} else {
+			break;
+		}
+	}
+
+	if (j == MAC_ADDR_LEN && !macaddr[i]) {
+		netif_dbg(dev, ifup, dev->net, "Overriding MAC address with: "
+		"%02x:%02x:%02x:%02x:%02x:%02x\n", mtbl[0], mtbl[1], mtbl[2],
+						mtbl[3], mtbl[4], mtbl[5]);
+		for (i = 0; i < MAC_ADDR_LEN; i++)
+			dev_mac[i] = mtbl[i];
+		return 1;
+	} else {
+		return 0;
+	}
+}
+
 static void smsc95xx_init_mac_address(struct usbnet *dev)
 {
+	/* Check module parameters */
+	if (smsc95xx_is_macaddr_param(dev, dev->net->dev_addr))
+		return;
+
 	/* try reading mac address from EEPROM */
 	if (smsc95xx_read_eeprom(dev, EEPROM_MAC_OFFSET, ETH_ALEN,
 			dev->net->dev_addr) == 0) {
-- 
1.6.3.3


^ permalink raw reply related

* [RFC PATCH v2 1/5] irq: add tracepoint to softirq_raise
From: Koki Sanagi @ 2010-06-24  8:16 UTC (permalink / raw)
  To: netdev; +Cc: davem, scott.a.mcmillan, kaneshige.kenji, izumi.taku,
	linux-kernel
In-Reply-To: <4C2312A8.9060903@jp.fujitsu.com>

This patch adds a tracepoint to the raising of a softirq.
This is useful if you want to detect which hard interrupt raises a softirq,
and it lets you know the time between raising a softirq and performing it.
Combined with other tracepoints, it lets us see how packets are processed
(see patch 0/5).

          <idle>-0     [001] 241229.957184: softirq_raise: vec=3 [action=NET_RX]
          <idle>-0     [000] 241229.993399: softirq_raise: vec=1 [action=TIMER]
          <idle>-0     [000] 241229.993400: softirq_raise: vec=9 [action=RCU]

This is the same patch that Lai Jiangshan submitted.
http://marc.info/?l=linux-kernel&m=126026122728732&w=2

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
 include/linux/interrupt.h  |    8 +++++++-
 include/trace/events/irq.h |   34 +++++++++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c233113..1cb5726 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -18,6 +18,7 @@
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/system.h>
+#include <trace/events/irq.h>
 
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
@@ -402,7 +403,12 @@ asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
-#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
+static inline void __raise_softirq_irqoff(unsigned int nr)
+{
+	trace_softirq_raise(nr);
+	or_softirq_pending(1UL << nr);
+}
+
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 extern void wakeup_softirqd(void);
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 0e4cfb6..7cb7435 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -5,7 +5,9 @@
 #define _TRACE_IRQ_H
 
 #include <linux/tracepoint.h>
-#include <linux/interrupt.h>
+
+struct irqaction;
+struct softirq_action;
 
 #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
 #define show_softirq_name(val)				\
@@ -82,6 +84,32 @@ TRACE_EVENT(irq_handler_exit,
 		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
+/**
+ * softirq_raise - called immediately when a softirq is raised
+ * @nr: softirq vector number
+ *
+ * Tracepoint for tracing when softirq action is raised.
+ * Also, when used in combination with the softirq_entry tracepoint
+ * we can determine the softirq raise latency.
+ */
+TRACE_EVENT(softirq_raise,
+
+	TP_PROTO(unsigned int nr),
+
+	TP_ARGS(nr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	vec	)
+	),
+
+	TP_fast_assign(
+		__entry->vec	= nr;
+	),
+
+	TP_printk("vec=%d [action=%s]", __entry->vec,
+		show_softirq_name(__entry->vec))
+);
+
 DECLARE_EVENT_CLASS(softirq,
 
 	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
@@ -89,11 +117,11 @@ DECLARE_EVENT_CLASS(softirq,
 	TP_ARGS(h, vec),
 
 	TP_STRUCT__entry(
-		__field(	int,	vec			)
+		__field(	unsigned int,	vec	)
 	),
 
 	TP_fast_assign(
-		__entry->vec = (int)(h - vec);
+		__entry->vec = (unsigned int)(h - vec);
 	),
 
 	TP_printk("vec=%d [action=%s]", __entry->vec,



^ permalink raw reply related

* [RFC PATCH v2 2/5] napi: convert trace_napi_poll to TRACE_EVENT
From: Koki Sanagi @ 2010-06-24  8:17 UTC (permalink / raw)
  To: netdev; +Cc: davem, scott.a.mcmillan, kaneshige.kenji, izumi.taku
In-Reply-To: <4C2312A8.9060903@jp.fujitsu.com>

This patch converts trace_napi_poll from DECLARE_TRACE to TRACE_EVENT.
This lets you know how long napi_poll takes.

          <idle>-0     [001] 241302.750777: napi_poll: napi poll on napi struct f6acc480 for device eth3
          <idle>-0     [000] 241302.852389: napi_poll: napi poll on napi struct f5d0d70c for device eth1
          <idle>-0     [000] 241302.852389: napi_poll: napi poll on napi struct f5d0d20c for device eth1

This is the same patch that Neil Horman submitted.
http://marc.info/?l=linux-kernel&m=125978157926853&w=2

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
 include/trace/events/napi.h |   23 +++++++++++++++++++++--
 1 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h
index 188deca..512a057 100644
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -6,10 +6,29 @@
 
 #include <linux/netdevice.h>
 #include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+
+#define NO_DEV "(no_device)"
+
+TRACE_EVENT(napi_poll,
 
-DECLARE_TRACE(napi_poll,
 	TP_PROTO(struct napi_struct *napi),
-	TP_ARGS(napi));
+
+	TP_ARGS(napi),
+
+	TP_STRUCT__entry(
+		__field(	struct napi_struct *,	napi)
+		__string(	dev_name, napi->dev ? napi->dev->name : NO_DEV)
+	),
+
+	TP_fast_assign(
+		__entry->napi = napi;
+		__assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
+	),
+
+	TP_printk("napi poll on napi struct %p for device %s",
+		__entry->napi, __get_str(dev_name))
+);
 
 #endif /* _TRACE_NAPI_H_ */
 


^ permalink raw reply related

* [RFC PATCH v2 3/5] netdev: add tracepoints to netdev layer
From: Koki Sanagi @ 2010-06-24  8:18 UTC (permalink / raw)
  To: netdev; +Cc: davem, scott.a.mcmillan, kaneshige.kenji, izumi.taku
In-Reply-To: <4C2312A8.9060903@jp.fujitsu.com>

This patch adds tracepoints to dev_queue_xmit, dev_hard_start_xmit and
netif_receive_skb. These tracepoints help you to monitor a network driver's
input/output.

            sshd-4445  [001] 241367.066046: net_dev_queue: dev=eth3 skbaddr=dd6b2538 len=114
            sshd-4445  [001] 241367.066047: net_dev_xmit: dev=eth3 skbaddr=dd6b2538 len=114 rc=0
          <idle>-0     [001] 241367.067472: net_dev_receive: dev=eth3 skbaddr=f5e59000 len=52

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
 include/trace/events/net.h |   83 ++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c             |    5 +++
 net/core/net-traces.c      |    1 +
 3 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
new file mode 100644
index 0000000..ee10970
--- /dev/null
+++ b/include/trace/events/net.h
@@ -0,0 +1,83 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM net
+
+#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NET_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(net_dev_xmit,
+
+	TP_PROTO(struct sk_buff *skb,
+		 int rc),
+
+	TP_ARGS(skb, rc),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned int,	len		)
+		__field(	int,		rc		)
+		__string(	name,		skb->dev->name	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__entry->rc = rc;
+		__assign_str(name, skb->dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
+		__get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
+);
+
+TRACE_EVENT(net_dev_queue,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned int,	len		)
+		__string(	name,		skb->dev->name	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__assign_str(name, skb->dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u",
+		__get_str(name), __entry->skbaddr, __entry->len)
+);
+
+TRACE_EVENT(net_dev_receive,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned int,	len		)
+		__string(	name,		skb->dev->name	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__assign_str(name, skb->dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u",
+		__get_str(name), __entry->skbaddr, __entry->len)
+);
+#endif /* _TRACE_NET_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/core/dev.c b/net/core/dev.c
index 5902426..4b64b21 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/events/napi.h>
+#include <trace/events/net.h>
 #include <linux/pci.h>
 
 #include "net-sysfs.h"
@@ -1922,6 +1923,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		rc = ops->ndo_start_xmit(skb, dev);
+		trace_net_dev_xmit(skb, rc);
 		if (rc == NETDEV_TX_OK)
 			txq_trans_update(txq);
 		return rc;
@@ -1942,6 +1944,7 @@ gso:
 			skb_dst_drop(nskb);
 
 		rc = ops->ndo_start_xmit(nskb, dev);
+		trace_net_dev_xmit(nskb, rc);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
 				goto out_kfree_gso_skb;
@@ -2156,6 +2159,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	}
 
 gso:
+	trace_net_dev_queue(skb);
 	/* Disable soft irqs for various locks below. Also
 	 * stops preemption for RCU.
 	 */
@@ -2942,6 +2946,7 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
+	trace_net_dev_receive(skb);
 #ifdef CONFIG_RPS
 	{
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index afa6380..7f1bb2a 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/skb.h>
+#include <trace/events/net.h>
 #include <trace/events/napi.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);


^ permalink raw reply related

* [RFC PATCH v2 4/5] skb: add tracepoints to freeing skb
From: Koki Sanagi @ 2010-06-24  8:19 UTC (permalink / raw)
  To: netdev; +Cc: davem, scott.a.mcmillan, kaneshige.kenji, izumi.taku
In-Reply-To: <4C2312A8.9060903@jp.fujitsu.com>

This patch adds tracepoints to consume_skb and dev_kfree_skb_irq.
Combined with the tracepoint on dev_hard_start_xmit, we can check how long it
takes to free transmitted packets. Using that, we can calculate how many
packets the driver held at that time. It is useful when dropped transmitted
packets are a problem.

          <idle>-0     [001] 241409.218333: consume_skb: skbaddr=dd6b2fb8
          <idle>-0     [001] 241409.490555: dev_kfree_skb_irq: skbaddr=f5e29840

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
include/trace/events/skb.h |   36 ++++++++++++++++++++++++++++++++++++
 net/core/dev.c             |    2 ++
 net/core/skbuff.c          |    1 +
 3 files changed, 39 insertions(+), 0 deletions(-)

diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 4b2be6d..6ab5b34 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -35,6 +35,42 @@ TRACE_EVENT(kfree_skb,
 		__entry->skbaddr, __entry->protocol, __entry->location)
 );
 
+TRACE_EVENT(consume_skb,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,	skbaddr	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+	),
+
+	TP_printk("skbaddr=%p",
+		__entry->skbaddr)
+);
+
+TRACE_EVENT(dev_kfree_skb_irq,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,	skbaddr	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+	),
+
+	TP_printk("skbaddr=%p",
+		__entry->skbaddr)
+);
+
 TRACE_EVENT(skb_copy_datagram_iovec,
 
 	TP_PROTO(const struct sk_buff *skb, int len),
diff --git a/net/core/dev.c b/net/core/dev.c
index 4b64b21..807b1ca 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -131,6 +131,7 @@
 #include <linux/random.h>
 #include <trace/events/napi.h>
 #include <trace/events/net.h>
+#include <trace/events/skb.h>
 #include <linux/pci.h>
 
 #include "net-sysfs.h"
@@ -1580,6 +1581,7 @@ void dev_kfree_skb_irq(struct sk_buff *skb)
 		struct softnet_data *sd;
 		unsigned long flags;
 
+		trace_dev_kfree_skb_irq(skb);
 		local_irq_save(flags);
 		sd = &__get_cpu_var(softnet_data);
 		skb->next = sd->completion_queue;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 34432b4..a7b4036 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -466,6 +466,7 @@ void consume_skb(struct sk_buff *skb)
 		smp_rmb();
 	else if (likely(!atomic_dec_and_test(&skb->users)))
 		return;
+	trace_consume_skb(skb);
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(consume_skb);


^ permalink raw reply related

* [RFC PATCH v2 5/5] perf:add a script shows a process of packet
From: Koki Sanagi @ 2010-06-24  8:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, scott.a.mcmillan, kaneshige.kenji, izumi.taku
In-Reply-To: <4C2312A8.9060903@jp.fujitsu.com>

This perf script shows a time chart of packet processing.
Patch 0/5 shows an example of its output.
If you want to use it, install perf and record perf.data like following.

#perf trace record netdev-times [script]

If you specify a script, perf gathers records until it ends.
If not, you must press Ctrl-C to stop recording.

And if you want a report from record,

#perf trace report netdev-times [options]

You can limit the output by passing options.
The available options are listed below.

tx: show only the processing of tx packets
rx: show only the processing of rx packets
dev=: show only the processing associated with the specified device

In the future, I would like to add a src/dst IP(v6) address filter option.
It is currently under consideration/construction.

For example, to show the processing of received packets associated
with eth3,

#perf trace report netdev-times rx dev=eth3
79074.756672832sec cpu=1
irq_entry(+0.000000msec,irq=77:eth3)
         |------------softirq_raise(+0.001277msec)
irq_exit (+0.002278msec)     |
                             |
                      softirq_entry(+0.003562msec)
                             |
                             |---netif_receive_skb(+0.006279msec,len=100)
                             |            |
                             |   skb_copy_datagram_iovec(+0.038778msec, 2285:sshd)
                             |
                      napi_poll_exit(+0.017160msec, eth3)
                             |
                      softirq_exit(+0.018248msec)


This perf script helps us analyze the processing time of the
transmit/receive sequence.

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
 tools/perf/scripts/python/bin/netdev-times-record |    7 +
 tools/perf/scripts/python/bin/netdev-times-report |    5 +
 tools/perf/scripts/python/netdev-times.py         |  495 +++++++++++++++++++++
 3 files changed, 507 insertions(+), 0 deletions(-)

diff --git a/tools/perf/scripts/python/bin/netdev-times-record b/tools/perf/scripts/python/bin/netdev-times-record
new file mode 100644
index 0000000..1dfa8d5
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-record
@@ -0,0 +1,7 @@
+#!/bin/bash
+perf record -c 1 -f -R -a -e net:net_dev_xmit -e net:net_dev_queue	\
+		-e net:net_dev_receive -e skb:consume_skb		\
+		-e skb:dev_kfree_skb_irq -e napi:napi_poll		\
+		-e irq:irq_handler_entry -e irq:irq_handler_exit	\
+		-e irq:softirq_entry -e irq:softirq_exit		\
+		-e irq:softirq_raise -e skb:skb_copy_datagram_iovec $@
diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report
new file mode 100644
index 0000000..ecc8122
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-report
@@ -0,0 +1,5 @@
+#!/bin/bash
+# description: display a process of packet and processing time
+# args: tx rx dev src dst
+
+perf trace -s ~/libexec/perf-core/scripts/python/netdev-times.py $@
diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py
new file mode 100644
index 0000000..5e68be4
--- /dev/null
+++ b/tools/perf/scripts/python/netdev-times.py
@@ -0,0 +1,495 @@
+# Display process of packets and processed time.
+# It helps you to investigate networking.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+all_event_list = []; # insert all tracepoint event related with this script
+irq_dic = {}; # key is cpu and value is a list which stacks irqs
+              # which raise NET_RX softirq
+net_rx_dic = {}; # key is cpu and value include time of NET_RX softirq-entry
+		 # and a list which stacks receive
+receive_hunk_list = []; # a list which include a sequence of receive events
+receive_skb_list = []; # received packet list for matching
+		       # skb_copy_datagram_iovec
+
+queue_list = []; # list of packets which pass through dev_queue_xmit
+xmit_list = [];  # list of packets which pass through dev_hard_start_xmit
+free_list = [];  # list of packets which is freed
+
+show_tx = 0;
+show_rx = 0;
+dev = 0; # store a name of device specified by option "dev="
+
+# Calculate a time interval(msec) from src(nsec) to dst(nsec)
+def diff_msec(src, dst):
+	return (dst - src) / 1000000.0
+
+# Display a process of transmitting a packet
+def print_transmit(hunk):
+	if dev != 0 and hunk['dev'].find(dev) < 0:
+		return
+	print "%7s %5d %6d.%09dsec %12.6fmsec      %12.6fmsec" % \
+		(hunk['dev'], hunk['len'],
+		nsecs_secs(hunk['queue_t']),
+		nsecs_nsecs(hunk['queue_t']),
+		diff_msec(hunk['queue_t'], hunk['xmit_t']),
+		diff_msec(hunk['xmit_t'], hunk['free_t']))
+
+# Display a process of received packets and interrupts associated with
+# a NET_RX softirq
+def print_receive(hunk):
+	show_hunk = 0
+	if 'irq_list' not in hunk.keys() \
+	or len(hunk['irq_list']) == 0:
+		return
+	irq_list = hunk['irq_list']
+	cpu = irq_list[0]['cpu']
+	base_t = irq_list[0]['irq_ent_t']
+	# check if this hunk should be showed
+	if dev != 0:
+		for i in range(len(irq_list)):
+			if irq_list[i]['name'].find(dev) >= 0:
+				show_hunk = 1
+				break
+	else:
+		show_hunk = 1
+	if show_hunk == 0:
+		return
+
+	print "%d.%09dsec cpu=%d" % \
+		(nsecs_secs(base_t), nsecs_nsecs(base_t), cpu)
+	for i in range(len(irq_list)):
+		print "irq_entry(+%fmsec,irq=%d:%s)" % \
+			(diff_msec(base_t, irq_list[i]['irq_ent_t']),
+			irq_list[i]['irq'], irq_list[i]['name'])
+
+		if 'sirq_raise_t' in irq_list[i].keys():
+			print "         |------------" \
+			      "softirq_raise(+%fmsec)" % \
+				diff_msec(base_t, irq_list[i]['sirq_raise_t'])
+
+		if 'irq_ext_t' in irq_list[i].keys():
+			print "irq_exit (+%fmsec)     |" % \
+				diff_msec(base_t, irq_list[i]['irq_ext_t'])
+
+		print "                             |"
+
+	if 'sirq_ent_t' not in hunk.keys():
+		print 'maybe softirq_entry is dropped'
+		return
+	print "                      " \
+		"softirq_entry(+%fmsec)\n" \
+		"                      " \
+		"       |" % \
+		diff_msec(base_t, hunk['sirq_ent_t'])
+	event_list = hunk['event_list']
+	for i in range(len(event_list)):
+		event = event_list[i]
+		if event['event_name'] == 'napi_poll':
+			print "                      " \
+			      "napi_poll_exit(+%fmsec, %s)" % \
+			(diff_msec(base_t, event['event_t']), event['dev'])
+			print "                      " \
+			      "       |"
+		elif 'comm' in event.keys():
+			print "                      " \
+				"       |---netif_receive_skb" \
+				"(+%fmsec,len=%d)\n" \
+				"                      " \
+				"       |            |\n" \
+				"                      " \
+				"       |   skb_copy_datagram_iovec" \
+				"(+%fmsec, %d:%s)\n" \
+				"                      " \
+				"       |" % \
+			(diff_msec(base_t, event['event_t']),
+			event['len'],
+			diff_msec(base_t, event['comm_t']),
+			event['pid'], event['comm'])
+		else:
+			print "                      " \
+				"       |---netif_receive_skb" \
+				"(+%fmsec,len=%d)\n" \
+				"                      " \
+				"       |" % \
+				(diff_msec(base_t, event['event_t']),
+					event['len'])
+
+	print "                      " \
+	      "softirq_exit(+%fmsec)\n" % \
+		 diff_msec(base_t, hunk['sirq_ext_t'])
+
+def trace_begin():
+	global show_tx
+	global show_rx
+	global dev
+
+	for i in range(len(sys.argv)):
+		if i == 0:
+			continue
+		arg = sys.argv[i]
+		if arg == 'tx':
+			show_tx = 1
+		elif arg =='rx':
+			show_rx = 1
+		elif arg.find('dev=',0, 4) >= 0:
+			dev = arg[4:]
+	if show_tx == 0  and show_rx == 0:
+		show_tx = 1
+		show_rx = 1
+
+def trace_end():
+	global show_tx
+	global show_rx
+	# order all events in time
+	all_event_list.sort(lambda a,b :cmp(a['time'], b['time']))
+	# process all events
+	for i in range(len(all_event_list)):
+		event = all_event_list[i]
+		event_name = event['event_name']
+		if event_name == 'irq__softirq_exit':
+			handle_irq_softirq_exit(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['vec'])
+		elif event_name == 'irq__softirq_entry':
+			handle_irq_softirq_entry(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'],event['vec'])
+		elif event_name == 'irq__softirq_raise':
+			handle_irq_softirq_raise(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['vec'])
+		elif event_name == 'irq__irq_handler_entry':
+			handle_irq_handler_entry(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['irq'], event['name'])
+		elif event_name == 'irq__irq_handler_exit':
+			handle_irq_handler_exit(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['irq'], event['ret'])
+		elif event_name == 'napi__napi_poll':
+			handle_napi_poll(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['napi'],
+				event['dev_name'])
+		elif event_name == 'net__net_dev_receive':
+			handle_net_dev_receive(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'],
+				event['skblen'], event['name'])
+		elif event_name == 'skb__skb_copy_datagram_iovec':
+			handle_skb_copy_datagram_iovec(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'],
+				event['skblen'])
+		elif event_name == 'net__net_dev_queue':
+			handle_net_dev_queue(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'],
+				event['skblen'], event['name'])
+		elif event_name == 'net__net_dev_xmit':
+			handle_net_dev_xmit(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'],
+				event['skblen'], event['rc'], event['name'])
+		elif event_name == 'skb__dev_kfree_skb_irq':
+			handle_dev_kfree_skb_irq(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'])
+		elif event_name == 'skb__consume_skb':
+			handle_consume_skb(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'])
+	# display receive hunks
+	if show_rx == 1:
+		for i in range(len(receive_hunk_list)):
+			print_receive(receive_hunk_list[i])
+	# display transmit hunks
+	if show_tx == 1:
+		print "   dev    len      dev_queue_xmit|----------|" \
+			"dev_hard_start_xmit|-----|free_skb"
+		print "                         |             |" \
+			"                           |"
+		for i in range(len(free_list)):
+			print_transmit(free_list[i])
+
+def irq__softirq_exit(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'vec':vec}
+	all_event_list.append(event_data)
+
+def handle_irq_softirq_exit(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	vec):
+	rec_data = {'sirq_ext_t':time}
+	if common_cpu in irq_dic.keys():
+		rec_data.update({'irq_list':irq_dic[common_cpu]})
+		del irq_dic[common_cpu]
+	if common_cpu in net_rx_dic.keys():
+		rec_data.update({
+		    'event_list':net_rx_dic[common_cpu]['event_list'],
+		    'sirq_ent_t':net_rx_dic[common_cpu]['sirq_ent_t']})
+		del net_rx_dic[common_cpu]
+	# merge information related to a NET_RX softirq
+	receive_hunk_list.append(rec_data)
+
+def irq__softirq_entry(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'vec':vec}
+	all_event_list.append(event_data)
+
+def handle_irq_softirq_entry(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	vec):
+		net_rx_dic[common_cpu] = {'event_list':[],
+					  'sirq_ent_t':time}
+
+def irq__softirq_raise(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'vec':vec}
+	all_event_list.append(event_data)
+
+def handle_irq_softirq_raise(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	vec):
+	if common_cpu not in irq_dic.keys() \
+	or len(irq_dic[common_cpu]) == 0:
+		return
+	irq = irq_dic[common_cpu].pop()
+	# put a time to prev irq on the same cpu
+	irq.update({'sirq_raise_t':time})
+	irq_dic[common_cpu].append(irq)
+
+def irq__irq_handler_entry(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	irq, name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'irq':irq, 'name':name}
+	all_event_list.append(event_data)
+
+def handle_irq_handler_entry(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	irq, name):
+	if common_cpu not in irq_dic.keys():
+		irq_dic[common_cpu] = []
+	irq_record = {'irq':irq,
+		      'name':name,
+		      'cpu':common_cpu,
+		      'irq_ent_t':time}
+	irq_dic[common_cpu].append(irq_record)
+
+def irq__irq_handler_exit(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	irq, ret):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'irq':irq, 'ret':ret}
+	all_event_list.append(event_data)
+
+def handle_irq_handler_exit(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	irq, ret):
+	if common_cpu not in irq_dic.keys():
+		return
+	irq_record = irq_dic[common_cpu].pop()
+	irq_record.update({'irq_ext_t':time})
+	# if an irq doesn't include NET_RX softirq, drop.
+	if 'sirq_raise_t' in irq_record.keys():
+		irq_dic[common_cpu].append(irq_record)
+
+def napi__napi_poll(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	napi, dev_name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'napi':napi, 'dev_name':dev_name}
+	all_event_list.append(event_data)
+
+def handle_napi_poll(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	napi, dev_name):
+	if common_cpu in net_rx_dic.keys():
+		event_list = net_rx_dic[common_cpu]['event_list']
+		rec_data = {'event_name':'napi_poll',
+			    'dev':dev_name,
+			    'event_t':time}
+		event_list.append(rec_data)
+
+def net__net_dev_receive(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr,skblen, name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr, 'skblen':skblen, 'name':name}
+	all_event_list.append(event_data)
+
+def handle_net_dev_receive(event_name, context, common_cpu,
+	ccommon_pid, common_comm, time,
+	skbaddr, skblen, name):
+	if common_cpu in net_rx_dic.keys():
+		rec_data = {'event_name':'netif_receive_skb',
+			    'event_t':time,
+			    'skbaddr':skbaddr,
+			    'len':skblen}
+		event_list = net_rx_dic[common_cpu]['event_list']
+		event_list.append(rec_data)
+		receive_skb_list.insert(0, rec_data)
+
+def skb__skb_copy_datagram_iovec(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr, skblen):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr, 'skblen':skblen}
+	all_event_list.append(event_data)
+
+def handle_skb_copy_datagram_iovec(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr, skblen):
+	for i in range(len(receive_skb_list)):
+		rec_data = receive_skb_list[i]
+		if skbaddr == rec_data['skbaddr'] and \
+			'comm' not in rec_data.keys():
+			rec_data.update({'comm':common_comm,
+					 'pid':common_pid,
+					 'comm_t':time})
+			del receive_skb_list[i]
+			break
+
+def net__net_dev_queue(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr, skblen, name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr, 'skblen':skblen, 'name':name}
+	all_event_list.append(event_data)
+
+def handle_net_dev_queue(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr, skblen, name):
+	skb = {'dev':name,
+	       'skbaddr':skbaddr,
+	       'len':skblen,
+	       'queue_t':time}
+	xmit_list.insert(0, skb)
+
+def net__net_dev_xmit(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr, skblen, rc, name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr, 'skblen':skblen, 'rc':rc, 'name':name}
+	all_event_list.append(event_data)
+
+def handle_net_dev_xmit(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr, skblen, rc, name):
+	if rc == 0: # NETDEV_TX_OK
+		for i in range(len(xmit_list)):
+			skb = xmit_list[i]
+			if skb['skbaddr'] == skbaddr:
+				skb['xmit_t'] = time
+				queue_list.insert(0, skb)
+				del xmit_list[i]
+				break
+
+def free_skb(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr):
+	for i in range(len(queue_list)):
+		skb = queue_list[i]
+		if skb['skbaddr'] ==skbaddr:
+			skb['free_t'] = time
+			free_list.append(skb)
+			del queue_list[i]
+			break
+
+def skb__dev_kfree_skb_irq(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr}
+	all_event_list.append(event_data)
+
+def handle_dev_kfree_skb_irq(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr):
+	free_skb(event_name, context, common_cpu,
+		common_pid, common_comm, time,
+		skbaddr)
+
+def skb__consume_skb(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr}
+	all_event_list.append(event_data)
+
+def handle_consume_skb(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr):
+	free_skb(event_name, context, common_cpu,
+		common_pid, common_comm, time,
+		skbaddr)


^ permalink raw reply related

* Re: [RFC PATCH v7 01/19] Add a new structure for skb buffer from external.
From: Herbert Xu @ 2010-06-24 10:08 UTC (permalink / raw)
  To: Dong, Eddie
  Cc: Xin, Xiaohui, Stephen Hemminger, netdev@vger.kernel.org,
	kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mst@redhat.com,
	mingo@elte.hu, davem@davemloft.net, jdike@linux.intel.com
In-Reply-To: <1A42CE6F5F474C41B63392A5F80372B21F58CE7F@shsmsx501.ccr.corp.intel.com>

On Wed, Jun 23, 2010 at 06:05:41PM +0800, Dong, Eddie wrote:
> 
> I mean once the frontend side driver post the buffers to the backend driver, the backend driver will "immediately" use that buffers to compose skb or gro_frags and post them to the assigned host NIC driver as receive buffers. In that case, if the backend driver recieves a packet from the NIC that requires to do copy, it may be unable to find additional free guest buffer because all of them are already used by the NIC driver. We have to reserve some guest buffers for the possible copy even if the buffer address is not identified by original skb :(

OK I see what you mean.  Can you tell me how does Xiaohui's
previous patch-set deal with this problem?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH net-next-2.6] net: u64_stats_sync improvements
From: Eric Dumazet @ 2010-06-24 10:04 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

- Add a comment about interrupts:

6) If counter might be written by an interrupt, readers should block
interrupts.

- Fix a typo in sample of use.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/u64_stats_sync.h |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index d050515..201d319 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -23,6 +23,10 @@
  *    pure reads. But if they have to fetch many values, it's better to not allow
  *    preemptions/interruptions to avoid many retries.
  *
+ * 6) If counter might be written by an interrupt, readers should block interrupts.
+ *    (On UP, there is no seqcount_t protection, a reader allowing interrupts could
+ *     read partial values)
+ *
  * Usage :
  *
  * Stats producer (writer) should use following template granted it already got
@@ -46,7 +50,7 @@
  *         start = u64_stats_fetch_begin(&stats->syncp);
  *         tbytes = stats->bytes64; // non atomic operation
  *         tpackets = stats->packets64; // non atomic operation
- * } while (u64_stats_fetch_retry(&stats->lock, syncp));
+ * } while (u64_stats_fetch_retry(&stats->syncp, start));
  *
  *
  * Example of use in drivers/net/loopback.c, using per_cpu containers,



^ permalink raw reply related

* [PATCH net-next-2.6] net: use this_cpu_ptr()
From: Eric Dumazet @ 2010-06-24 10:52 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

use this_cpu_ptr(p) instead of per_cpu_ptr(p, smp_processor_id())

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/flow.c     |    4 ++--
 net/ipv4/ip_input.c |    2 +-
 net/ipv4/tcp.c      |    2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/core/flow.c b/net/core/flow.c
index 1619006..8c7c91a 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -222,7 +222,7 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 	unsigned int hash;
 
 	local_bh_disable();
-	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	fcp = this_cpu_ptr(fc->percpu);
 
 	fle = NULL;
 	flo = NULL;
@@ -302,7 +302,7 @@ static void flow_cache_flush_tasklet(unsigned long data)
 	LIST_HEAD(gc_list);
 	int i, deleted = 0;
 
-	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	fcp = this_cpu_ptr(fc->percpu);
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		hlist_for_each_entry_safe(fle, entry, tmp,
 					  &fcp->hash_table[i], u.hlist) {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index db47a5a..d859bcc 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -342,7 +342,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
 
 #ifdef CONFIG_NET_CLS_ROUTE
 	if (unlikely(skb_dst(skb)->tclassid)) {
-		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
+		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
 		u32 idx = skb_dst(skb)->tclassid;
 		st[idx&0xFF].o_packets++;
 		st[idx&0xFF].o_bytes += skb->len;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 779d40c..b9e721c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2958,7 +2958,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
 	spin_unlock(&tcp_md5sig_pool_lock);
 
 	if (p)
-		return *per_cpu_ptr(p, smp_processor_id());
+		return *this_cpu_ptr(p);
 
 	local_bh_enable();
 	return NULL;



^ permalink raw reply related

* [PATCH net-next-2.6 2/4] net: u64_stats_fetch_begin_bh() and u64_stats_fetch_retry_bh()
From: Eric Dumazet @ 2010-06-24 10:54 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

- Must disable preemption in case of 32bit UP in u64_stats_fetch_begin()
and u64_stats_fetch_retry()

- Add new u64_stats_fetch_begin_bh() and u64_stats_fetch_retry_bh() for
network usage, disabling BH on 32bit UP only.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/u64_stats_sync.h |   59 +++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index 201d319..00c1592 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -27,6 +27,9 @@
  *    (On UP, there is no seqcount_t protection, a reader allowing interrupts could
  *     read partial values)
  *
+ * 7) For softirq uses, readers can use u64_stats_fetch_begin_bh() and
+ *    u64_stats_fetch_retry_bh() helpers
+ *
  * Usage :
  *
  * Stats producer (writer) should use following template granted it already got
@@ -58,54 +61,80 @@
  */
 #include <linux/seqlock.h>
 
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 struct u64_stats_sync {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	seqcount_t	seq;
+#endif
 };
 
 static void inline u64_stats_update_begin(struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	write_seqcount_begin(&syncp->seq);
+#endif
 }
 
 static void inline u64_stats_update_end(struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	write_seqcount_end(&syncp->seq);
+#endif
 }
 
 static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_begin(&syncp->seq);
+#else
+#if BITS_PER_LONG==32
+	preempt_disable();
+#endif
+	return 0;
+#endif
 }
 
 static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_retry(&syncp->seq, start);
-}
-
 #else
-struct u64_stats_sync {
-};
-
-static void inline u64_stats_update_begin(struct u64_stats_sync *syncp)
-{
-}
-
-static void inline u64_stats_update_end(struct u64_stats_sync *syncp)
-{
+#if BITS_PER_LONG==32
+	preempt_enable();
+#endif
+	return false;
+#endif
 }
 
-static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+/*
+ * In case softirq handlers can update u64 counters, readers can use following helpers
+ * - SMP 32bit arches use seqcount protection, irq safe.
+ * - UP 32bit must disable BH.
+ * - 64bit have no problem atomically reading u64 values, irq safe.
+ */
+static unsigned int inline u64_stats_fetch_begin_bh(const struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	return read_seqcount_begin(&syncp->seq);
+#else
+#if BITS_PER_LONG==32
+	local_bh_disable();
+#endif
 	return 0;
+#endif
 }
 
-static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
+static bool inline u64_stats_fetch_retry_bh(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	return read_seqcount_retry(&syncp->seq, start);
+#else
+#if BITS_PER_LONG==32
+	local_bh_enable();
+#endif
 	return false;
-}
 #endif
+}
 
 #endif /* _LINUX_U64_STATS_SYNC_H */



^ permalink raw reply related

* [PATCH net-next-2.6 3/4] macvlan: 64 bit rx counters
From: Eric Dumazet @ 2010-06-24 10:54 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev

Use u64_stats_sync infrastructure to implement 64bit stats.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/macvlan.c      |   37 +++++++++++++++++++++--------------
 include/linux/if_macvlan.h |   19 +++++++++++------
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index e096875..e6d626e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -431,29 +431,38 @@ static void macvlan_uninit(struct net_device *dev)
 	free_percpu(vlan->rx_stats);
 }
 
-static struct net_device_stats *macvlan_dev_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev)
 {
-	struct net_device_stats *stats = &dev->stats;
+	struct rtnl_link_stats64 *stats = &dev->stats64;
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
-	dev_txq_stats_fold(dev, stats);
+	dev_txq_stats_fold(dev, &dev->stats);
 
 	if (vlan->rx_stats) {
-		struct macvlan_rx_stats *p, rx = {0};
+		struct macvlan_rx_stats *p, accum = {0};
+		u64 rx_packets, rx_bytes, rx_multicast;
+		unsigned int start;
 		int i;
 
 		for_each_possible_cpu(i) {
 			p = per_cpu_ptr(vlan->rx_stats, i);
-			rx.rx_packets += p->rx_packets;
-			rx.rx_bytes   += p->rx_bytes;
-			rx.rx_errors  += p->rx_errors;
-			rx.multicast  += p->multicast;
+			do {
+				start = u64_stats_fetch_begin_bh(&p->syncp);
+				rx_packets	= p->rx_packets;
+				rx_bytes	= p->rx_bytes;
+				rx_multicast	= p->rx_multicast;
+			} while (u64_stats_fetch_retry_bh(&p->syncp, start));
+			accum.rx_packets	+= rx_packets;
+			accum.rx_bytes		+= rx_bytes;
+			accum.rx_multicast	+= rx_multicast;
+			/* rx_errors is an ulong, updated without syncp protection */
+			accum.rx_errors		+= p->rx_errors;
 		}
-		stats->rx_packets = rx.rx_packets;
-		stats->rx_bytes   = rx.rx_bytes;
-		stats->rx_errors  = rx.rx_errors;
-		stats->rx_dropped = rx.rx_errors;
-		stats->multicast  = rx.multicast;
+		stats->rx_packets = accum.rx_packets;
+		stats->rx_bytes   = accum.rx_bytes;
+		stats->rx_errors  = accum.rx_errors;
+		stats->rx_dropped = accum.rx_errors;
+		stats->multicast  = accum.rx_multicast;
 	}
 	return stats;
 }
@@ -502,7 +511,7 @@ static const struct net_device_ops macvlan_netdev_ops = {
 	.ndo_change_rx_flags	= macvlan_change_rx_flags,
 	.ndo_set_mac_address	= macvlan_set_mac_address,
 	.ndo_set_multicast_list	= macvlan_set_multicast_list,
-	.ndo_get_stats		= macvlan_dev_get_stats,
+	.ndo_get_stats64	= macvlan_dev_get_stats64,
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index c26a0e4..e24ce6e 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -6,6 +6,7 @@
 #include <linux/netdevice.h>
 #include <linux/netlink.h>
 #include <net/netlink.h>
+#include <linux/u64_stats_sync.h>
 
 #if defined(CONFIG_MACVTAP) || defined(CONFIG_MACVTAP_MODULE)
 struct socket *macvtap_get_socket(struct file *);
@@ -27,14 +28,16 @@ struct macvtap_queue;
  *	struct macvlan_rx_stats - MACVLAN percpu rx stats
  *	@rx_packets: number of received packets
  *	@rx_bytes: number of received bytes
- *	@multicast: number of received multicast packets
+ *	@rx_multicast: number of received multicast packets
+ *	@syncp: synchronization point for 64bit counters
  *	@rx_errors: number of errors
  */
 struct macvlan_rx_stats {
-	unsigned long rx_packets;
-	unsigned long rx_bytes;
-	unsigned long multicast;
-	unsigned long rx_errors;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_multicast;
+	struct u64_stats_sync	syncp;
+	unsigned long		rx_errors;
 };
 
 struct macvlan_dev {
@@ -56,12 +59,14 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
 {
 	struct macvlan_rx_stats *rx_stats;
 
-	rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
+	rx_stats = this_cpu_ptr(vlan->rx_stats);
 	if (likely(success)) {
+		u64_stats_update_begin(&rx_stats->syncp);
 		rx_stats->rx_packets++;;
 		rx_stats->rx_bytes += len;
 		if (multicast)
-			rx_stats->multicast++;
+			rx_stats->rx_multicast++;
+		u64_stats_update_end(&rx_stats->syncp);
 	} else {
 		rx_stats->rx_errors++;
 	}



^ permalink raw reply related

* [PATCH net-next-2.6 4/4] vlan: 64 bit rx counters
From: Eric Dumazet @ 2010-06-24 10:55 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev

Use u64_stats_sync infrastructure to implement 64bit rx stats.

(tx stats are addressed later)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/8021q/vlan.h      |   13 ++++++-----
 net/8021q/vlan_core.c |    7 +++---
 net/8021q/vlan_dev.c  |   46 +++++++++++++++++++++++++---------------
 3 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 6abdcac..8d9503a 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -2,6 +2,7 @@
 #define __BEN_VLAN_802_1Q_INC__
 
 #include <linux/if_vlan.h>
+#include <linux/u64_stats_sync.h>
 
 
 /**
@@ -21,14 +22,16 @@ struct vlan_priority_tci_mapping {
  *	struct vlan_rx_stats - VLAN percpu rx stats
  *	@rx_packets: number of received packets
  *	@rx_bytes: number of received bytes
- *	@multicast: number of received multicast packets
+ *	@rx_multicast: number of received multicast packets
+ *	@syncp: synchronization point for 64bit counters
  *	@rx_errors: number of errors
  */
 struct vlan_rx_stats {
-	unsigned long rx_packets;
-	unsigned long rx_bytes;
-	unsigned long multicast;
-	unsigned long rx_errors;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_multicast;
+	struct u64_stats_sync	syncp;
+	unsigned long		rx_errors;
 };
 
 /**
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 50f58f5..1b9406a 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -41,9 +41,9 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
 	skb->priority = vlan_get_ingress_priority(dev, skb->vlan_tci);
 	skb->vlan_tci = 0;
 
-	rx_stats = per_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats,
-			       smp_processor_id());
+	rx_stats = this_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats);
 
+	u64_stats_update_begin(&rx_stats->syncp);
 	rx_stats->rx_packets++;
 	rx_stats->rx_bytes += skb->len;
 
@@ -51,7 +51,7 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
 	case PACKET_BROADCAST:
 		break;
 	case PACKET_MULTICAST:
-		rx_stats->multicast++;
+		rx_stats->rx_multicast++;
 		break;
 	case PACKET_OTHERHOST:
 		/* Our lower layer thinks this is not local, let's make sure.
@@ -62,6 +62,7 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
 			skb->pkt_type = PACKET_HOST;
 		break;
 	}
+	u64_stats_update_end(&rx_stats->syncp);
 	return 0;
 }
 
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 5298426..c6456cb 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -166,6 +166,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 
 	rx_stats = per_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats,
 			       smp_processor_id());
+	u64_stats_update_begin(&rx_stats->syncp);
 	rx_stats->rx_packets++;
 	rx_stats->rx_bytes += skb->len;
 
@@ -182,7 +183,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 		break;
 
 	case PACKET_MULTICAST:
-		rx_stats->multicast++;
+		rx_stats->rx_multicast++;
 		break;
 
 	case PACKET_OTHERHOST:
@@ -197,6 +198,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	default:
 		break;
 	}
+	u64_stats_update_end(&rx_stats->syncp);
 
 	vlan_set_encap_proto(skb, vhdr);
 
@@ -801,27 +803,37 @@ static u32 vlan_ethtool_get_flags(struct net_device *dev)
 	return dev_ethtool_get_flags(vlan->real_dev);
 }
 
-static struct net_device_stats *vlan_dev_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev)
 {
-	struct net_device_stats *stats = &dev->stats;
+	struct rtnl_link_stats64 *stats = &dev->stats64;
 
-	dev_txq_stats_fold(dev, stats);
+	dev_txq_stats_fold(dev, &dev->stats);
 
 	if (vlan_dev_info(dev)->vlan_rx_stats) {
-		struct vlan_rx_stats *p, rx = {0};
+		struct vlan_rx_stats *p, accum = {0};
 		int i;
 
 		for_each_possible_cpu(i) {
+			u64 rxpackets, rxbytes, rxmulticast;
+			unsigned int start;
+
 			p = per_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats, i);
-			rx.rx_packets += p->rx_packets;
-			rx.rx_bytes   += p->rx_bytes;
-			rx.rx_errors  += p->rx_errors;
-			rx.multicast  += p->multicast;
+			do {
+				start = u64_stats_fetch_begin_bh(&p->syncp);
+				rxpackets	= p->rx_packets;
+				rxbytes		= p->rx_bytes;
+				rxmulticast	= p->rx_multicast;
+			} while (u64_stats_fetch_retry_bh(&p->syncp, start));
+			accum.rx_packets += rxpackets;
+			accum.rx_bytes   += rxbytes;
+			accum.rx_multicast += rxmulticast;
+			/* rx_errors is an ulong, not protected by syncp */
+			accum.rx_errors  += p->rx_errors;
 		}
-		stats->rx_packets = rx.rx_packets;
-		stats->rx_bytes   = rx.rx_bytes;
-		stats->rx_errors  = rx.rx_errors;
-		stats->multicast  = rx.multicast;
+		stats->rx_packets = accum.rx_packets;
+		stats->rx_bytes   = accum.rx_bytes;
+		stats->rx_errors  = accum.rx_errors;
+		stats->multicast  = accum.rx_multicast;
 	}
 	return stats;
 }
@@ -848,7 +860,7 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -872,7 +884,7 @@ static const struct net_device_ops vlan_netdev_accel_ops = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -897,7 +909,7 @@ static const struct net_device_ops vlan_netdev_ops_sq = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -922,7 +934,7 @@ static const struct net_device_ops vlan_netdev_accel_ops_sq = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,



^ permalink raw reply related

* [PATCH net-next-2.6] tcp: tso_fragment() might avoid GFP_ATOMIC
From: Eric Dumazet @ 2010-06-24 11:00 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

We can pass a gfp argument to tso_fragment() and avoid GFP_ATOMIC
allocations sometimes.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/tcp_output.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 51d316d..25ff62e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1460,7 +1460,7 @@ int tcp_may_send_now(struct sock *sk)
  * packet has never been sent out before (and thus is not cloned).
  */
 static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
-			unsigned int mss_now)
+			unsigned int mss_now, gfp_t gfp)
 {
 	struct sk_buff *buff;
 	int nlen = skb->len - len;
@@ -1470,7 +1470,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	if (skb->len != skb->data_len)
 		return tcp_fragment(sk, skb, len, mss_now);
 
-	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
+	buff = sk_stream_alloc_skb(sk, 0, gfp);
 	if (unlikely(buff == NULL))
 		return -ENOMEM;
 
@@ -1768,7 +1768,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						    cwnd_quota);
 
 		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
 			break;
 
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;



^ permalink raw reply related

* [2.6.35-rc3] NFS: possible irq lock inversion dependency
From: Tetsuo Handa @ 2010-06-24 11:53 UTC (permalink / raw)
  To: linux-fsdevel, netdev

Hello.

I sometimes get the warning below when the system is about to reboot/halt.
Has this already been reported? If not, I'll try to establish steps to reproduce it.

----- Dump 1 -----

[  508.594713] nfsd: last server has exited, flushing export cache
[  509.100525] 
[  509.100529] =========================================================
[  509.102129] [ INFO: possible irq lock inversion dependency detected ]
<4>[  509.102513]                                       [<c103f69f>] sys_exit_group+0xf/0x20
<4>[  509.102513]                                       [<c13327e1>] syscall_call+0x7/0xb
[  509.102513]  }
[  509.102513]  ... key      at: [<c1cbfd90>] af_callback_keys+0x10/0x130
[  509.102513]  ... acquired at:
[  509.102513]    [<c10656c6>] check_usage_backwards+0x76/0xd0
[  509.102513]    [<c10658d9>] mark_lock_irq+0x99/0x240
[  509.102513]    [<c106657c>] mark_lock+0x21c/0x3c0
[  509.102513]    [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]    [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]    [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]    [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]    [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]    [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]    [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]    [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]    [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]    [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]    [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]    [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]    [<c1052a65>] kthread+0x75/0x80
[  509.102513]    [<c100317a>] kernel_thread_helper+0x6/0x1c
[  509.102513] 
[  509.102513] 
[  509.102513] stack backtrace:
[  509.102513] Pid: 337, comm: rpciod/1 Not tainted 2.6.35-rc3-ccs #6
[  509.102513] Call Trace:
[  509.102513]  [<c103cbe8>] ? printk+0x18/0x20
[  509.102513]  [<c1065558>] print_irq_inversion_bug+0x108/0x120
[  509.102513]  [<c10656c6>] check_usage_backwards+0x76/0xd0
[  509.102513]  [<c10658d9>] mark_lock_irq+0x99/0x240
[  509.102513]  [<c1065650>] ? check_usage_backwards+0x0/0xd0
[  509.102513]  [<c106657c>] mark_lock+0x21c/0x3c0
[  509.102513]  [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]  [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]  [<c10682da>] ? __lock_is_held+0x3a/0x60
[  509.102513]  [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c12cf3bf>] ? tcp_send_fin+0x4f/0xc0
[  509.102513]  [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  509.102513]  [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]  [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]  [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]  [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]  [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  509.102513]  [<c1307910>] ? xprt_autoclose+0x0/0x50
[  509.102513]  [<c1052e8a>] ? prepare_to_wait+0x3a/0x60
[  509.102513]  [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  509.102513]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  509.102513]  [<c1035456>] ? complete+0x46/0x60
[  509.102513]  [<c1052a65>] kthread+0x75/0x80
[  509.102513]  [<c104eeb0>] ? worker_thread+0x0/0xd0
[  509.102513]  [<c10529f0>] ? kthread+0x0/0x80
[  509.102513]  [<c10529f0>] ? kthread+0x0/0x80
[  509.102513]  [<c100317a>] kernel_thread_helper+0x6/0x1c
9>] _raw_spin_lock+0x39/0x70
[  509.102513]                                          [<c128ae51>] sk_clone+0xb1/0x2c0
[  509.102513]                                          [<c12c0046>] inet_csk_clone+0x16/0x90
[  509.102513]                                          [<c12d519c>] tcp_create_openreq_child+0x1c/0x460
[  509.102513]                                          [<c12d2a1f>] tcp_v4_syn_recv_sock+0x3f/0x1e0
[  509.102513]                                          [<c12d576c>] tcp_check_req+0x18c/0x3b0
[  509.102513]                                          [<c12d2c0d>] tcp_v4_hnd_req+0x4d/0x160
[  509.102513]                                          [<c12d2f39>] tcp_v4_do_rcv+0x159/0x280
[  509.102513]                                          [<c12d35d4>] tcp_v4_rcv+0x574/0xa30
[  509.102513]                                          [<c12b5c4f>] ip_local_deliver_finish+0xff/0x2c0
[  509.102513]                                          [<c12b5e40>] ip_local_deliver+0x30/0x40
[  509.102513]                                          [<c12b5f99>] ip_rcv_finish+0x149/0x440
[  509.102513]                                          [<c12b63f6>] ip_rcv+0x166/0x240
[  509.102513]                                          [<c1297b0d>] __netif_receive_skb+0x1cd/0x280
[  509.102513]                                          [<c12985e8>] process_backlog+0x88/0x160
[  509.102513]                                          [<c12989b7>] net_rx_action+0x127/0x140
[  509.102513]                                          [<c1041b30>] __do_softirq+0xd0/0x130
[  509.102513]     INITIAL USE at:
[  509.102513]                                         [<c1066e16>] __lock_acquire+0x1c6/0x8e0
[  509.102513]                                         [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                         [<c1331dbe>] _raw_spin_lock_bh+0x3e/0x80
[  509.102513]                                         [<c128c4c9>] lock_sock_fast+0x29/0x90
[  509.102513]                                         [<c12dae84>] udp_destroy_sock+0x14/0x40
[  509.102513]                                         [<c128c823>] sk_common_release+0xb3/0xc0
[  509.102513]                                         [<c12db7f8>] udp_lib_close+0x8/0x10
[  509.102513]                                         [<c12e18ce>] inet_release+0xbe/0x100
[  509.102513]                                         [<c1286c36>] sock_release+0x66/0x80
[  509.102513]                                         [<c1287952>] sock_close+0x12/0x30
[  509.102513]                                         [<c10b705b>] __fput+0x1cb/0x210
[  509.102513]                                         [<c10b70b9>] fput+0x19/0x20
[  509.102513]                                         [<c10b54f3>] filp_close+0x43/0x70
[  509.102513]                                         [<c103eafb>] close_files+0xab/0x140
[  509.102513]                                         [<c103ebf9>] put_files_struct+0x29/0xf0
[  509.102513]                                         [<c103ed50>] exit_files+0x40/0x50
[  509.102513]                                         [<c103f400>] do_exit+0x100/0x2b0
[  509.102513]                                         [<c103f614>] do_group_exit+0x34/0xb0
[  509.102513]                                         [<c103f69f>] sys_exit_group+0xf/0x20
[  509.102513]                                         [<c13327e1>] syscall_call+0x7/0xb
[  509.102513]   }
[  509.102513]   ... key      at: [<c1cbfc50>] af_family_slock_keys+0x10/0x140
[  509.102513]   ... acquired at:
[  509.102513]    [<c1064a8b>] check_prevs_add+0xab/0x100
[  509.102513]    [<c1064e15>] validate_chain+0x305/0x5a0
[  509.102513]    [<c1066f03>] __lock_acquire+0x2b3/0x8e0
[  509.102513]    [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]    [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  509.102513]    [<c12c0356>] inet_csk_listen_stop+0x86/0x160
[  509.102513]    [<c12c2dc0>] tcp_close+0x350/0x360
[  509.102513]    [<c12e18ce>] inet_release+0xbe/0x100
[  509.102513]    [<c1286c36>] sock_release+0x66/0x80
[  509.102513]    [<c1287952>] sock_close+0x12/0x30
[  509.102513]    [<c10b705b>] __fput+0x1cb/0x210
[  509.102513]    [<c10b70b9>] fput+0x19/0x20
[  509.102513]    [<c10b54f3>] filp_close+0x43/0x70
[  509.102513]    [<c10b558d>] sys_close+0x6d/0x100
[  509.102513]    [<c13327e1>] syscall_call+0x7/0xb
[  509.102513] 
[  509.102513] -> (clock-AF_INET){++.?..} ops: 877 {
[  509.102513]    HARDIRQ-ON-W at:
[  509.102513]                                        [<c106625e>] mark_irqflags+0xfe/0x180
[  509.102513]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                        [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  509.102513]                                        [<c128c79f>] sk_common_release+0x2f/0xc0
[  509.102513]                                        [<c12db7f8>] udp_lib_close+0x8/0x10
[  509.102513]                                        [<c12e18ce>] inet_release+0xbe/0x100
[  509.102513]                                        [<c1286c36>] sock_release+0x66/0x80
[  509.102513]                                        [<c1287952>] sock_close+0x12/0x30
[  509.102513]                                        [<c10b705b>] __fput+0x1cb/0x210
[  509.102513]                                        [<c10b70b9>] fput+0x19/0x20
[  509.102513]                                        [<c10b54f3>] filp_close+0x43/0x70
[  509.102513]                                        [<c103eafb>] close_files+0xab/0x140
[  509.102513]                                        [<c103ebf9>] put_files_struct+0x29/0xf0
[  509.102513]                                        [<c103ed50>] exit_files+0x40/0x50
[  509.102513]                                        [<c103f400>] do_exit+0x100/0x2b0
[  509.102513]                                        [<c103f614>] do_group_exit+0x34/0xb0
[  509.102513]                                        [<c103f69f>] sys_exit_group+0xf/0x20
[  509.102513]                                        [<c13327e1>] syscall_call+0x7/0xb
[  509.102513]    HARDIRQ-ON-R at:
[  509.102513]                                        [<c10661ce>] mark_irqflags+0x6e/0x180
[  509.102513]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]                                        [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]                                        [<c12cbea8>] tcp_rcv_synsent_state_process+0x388/0x580
[  509.102513]                                        [<c12cc547>] tcp_rcv_state_process+0x4a7/0x560
[  509.102513]                                        [<c12d2e51>] tcp_v4_do_rcv+0x71/0x280
[  509.102513]                                        [<c128b786>] __release_sock+0x66/0x150
[  509.102513]                                        [<c128c497>] release_sock+0x87/0x90
[  509.102513]                                        [<c12e1cba>] inet_stream_connect+0x5a/0x1b0
[  509.102513]                                        [<c1289448>] kernel_connect+0x18/0x30
[  509.102513]                                        [<c130acce>] xs_tcp_finish_connecting+0x4e/0x120
[  509.102513]                                        [<c130adfb>] xs_tcp_setup_socket+0x5b/0x180
[  509.102513]                                        [<c130b034>] xs_tcp_connect_worker4+0x14/0x20
[  509.102513]                                        [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]                                        [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]                                        [<c1052a65>] kthread+0x75/0x80
[  509.102513]                                        [<c100317a>] kernel_thread_helper+0x6/0x1c
[  509.102513]    IN-SOFTIRQ-R at:
[  509.102513]                                        [<c106627e>] mark_irqflags+0x11e/0x180
[  509.102513]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]                                        [<c1309f81>] xs_tcp_data_ready+0x21/0x90
[  509.102513]                                        [<c12ca378>] tcp_data_queue+0x248/0x820
[  509.102513]                                        [<c12cb6ee>] tcp_rcv_established+0xae/0x4e0
[  509.102513]                                        [<c12d2fb1>] tcp_v4_do_rcv+0x1d1/0x280
[  509.102513]                                        [<c12d35d4>] tcp_v4_rcv+0x574/0xa30
[  509.102513]                                        [<c12b5c4f>] ip_local_deliver_finish+0xff/0x2c0
[  509.102513]                                        [<c12b5e40>] ip_local_deliver+0x30/0x40
[  509.102513]                                        [<c12b5f99>] ip_rcv_finish+0x149/0x440
[  509.102513]                                        [<c12b63f6>] ip_rcv+0x166/0x240
[  509.102513]                                        [<c1297b0d>] __netif_receive_skb+0x1cd/0x280
[  509.102513]                                        [<c12985e8>] process_backlog+0x88/0x160
[  509.102513]                                        [<c12989b7>] net_rx_action+0x127/0x140
[  509.102513]                                        [<c1041b30>] __do_softirq+0xd0/0x130
[  509.102513]    SOFTIRQ-ON-R at:
[  509.102513]                                        [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]                                        [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]                                        [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]                                        [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]                                        [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]                                        [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]                                        [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]                                        [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]                                        [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]                                        [<c1052a65>] kthread+0x75/0x80
[  509.102513]                                        [<c100317a>] kernel_thread_helper+0x6/0x1c
[  509.102513]    INITIAL USE at:
[  509.102513]                                       [<c1066e16>] __lock_acquire+0x1c6/0x8e0
[  509.102513]                                       [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                       [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  509.102513]                                       [<c128c79f>] sk_common_release+0x2f/0xc0
[  509.102513]                                       [<c12db7f8>] udp_lib_close+0x8/0x10
[  509.102513]                                       [<c12e18ce>] inet_release+0xbe/0x100
[  509.102513]                                       [<c1286c36>] sock_release+0x66/0x80
[  509.102513]                                       [<c1287952>] sock_close+0x12/0x30
[  509.102513]                                       [<c10b705b>] __fput+0x1cb/0x210
[  509.102513]                                       [<c10b70b9>] fput+0x19/0x20
[  509.102513]                                       [<c10b54f3>] filp_close+0x43/0x70
[  509.102513]                                       [<c103eafb>] close_files+0xab/0x140
[  509.102513]                                       [<c103ebf9>] put_files_struct+0x29/0xf0
[  509.102513]                                       [<c103ed50>] exit_files+0x40/0x50
[  509.102513]                                       [<c103f400>] do_exit+0x100/0x2b0
[  509.102513]                                       [<c103f614>] do_group_exit+0x34/0xb0
[  509.102513]                                       [<c103f69f>] sys_exit_group+0xf/0x20
[  509.102513]                                       [<c13327e1>] syscall_call+0x7/0xb
[  509.102513]  }
[  509.102513]  ... key      at: [<c1cbfd90>] af_callback_keys+0x10/0x130
[  509.102513]  ... acquired at:
[  509.102513]    [<c10656c6>] check_usage_backwards+0x76/0xd0
[  509.102513]    [<c10658d9>] mark_lock_irq+0x99/0x240
[  509.102513]    [<c106657c>] mark_lock+0x21c/0x3c0
[  509.102513]    [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]    [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]    [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]    [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]    [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]    [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]    [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]    [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]    [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]    [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]    [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]    [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]    [<c1052a65>] kthread+0x75/0x80
[  509.102513]    [<c100317a>] kernel_thread_helper+0x6/0x1c
[  509.102513] 
[  509.102513] 
[  509.102513] stack backtrace:
[  509.102513] Pid: 337, comm: rpciod/1 Not tainted 2.6.35-rc3-ccs #6
[  509.102513] Call Trace:
[  509.102513]  [<c103cbe8>] ? printk+0x18/0x20
[  509.102513]  [<c1065558>] print_irq_inversion_bug+0x108/0x120
[  509.102513]  [<c10656c6>] check_usage_backwards+0x76/0xd0
[  509.102513]  [<c10658d9>] mark_lock_irq+0x99/0x240
[  509.102513]  [<c1065650>] ? check_usage_backwards+0x0/0xd0
[  509.102513]  [<c106657c>] mark_lock+0x21c/0x3c0
[  509.102513]  [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]  [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]  [<c10682da>] ? __lock_is_held+0x3a/0x60
[  509.102513]  [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c12cf3bf>] ? tcp_send_fin+0x4f/0xc0
[  509.102513]  [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  509.102513]  [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]  [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]  [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]  [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]  [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  509.102513]  [<c1307910>] ? xprt_autoclose+0x0/0x50
[  509.102513]  [<c1052e8a>] ? prepare_to_wait+0x3a/0x60
[  509.102513]  [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  509.102513]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  509.102513]  [<c1035456>] ? complete+0x46/0x60
[  509.102513]  [<c1052a65>] kthread+0x75/0x80
[  509.102513]  [<c104eeb0>] ? worker_thread+0x0/0xd0
[  509.102513]  [<c10529f0>] ? kthread+0x0/0x80
[  509.102513]  [<c10529f0>] ? kthread+0x0/0x80
[  509.102513]  [<c100317a>] kernel_thread_helper+0x6/0x1c
[  518.099361] ACPI: Preparing to enter system sleep state S5
[  518.101223] Disabling non-boot CPUs ...
[  518.607480] lockdep: fixing up alternatives.
[  518.608334] SMP alternatives: switching to UP code
[  518.908243] Power down.
[  518.909864] acpi_power_off called




----- Dump 2 -----

[  974.096047] nfsd: last server has exited, flushing export cache
[  975.514620] 
[  975.514622] =========================================================
[  975.516172] [ INFO: possible irq lock inversion dependency detected ]
x34/0xb0
<4>[  975.517504]                                       [<c103f69f>] sys_exit_group+0xf/0x20
[  975.517507]                                       [<c13327e1>] syscall_call+0x7/0xb
[  975.517510]  }
[  975.517511]  ... key      at: [<c1cbfd90>] af_callback_keys+0x10/0x130
[  975.517513]  ... acquired at:
[  975.517515]    [<c10656c6>] check_usage_backwards+0x76/0xd0
[  975.517517]    [<c10658d9>] mark_lock_irq+0x99/0x240
[  975.517519]    [<c106657c>] mark_lock+0x21c/0x3c0
[  975.517522]    [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517524]    [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517526]    [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517529]    [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517531]    [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517534]    [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517536]    [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517539]    [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517541]    [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517544]    [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517546]    [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517549]    [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517551]    [<c1052a65>] kthread+0x75/0x80
[  975.517553]    [<c100317a>] kernel_thread_helper+0x6/0x1c
[  975.517556] 
[  975.517570] 
[  975.517571] stack backtrace:
[  975.517628] Pid: 337, comm: rpciod/1 Not tainted 2.6.35-rc3-ccs #6
[  975.517648] Call Trace:
[  975.517692]  [<c103cbe8>] ? printk+0x18/0x20
[  975.517704]  [<c1065558>] print_irq_inversion_bug+0x108/0x120
[  975.517708]  [<c10656c6>] check_usage_backwards+0x76/0xd0
[  975.517711]  [<c10658d9>] mark_lock_irq+0x99/0x240
[  975.517713]  [<c1065650>] ? check_usage_backwards+0x0/0xd0
[  975.517716]  [<c106657c>] mark_lock+0x21c/0x3c0
[  975.517718]  [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517721]  [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517723]  [<c10682da>] ? __lock_is_held+0x3a/0x60
[  975.517726]  [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517729]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  975.517732]  [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517735]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  975.517737]  [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517740]  [<c12cf3bf>] ? tcp_send_fin+0x4f/0xc0
[  975.517743]  [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517745]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  975.517748]  [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517751]  [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517753]  [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517756]  [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517758]  [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517761]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  975.517763]  [<c1307910>] ? xprt_autoclose+0x0/0x50
[  975.517766]  [<c1052e8a>] ? prepare_to_wait+0x3a/0x60
[  975.517769]  [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517771]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  975.517774]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  975.517779]  [<c1035456>] ? complete+0x46/0x60
[  975.517782]  [<c1052a65>] kthread+0x75/0x80
[  975.517784]  [<c104eeb0>] ? worker_thread+0x0/0xd0
[  975.517786]  [<c10529f0>] ? kthread+0x0/0x80
[  975.517789]  [<c10529f0>] ? kthread+0x0/0x80
[  975.517791]  [<c100317a>] kernel_thread_helper+0x6/0x1c
9>] _raw_spin_lock+0x39/0x70
[  975.517015]                                          [<c128ae51>] sk_clone+0xb1/0x2c0
[  975.517018]                                          [<c12c0046>] inet_csk_clone+0x16/0x90
[  975.517022]                                          [<c12d519c>] tcp_create_openreq_child+0x1c/0x460
[  975.517025]                                          [<c12d2a1f>] tcp_v4_syn_recv_sock+0x3f/0x1e0
[  975.517028]                                          [<c12d576c>] tcp_check_req+0x18c/0x3b0
[  975.517031]                                          [<c12d2c0d>] tcp_v4_hnd_req+0x4d/0x160
[  975.517033]                                          [<c12d2f39>] tcp_v4_do_rcv+0x159/0x280
[  975.517036]                                          [<c12d35d4>] tcp_v4_rcv+0x574/0xa30
[  975.517038]                                          [<c12b5c4f>] ip_local_deliver_finish+0xff/0x2c0
[  975.517042]                                          [<c12b5e40>] ip_local_deliver+0x30/0x40
[  975.517045]                                          [<c12b5f99>] ip_rcv_finish+0x149/0x440
[  975.517048]                                          [<c12b63f6>] ip_rcv+0x166/0x240
[  975.517050]                                          [<c1297b0d>] __netif_receive_skb+0x1cd/0x280
[  975.517054]                                          [<c12985e8>] process_backlog+0x88/0x160
[  975.517057]                                          [<c12989b7>] net_rx_action+0x127/0x140
[  975.517060]                                          [<c1041b30>] __do_softirq+0xd0/0x130
[  975.517063]     INITIAL USE at:
[  975.517065]                                         [<c1066e16>] __lock_acquire+0x1c6/0x8e0
[  975.517067]                                         [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517070]                                         [<c1331dbe>] _raw_spin_lock_bh+0x3e/0x80
[  975.517073]                                         [<c128c4c9>] lock_sock_fast+0x29/0x90
[  975.517076]                                         [<c12dae84>] udp_destroy_sock+0x14/0x40
[  975.517079]                                         [<c128c823>] sk_common_release+0xb3/0xc0
[  975.517083]                                         [<c12db7f8>] udp_lib_close+0x8/0x10
[  975.517085]                                         [<c12e18ce>] inet_release+0xbe/0x100
[  975.517088]                                         [<c1286c36>] sock_release+0x66/0x80
[  975.517091]                                         [<c1287952>] sock_close+0x12/0x30
[  975.517094]                                         [<c10b705b>] __fput+0x1cb/0x210
[  975.517097]                                         [<c10b70b9>] fput+0x19/0x20
[  975.517099]                                         [<c10b54f3>] filp_close+0x43/0x70
[  975.517102]                                         [<c103eafb>] close_files+0xab/0x140
[  975.517105]                                         [<c103ebf9>] put_files_struct+0x29/0xf0
[  975.517108]                                         [<c103ed50>] exit_files+0x40/0x50
[  975.517111]                                         [<c103f400>] do_exit+0x100/0x2b0
[  975.517114]                                         [<c103f614>] do_group_exit+0x34/0xb0
[  975.517117]                                         [<c103f69f>] sys_exit_group+0xf/0x20
[  975.517120]                                         [<c13327e1>] syscall_call+0x7/0xb
[  975.517130]   }
[  975.517139]   ... key      at: [<c1cbfc50>] af_family_slock_keys+0x10/0x140
[  975.517180]   ... acquired at:
[  975.517187]    [<c1064a8b>] check_prevs_add+0xab/0x100
[  975.517190]    [<c1064e15>] validate_chain+0x305/0x5a0
[  975.517193]    [<c1066f03>] __lock_acquire+0x2b3/0x8e0
[  975.517195]    [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517198]    [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  975.517200]    [<c12c0356>] inet_csk_listen_stop+0x86/0x160
[  975.517203]    [<c12c2dc0>] tcp_close+0x350/0x360
[  975.517205]    [<c12e18ce>] inet_release+0xbe/0x100
[  975.517208]    [<c1286c36>] sock_release+0x66/0x80
[  975.517210]    [<c1287952>] sock_close+0x12/0x30
[  975.517212]    [<c10b705b>] __fput+0x1cb/0x210
[  975.517215]    [<c10b70b9>] fput+0x19/0x20
[  975.517217]    [<c10b54f3>] filp_close+0x43/0x70
[  975.517220]    [<c10b558d>] sys_close+0x6d/0x100
[  975.517222]    [<c13327e1>] syscall_call+0x7/0xb
[  975.517230] 
[  975.517240] -> (clock-AF_INET){++.?..} ops: 879 {
[  975.517244]    HARDIRQ-ON-W at:
[  975.517246]                                        [<c106625e>] mark_irqflags+0xfe/0x180
[  975.517249]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517252]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517254]                                        [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  975.517257]                                        [<c128c79f>] sk_common_release+0x2f/0xc0
[  975.517260]                                        [<c12db7f8>] udp_lib_close+0x8/0x10
[  975.517263]                                        [<c12e18ce>] inet_release+0xbe/0x100
[  975.517266]                                        [<c1286c36>] sock_release+0x66/0x80
[  975.517269]                                        [<c1287952>] sock_close+0x12/0x30
[  975.517271]                                        [<c10b705b>] __fput+0x1cb/0x210
[  975.517274]                                        [<c10b70b9>] fput+0x19/0x20
[  975.517277]                                        [<c10b54f3>] filp_close+0x43/0x70
[  975.517280]                                        [<c103eafb>] close_files+0xab/0x140
[  975.517283]                                        [<c103ebf9>] put_files_struct+0x29/0xf0
[  975.517286]                                        [<c103ed50>] exit_files+0x40/0x50
[  975.517289]                                        [<c103f400>] do_exit+0x100/0x2b0
[  975.517291]                                        [<c103f614>] do_group_exit+0x34/0xb0
[  975.517294]                                        [<c103f69f>] sys_exit_group+0xf/0x20
[  975.517297]                                        [<c13327e1>] syscall_call+0x7/0xb
[  975.517300]    HARDIRQ-ON-R at:
[  975.517302]                                        [<c10661ce>] mark_irqflags+0x6e/0x180
[  975.517304]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517307]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517310]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517312]                                        [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517316]                                        [<c12cbea8>] tcp_rcv_synsent_state_process+0x388/0x580
[  975.517319]                                        [<c12cc547>] tcp_rcv_state_process+0x4a7/0x560
[  975.517322]                                        [<c12d2e51>] tcp_v4_do_rcv+0x71/0x280
[  975.517325]                                        [<c128b786>] __release_sock+0x66/0x150
[  975.517327]                                        [<c128c497>] release_sock+0x87/0x90
[  975.517330]                                        [<c12e1cba>] inet_stream_connect+0x5a/0x1b0
[  975.517333]                                        [<c1289448>] kernel_connect+0x18/0x30
[  975.517336]                                        [<c130acce>] xs_tcp_finish_connecting+0x4e/0x120
[  975.517339]                                        [<c130adfb>] xs_tcp_setup_socket+0x5b/0x180
[  975.517342]                                        [<c130b034>] xs_tcp_connect_worker4+0x14/0x20
[  975.517344]                                        [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517347]                                        [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517350]                                        [<c1052a65>] kthread+0x75/0x80
[  975.517353]                                        [<c100317a>] kernel_thread_helper+0x6/0x1c
[  975.517358]    IN-SOFTIRQ-R at:
[  975.517359]                                        [<c106627e>] mark_irqflags+0x11e/0x180
[  975.517364]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517367]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517369]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517372]                                        [<c1309f81>] xs_tcp_data_ready+0x21/0x90
[  975.517375]                                        [<c12ca378>] tcp_data_queue+0x248/0x820
[  975.517378]                                        [<c12cb6ee>] tcp_rcv_established+0xae/0x4e0
[  975.517381]                                        [<c12d2fb1>] tcp_v4_do_rcv+0x1d1/0x280
[  975.517384]                                        [<c12d35d4>] tcp_v4_rcv+0x574/0xa30
[  975.517386]                                        [<c12b5c4f>] ip_local_deliver_finish+0xff/0x2c0
[  975.517389]                                        [<c12b5e40>] ip_local_deliver+0x30/0x40
[  975.517392]                                        [<c12b5f99>] ip_rcv_finish+0x149/0x440
[  975.517395]                                        [<c12b63f6>] ip_rcv+0x166/0x240
[  975.517398]                                        [<c1297b0d>] __netif_receive_skb+0x1cd/0x280
[  975.517401]                                        [<c12985e8>] process_backlog+0x88/0x160
[  975.517404]                                        [<c12989b7>] net_rx_action+0x127/0x140
[  975.517407]                                        [<c1041b30>] __do_softirq+0xd0/0x130
[  975.517411]    SOFTIRQ-ON-R at:
[  975.517412]                                        [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517415]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517418]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517421]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517423]                                        [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517427]                                        [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517429]                                        [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517432]                                        [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517435]                                        [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517438]                                        [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517442]                                        [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517445]                                        [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517448]                                        [<c1052a65>] kthread+0x75/0x80
[  975.517450]                                        [<c100317a>] kernel_thread_helper+0x6/0x1c
[  975.517453]    INITIAL USE at:
[  975.517455]                                       [<c1066e16>] __lock_acquire+0x1c6/0x8e0
[  975.517457]                                       [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517460]                                       [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  975.517463]                                       [<c128c79f>] sk_common_release+0x2f/0xc0
[  975.517466]                                       [<c12db7f8>] udp_lib_close+0x8/0x10
[  975.517469]                                       [<c12e18ce>] inet_release+0xbe/0x100
[  975.517471]                                       [<c1286c36>] sock_release+0x66/0x80
[  975.517474]                                       [<c1287952>] sock_close+0x12/0x30
[  975.517477]                                       [<c10b705b>] __fput+0x1cb/0x210
[  975.517480]                                       [<c10b70b9>] fput+0x19/0x20
[  975.517483]                                       [<c10b54f3>] filp_close+0x43/0x70
[  975.517485]                                       [<c103eafb>] close_files+0xab/0x140
[  975.517488]                                       [<c103ebf9>] put_files_struct+0x29/0xf0
[  975.517495]                                       [<c103ed50>] exit_files+0x40/0x50
[  975.517498]                                       [<c103f400>] do_exit+0x100/0x2b0
[  975.517501]                                       [<c103f614>] do_group_exit+0x34/0xb0
[  975.517504]                                       [<c103f69f>] sys_exit_group+0xf/0x20
[  975.517507]                                       [<c13327e1>] syscall_call+0x7/0xb
[  975.517510]  }
[  975.517511]  ... key      at: [<c1cbfd90>] af_callback_keys+0x10/0x130
[  975.517513]  ... acquired at:
[  975.517515]    [<c10656c6>] check_usage_backwards+0x76/0xd0
[  975.517517]    [<c10658d9>] mark_lock_irq+0x99/0x240
[  975.517519]    [<c106657c>] mark_lock+0x21c/0x3c0
[  975.517522]    [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517524]    [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517526]    [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517529]    [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517531]    [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517534]    [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517536]    [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517539]    [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517541]    [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517544]    [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517546]    [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517549]    [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517551]    [<c1052a65>] kthread+0x75/0x80
[  975.517553]    [<c100317a>] kernel_thread_helper+0x6/0x1c
[  975.517556] 
[  975.517570] 
[  975.517571] stack backtrace:
[  975.517628] Pid: 337, comm: rpciod/1 Not tainted 2.6.35-rc3-ccs #6
[  975.517648] Call Trace:
[  975.517692]  [<c103cbe8>] ? printk+0x18/0x20
[  975.517704]  [<c1065558>] print_irq_inversion_bug+0x108/0x120
[  975.517708]  [<c10656c6>] check_usage_backwards+0x76/0xd0
[  975.517711]  [<c10658d9>] mark_lock_irq+0x99/0x240
[  975.517713]  [<c1065650>] ? check_usage_backwards+0x0/0xd0
[  975.517716]  [<c106657c>] mark_lock+0x21c/0x3c0
[  975.517718]  [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517721]  [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517723]  [<c10682da>] ? __lock_is_held+0x3a/0x60
[  975.517726]  [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517729]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  975.517732]  [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517735]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  975.517737]  [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517740]  [<c12cf3bf>] ? tcp_send_fin+0x4f/0xc0
[  975.517743]  [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517745]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  975.517748]  [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517751]  [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517753]  [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517756]  [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517758]  [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517761]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  975.517763]  [<c1307910>] ? xprt_autoclose+0x0/0x50
[  975.517766]  [<c1052e8a>] ? prepare_to_wait+0x3a/0x60
[  975.517769]  [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517771]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  975.517774]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  975.517779]  [<c1035456>] ? complete+0x46/0x60
[  975.517782]  [<c1052a65>] kthread+0x75/0x80
[  975.517784]  [<c104eeb0>] ? worker_thread+0x0/0xd0
[  975.517786]  [<c10529f0>] ? kthread+0x0/0x80
[  975.517789]  [<c10529f0>] ? kthread+0x0/0x80
[  975.517791]  [<c100317a>] kernel_thread_helper+0x6/0x1c
[  984.626828] Restarting system.
[  984.627458] machine restart

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/2] 3c59x: Use fine-grained locks for MII and windowed register access
From: Steffen Klassert @ 2010-06-24 12:05 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David Miller, netdev, Chase Douglas, Arne Nordmark
In-Reply-To: <1277337341.26161.18.camel@localhost>

Hi.

On Thu, Jun 24, 2010 at 12:55:41AM +0100, Ben Hutchings wrote:
> This avoids scheduling in atomic context and also means that IRQs
> will only be deferred for relatively short periods of time.
> 
> Previously discussed in:
> http://article.gmane.org/gmane.linux.network/155024
> 
> Reported-by: Arne Nordmark <nordmark@mech.kth.se>
> Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
> Tested-by: Arne Nordmark <nordmark@mech.kth.se> [against 2.6.32]
> ---
>  drivers/net/3c59x.c |   66 ++++++++++++++++++++++++++++++---------------------
>  1 files changed, 39 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
> index beddef9..f4a3fb1 100644
> --- a/drivers/net/3c59x.c
> +++ b/drivers/net/3c59x.c
> @@ -644,9 +644,15 @@ struct vortex_private {
>  	u16 deferred;						/* Resend these interrupts when we
>  										 * bale from the ISR */
>  	u16 io_size;						/* Size of PCI region (for release_region) */
> -	spinlock_t lock;					/* Serialise access to device & its vortex_private */
> -	struct mii_if_info mii;				/* MII lib hooks/info */
> -	int window;					/* Register window */
> +
> +	/* Serialises access to hardware other than MII and variables below.
> +	 * The lock hierarchy is rtnl_lock > lock > mii_lock > window_lock. */
> +	spinlock_t lock;
> +
> +	spinlock_t mii_lock;		/* Serialises access to MII */
> +	struct mii_if_info mii;		/* MII lib hooks/info */
> +	spinlock_t window_lock;		/* Serialises access to windowed regs */

You should initialize the new locks properly with spin_lock_init().

> +	int window;			/* Register window */
>  };
>  
>  static void window_set(struct vortex_private *vp, int window)
> @@ -661,15 +667,23 @@ static void window_set(struct vortex_private *vp, int window)
>  static u ## size							\
>  window_read ## size(struct vortex_private *vp, int window, int addr)	\
>  {									\
> +	unsigned long flags;						\
> +	u ## size ret;							\
> +	spin_lock_irqsave(&vp->window_lock, flags);			\
>  	window_set(vp, window);						\
> -	return ioread ## size(vp->ioaddr + addr);			\
> +	ret = ioread ## size(vp->ioaddr + addr);			\
> +	spin_unlock_irqrestore(&vp->window_lock, flags);		\
> +	return ret;							\
>  }									\
>  static void								\
>  window_write ## size(struct vortex_private *vp, u ## size value,	\
>  		     int window, int addr)				\
>  {									\
> +	unsigned long flags;						\
> +	spin_lock_irqsave(&vp->window_lock, flags);			\
>  	window_set(vp, window);						\
>  	iowrite ## size(value, vp->ioaddr + addr);			\
> +	spin_unlock_irqrestore(&vp->window_lock, flags);		\
>  }

This adds a lot of calls to spin_lock_irqsave/spin_unlock_irqrestore to many
places where this is not necessary at all. For example during device probe and
device open, window_read/window_write are called multiple times, each time
disabling the interrupts. I'd suggest having unlocked, locked and irqsave
versions of window_read/window_write and using them in appropriate places.

>  DEFINE_WINDOW_IO(8)
>  DEFINE_WINDOW_IO(16)
> @@ -1784,7 +1798,6 @@ vortex_timer(unsigned long data)
>  		pr_debug("dev->watchdog_timeo=%d\n", dev->watchdog_timeo);
>  	}
>  
> -	disable_irq_lockdep(dev->irq);
>  	media_status = window_read16(vp, 4, Wn4_Media);
>  	switch (dev->if_port) {
>  	case XCVR_10baseT:  case XCVR_100baseTx:  case XCVR_100baseFx:
> @@ -1805,10 +1818,7 @@ vortex_timer(unsigned long data)
>  	case XCVR_MII: case XCVR_NWAY:
>  		{
>  			ok = 1;
> -			/* Interrupts are already disabled */
> -			spin_lock(&vp->lock);
>  			vortex_check_media(dev, 0);
> -			spin_unlock(&vp->lock);
>  		}
>  		break;
>  	  default:					/* Other media types handled by Tx timeouts. */
> @@ -1827,6 +1837,8 @@ vortex_timer(unsigned long data)
>  	if (!ok) {
>  		unsigned int config;
>  
> +		spin_lock_irq(&vp->lock);

This can still happen every 5 seconds if the NIC has no link beat and
medialock is not set. So what about deferring this locked codepath to
a workqueue, or moving the whole vortex_timer to a delayed workqueue?
In this case we don't need to disable all the interrupts on the CPU; we
could still use disable_irq then.

The rest looks quite good to me.

Thanks,

Steffen

^ permalink raw reply

* Re: Question about xfrm by MARK feature
From: jamal @ 2010-06-24 12:04 UTC (permalink / raw)
  To: Gerd v. Egidy; +Cc: timo.teras, kaber, herbert, netdev
In-Reply-To: <201006231803.17261.lists@egidy.de>

Hi Gerd,

On Wed, 2010-06-23 at 18:03 +0200, Gerd v. Egidy wrote:
> Hi Jamal,
> 
> while looking through the 2.6.34 changelog I found the xfrm by MARK feature 
> you developed in february. I'm currently working on NAT for ipsec connections 
> and thought your feature might help me.
> 
> For example I have 2 different remote networks with the same ip network each 
> and both of them have a tunnel to the same local network. 

It seems "Same IP network" means that two remote locations will have
exactly the same IP address? This is hard of course - but nat may do it.
There's also the nat zones feature that Patrick introduced a while back
that may help you.

> I map their IPs to 
> something different so I can distinguish them in the local network. But after 
> the nat the xfrm code sees two tunnels with exactly the same values. So this 
> can't work.
> 

Can you look at the incoming encrypted packet headers and tell if they
are from different remotes? If not, are different remotes coming in via
a different network device? If yes, you can install a tc rule to mark
them as they come in before decryption and that mark should stay with
them even after they get decrypted.

> But if I understood your feature correctly, I can now mark the packets (e.g. 
> in iptables with ... -j MARK --set-mark 1) and have xfrm select the correct 
> ipsec tunnel via the mark. Correct?
> 
> But does your feature also set the mark on packets decrypted by xfrm? I need 
> some way to find out from which tunnel the packet came to correctly treat it. 
> 

Refer to above and also to policy routing.

> Do you know if any of the ipsec solutions for linux (e.g. strongswan, 
> openswan, racoon) already have support for this feature or are developing on 
> it?

AFAIK, only iproute2 can use marks. I believe the ike daemons can be
made to use reqid (as Herbert mentioned) but I am not sure that is
sufficient for what you want.

cheers,
jamal


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox