Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/7] au1000-eth: set MODULE_VERSION
From: Florian Fainelli @ 2010-04-07  8:08 UTC (permalink / raw)
  To: netdev; +Cc: David Miller


Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/au1000_eth.c b/drivers/net/au1000_eth.c
index 2963159..b96abc5 100644
--- a/drivers/net/au1000_eth.c
+++ b/drivers/net/au1000_eth.c
@@ -83,6 +83,7 @@ static int au1000_debug = 3;
 MODULE_AUTHOR(DRV_AUTHOR);
 MODULE_DESCRIPTION(DRV_DESC);
 MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
 
 /*
  * Theory of operation

^ permalink raw reply related

* [PATCH 3/7] au1000-eth: prefix all functions with au1000_
From: Florian Fainelli @ 2010-04-07  8:09 UTC (permalink / raw)
  To: netdev; +Cc: David Miller

In order to avoid namespace clashes, prefix all internal driver functions
with au1000_.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/au1000_eth.c b/drivers/net/au1000_eth.c
index b96abc5..7d40177 100644
--- a/drivers/net/au1000_eth.c
+++ b/drivers/net/au1000_eth.c
@@ -149,7 +149,7 @@ struct au1000_private *au_macs[NUM_ETH_INTERFACES];
  * specific irq-map
  */
 
-static void enable_mac(struct net_device *dev, int force_reset)
+static void au1000_enable_mac(struct net_device *dev, int force_reset)
 {
 	unsigned long flags;
 	struct au1000_private *aup = netdev_priv(dev);
@@ -237,7 +237,7 @@ static int au1000_mdiobus_read(struct mii_bus *bus, int phy_addr, int regnum)
 	 * _NOT_ hold (e.g. when PHY is accessed through other MAC's MII bus) */
 	struct net_device *const dev = bus->priv;
 
-	enable_mac(dev, 0); /* make sure the MAC associated with this
+	au1000_enable_mac(dev, 0); /* make sure the MAC associated with this
 			     * mii_bus is enabled */
 	return au1000_mdio_read(dev, phy_addr, regnum);
 }
@@ -247,7 +247,7 @@ static int au1000_mdiobus_write(struct mii_bus *bus, int phy_addr, int regnum,
 {
 	struct net_device *const dev = bus->priv;
 
-	enable_mac(dev, 0); /* make sure the MAC associated with this
+	au1000_enable_mac(dev, 0); /* make sure the MAC associated with this
 			     * mii_bus is enabled */
 	au1000_mdio_write(dev, phy_addr, regnum, value);
 	return 0;
@@ -257,12 +257,12 @@ static int au1000_mdiobus_reset(struct mii_bus *bus)
 {
 	struct net_device *const dev = bus->priv;
 
-	enable_mac(dev, 0); /* make sure the MAC associated with this
+	au1000_enable_mac(dev, 0); /* make sure the MAC associated with this
 			     * mii_bus is enabled */
 	return 0;
 }
 
-static void hard_stop(struct net_device *dev)
+static void au1000_hard_stop(struct net_device *dev)
 {
 	struct au1000_private *aup = netdev_priv(dev);
 
@@ -273,7 +273,7 @@ static void hard_stop(struct net_device *dev)
 	au_sync_delay(10);
 }
 
-static void enable_rx_tx(struct net_device *dev)
+static void au1000_enable_rx_tx(struct net_device *dev)
 {
 	struct au1000_private *aup = netdev_priv(dev);
 
@@ -320,7 +320,7 @@ au1000_adjust_link(struct net_device *dev)
 		// duplex mode changed
 
 		/* switching duplex mode requires to disable rx and tx! */
-		hard_stop(dev);
+		au1000_hard_stop(dev);
 
 		if (DUPLEX_FULL == phydev->duplex)
 			aup->mac->control = ((aup->mac->control
@@ -332,7 +332,7 @@ au1000_adjust_link(struct net_device *dev)
 					     | MAC_DISABLE_RX_OWN);
 		au_sync_delay(1);
 
-		enable_rx_tx(dev);
+		au1000_enable_rx_tx(dev);
 		aup->old_duplex = phydev->duplex;
 
 		status_change = 1;
@@ -363,7 +363,7 @@ au1000_adjust_link(struct net_device *dev)
 	}
 }
 
-static int mii_probe (struct net_device *dev)
+static int au1000_mii_probe (struct net_device *dev)
 {
 	struct au1000_private *const aup = netdev_priv(dev);
 	struct phy_device *phydev = NULL;
@@ -463,7 +463,7 @@ static int mii_probe (struct net_device *dev)
  * has the virtual and dma address of a buffer suitable for
  * both, receive and transmit operations.
  */
-static db_dest_t *GetFreeDB(struct au1000_private *aup)
+static db_dest_t *au1000_GetFreeDB(struct au1000_private *aup)
 {
 	db_dest_t *pDB;
 	pDB = aup->pDBfree;
@@ -474,7 +474,7 @@ static db_dest_t *GetFreeDB(struct au1000_private *aup)
 	return pDB;
 }
 
-void ReleaseDB(struct au1000_private *aup, db_dest_t *pDB)
+void au1000_ReleaseDB(struct au1000_private *aup, db_dest_t *pDB)
 {
 	db_dest_t *pDBfree = aup->pDBfree;
 	if (pDBfree)
@@ -482,12 +482,12 @@ void ReleaseDB(struct au1000_private *aup, db_dest_t *pDB)
 	aup->pDBfree = pDB;
 }
 
-static void reset_mac_unlocked(struct net_device *dev)
+static void au1000_reset_mac_unlocked(struct net_device *dev)
 {
 	struct au1000_private *const aup = netdev_priv(dev);
 	int i;
 
-	hard_stop(dev);
+	au1000_hard_stop(dev);
 
 	*aup->enable = MAC_EN_CLOCK_ENABLE;
 	au_sync_delay(2);
@@ -508,7 +508,7 @@ static void reset_mac_unlocked(struct net_device *dev)
 
 }
 
-static void reset_mac(struct net_device *dev)
+static void au1000_reset_mac(struct net_device *dev)
 {
 	struct au1000_private *const aup = netdev_priv(dev);
 	unsigned long flags;
@@ -519,7 +519,7 @@ static void reset_mac(struct net_device *dev)
 
 	spin_lock_irqsave(&aup->lock, flags);
 
-	reset_mac_unlocked (dev);
+	au1000_reset_mac_unlocked (dev);
 
 	spin_unlock_irqrestore(&aup->lock, flags);
 }
@@ -530,7 +530,7 @@ static void reset_mac(struct net_device *dev)
  * these are not descriptors sitting in memory.
  */
 static void
-setup_hw_rings(struct au1000_private *aup, u32 rx_base, u32 tx_base)
+au1000_setup_hw_rings(struct au1000_private *aup, u32 rx_base, u32 tx_base)
 {
 	int i;
 
@@ -611,7 +611,7 @@ static int au1000_init(struct net_device *dev)
 		printk("%s: au1000_init\n", dev->name);
 
 	/* bring the device out of reset */
-	enable_mac(dev, 1);
+	au1000_enable_mac(dev, 1);
 
 	spin_lock_irqsave(&aup->lock, flags);
 
@@ -650,7 +650,7 @@ static int au1000_init(struct net_device *dev)
 	return 0;
 }
 
-static inline void update_rx_stats(struct net_device *dev, u32 status)
+static inline void au1000_update_rx_stats(struct net_device *dev, u32 status)
 {
 	struct net_device_stats *ps = &dev->stats;
 
@@ -694,7 +694,7 @@ static int au1000_rx(struct net_device *dev)
 	while (buff_stat & RX_T_DONE)  {
 		status = prxd->status;
 		pDB = aup->rx_db_inuse[aup->rx_head];
-		update_rx_stats(dev, status);
+		au1000_update_rx_stats(dev, status);
 		if (!(status & RX_ERROR))  {
 
 			/* good frame */
@@ -748,7 +748,7 @@ static int au1000_rx(struct net_device *dev)
 	return 0;
 }
 
-static void update_tx_stats(struct net_device *dev, u32 status)
+static void au1000_update_tx_stats(struct net_device *dev, u32 status)
 {
 	struct au1000_private *aup = netdev_priv(dev);
 	struct net_device_stats *ps = &dev->stats;
@@ -784,7 +784,7 @@ static void au1000_tx_ack(struct net_device *dev)
 	ptxd = aup->tx_dma_ring[aup->tx_tail];
 
 	while (ptxd->buff_stat & TX_T_DONE) {
-		update_tx_stats(dev, ptxd->status);
+		au1000_update_tx_stats(dev, ptxd->status);
 		ptxd->buff_stat &= ~TX_T_DONE;
 		ptxd->len = 0;
 		au_sync();
@@ -861,7 +861,7 @@ static int au1000_close(struct net_device *dev)
 
 	spin_lock_irqsave(&aup->lock, flags);
 
-	reset_mac_unlocked (dev);
+	au1000_reset_mac_unlocked (dev);
 
 	/* stop the device */
 	netif_stop_queue(dev);
@@ -899,7 +899,7 @@ static netdev_tx_t au1000_tx(struct sk_buff *skb, struct net_device *dev)
 		return NETDEV_TX_BUSY;
 	}
 	else if (buff_stat & TX_T_DONE) {
-		update_tx_stats(dev, ptxd->status);
+		au1000_update_tx_stats(dev, ptxd->status);
 		ptxd->len = 0;
 	}
 
@@ -937,7 +937,7 @@ static netdev_tx_t au1000_tx(struct sk_buff *skb, struct net_device *dev)
 static void au1000_tx_timeout(struct net_device *dev)
 {
 	printk(KERN_ERR "%s: au1000_tx_timeout: dev=%p\n", dev->name, dev);
-	reset_mac(dev);
+	au1000_reset_mac(dev);
 	au1000_init(dev);
 	dev->trans_start = jiffies;
 	netif_wake_queue(dev);
@@ -1089,9 +1089,9 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 				/* Use the hard coded MAC addresses */
 		}
 
-		setup_hw_rings(aup, MAC0_RX_DMA_ADDR, MAC0_TX_DMA_ADDR);
+		au1000_setup_hw_rings(aup, MAC0_RX_DMA_ADDR, MAC0_TX_DMA_ADDR);
 	} else if (pdev->id == 1)
-		setup_hw_rings(aup, MAC1_RX_DMA_ADDR, MAC1_TX_DMA_ADDR);
+		au1000_setup_hw_rings(aup, MAC1_RX_DMA_ADDR, MAC1_TX_DMA_ADDR);
 
 	/*
 	 * Assign to the Ethernet ports two consecutive MAC addresses
@@ -1153,7 +1153,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 		goto err_mdiobus_reg;
 	}
 
-	if (mii_probe(dev) != 0)
+	if (au1000_mii_probe(dev) != 0)
 		goto err_out;
 
 	pDBfree = NULL;
@@ -1169,7 +1169,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 	aup->pDBfree = pDBfree;
 
 	for (i = 0; i < NUM_RX_DMA; i++) {
-		pDB = GetFreeDB(aup);
+		pDB = au1000_GetFreeDB(aup);
 		if (!pDB) {
 			goto err_out;
 		}
@@ -1177,7 +1177,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 		aup->rx_db_inuse[i] = pDB;
 	}
 	for (i = 0; i < NUM_TX_DMA; i++) {
-		pDB = GetFreeDB(aup);
+		pDB = au1000_GetFreeDB(aup);
 		if (!pDB) {
 			goto err_out;
 		}
@@ -1196,7 +1196,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 	 * The boot code uses the ethernet controller, so reset it to start
 	 * fresh.  au1000_init() expects that the device is in reset state.
 	 */
-	reset_mac(dev);
+	au1000_reset_mac(dev);
 
 	err = register_netdev(dev);
 	if (err) {
@@ -1218,15 +1218,15 @@ err_out:
 
 	/* here we should have a valid dev plus aup-> register addresses
 	 * so we can reset the mac properly.*/
-	reset_mac(dev);
+	au1000_reset_mac(dev);
 
 	for (i = 0; i < NUM_RX_DMA; i++) {
 		if (aup->rx_db_inuse[i])
-			ReleaseDB(aup, aup->rx_db_inuse[i]);
+			au1000_ReleaseDB(aup, aup->rx_db_inuse[i]);
 	}
 	for (i = 0; i < NUM_TX_DMA; i++) {
 		if (aup->tx_db_inuse[i])
-			ReleaseDB(aup, aup->tx_db_inuse[i]);
+			au1000_ReleaseDB(aup, aup->tx_db_inuse[i]);
 	}
 err_mdiobus_reg:
 	mdiobus_free(aup->mii_bus);
@@ -1262,11 +1262,11 @@ static int __devexit au1000_remove(struct platform_device *pdev)
 
 	for (i = 0; i < NUM_RX_DMA; i++)
 		if (aup->rx_db_inuse[i])
-			ReleaseDB(aup, aup->rx_db_inuse[i]);
+			au1000_ReleaseDB(aup, aup->rx_db_inuse[i]);
 
 	for (i = 0; i < NUM_TX_DMA; i++)
 		if (aup->tx_db_inuse[i])
-			ReleaseDB(aup, aup->tx_db_inuse[i]);
+			au1000_ReleaseDB(aup, aup->tx_db_inuse[i]);
 
 	dma_free_noncoherent(NULL, MAX_BUF_SIZE *
 			(NUM_TX_BUFFS + NUM_RX_BUFFS),

^ permalink raw reply related

* [PATCH 4/7] au1000-eth: fix checkpatch errors.
From: Florian Fainelli @ 2010-04-07  8:09 UTC (permalink / raw)
  To: netdev; +Cc: David Miller

This patch fixes multiple errors reported by checkpatch:
- else not on the ending brace of an if { }
- multiple occurences of for( instead of for (
- c99 comments
- assignment and tests on the same line
- test and statements on the same line
- macro with complex value not between parenthesis
- static variable with initialization value

No functionnal change.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/au1000_eth.c b/drivers/net/au1000_eth.c
index 7d40177..49f56b4 100644
--- a/drivers/net/au1000_eth.c
+++ b/drivers/net/au1000_eth.c
@@ -298,9 +298,9 @@ au1000_adjust_link(struct net_device *dev)
 	spin_lock_irqsave(&aup->lock, flags);
 
 	if (phydev->link && (aup->old_speed != phydev->speed)) {
-		// speed changed
+		/* speed changed */
 
-		switch(phydev->speed) {
+		switch (phydev->speed) {
 		case SPEED_10:
 		case SPEED_100:
 			break;
@@ -317,7 +317,7 @@ au1000_adjust_link(struct net_device *dev)
 	}
 
 	if (phydev->link && (aup->old_duplex != phydev->duplex)) {
-		// duplex mode changed
+		/* duplex mode changed */
 
 		/* switching duplex mode requires to disable rx and tx! */
 		au1000_hard_stop(dev);
@@ -338,8 +338,8 @@ au1000_adjust_link(struct net_device *dev)
 		status_change = 1;
 	}
 
-	if(phydev->link != aup->old_link) {
-		// link state changed
+	if (phydev->link != aup->old_link) {
+		/* link state changed */
 
 		if (!phydev->link) {
 			/* link went down */
@@ -668,8 +668,7 @@ static inline void au1000_update_rx_stats(struct net_device *dev, u32 status)
 			ps->rx_crc_errors++;
 		if (status & RX_COLL)
 			ps->collisions++;
-	}
-	else
+	} else
 		ps->rx_bytes += status & RX_FRAME_LEN_MASK;
 
 }
@@ -714,8 +713,7 @@ static int au1000_rx(struct net_device *dev)
 			skb_put(skb, frmlen);
 			skb->protocol = eth_type_trans(skb, dev);
 			netif_rx(skb);	/* pass the packet to upper layers */
-		}
-		else {
+		} else {
 			if (au1000_debug > 4) {
 				if (status & RX_MISSED_FRAME)
 					printk("rx miss\n");
@@ -761,8 +759,7 @@ static void au1000_update_tx_stats(struct net_device *dev, u32 status)
 				ps->tx_errors++;
 				ps->tx_aborted_errors++;
 			}
-		}
-		else {
+		} else {
 			ps->tx_errors++;
 			ps->tx_aborted_errors++;
 			if (status & (TX_NO_CARRIER | TX_LOSS_CARRIER))
@@ -821,14 +818,16 @@ static int au1000_open(struct net_device *dev)
 	if (au1000_debug > 4)
 		printk("%s: open: dev=%p\n", dev->name, dev);
 
-	if ((retval = request_irq(dev->irq, au1000_interrupt, 0,
-					dev->name, dev))) {
+	retval = request_irq(dev->irq, au1000_interrupt, 0,
+					dev->name, dev);
+	if (retval) {
 		printk(KERN_ERR "%s: unable to get IRQ %d\n",
 				dev->name, dev->irq);
 		return retval;
 	}
 
-	if ((retval = au1000_init(dev))) {
+	retval = au1000_init(dev);
+	if (retval) {
 		printk(KERN_ERR "%s: error in au1000_init\n", dev->name);
 		free_irq(dev->irq, dev);
 		return retval;
@@ -897,8 +896,7 @@ static netdev_tx_t au1000_tx(struct sk_buff *skb, struct net_device *dev)
 		netif_stop_queue(dev);
 		aup->tx_full = 1;
 		return NETDEV_TX_BUSY;
-	}
-	else if (buff_stat & TX_T_DONE) {
+	} else if (buff_stat & TX_T_DONE) {
 		au1000_update_tx_stats(dev, ptxd->status);
 		ptxd->len = 0;
 	}
@@ -911,12 +909,11 @@ static netdev_tx_t au1000_tx(struct sk_buff *skb, struct net_device *dev)
 	pDB = aup->tx_db_inuse[aup->tx_head];
 	skb_copy_from_linear_data(skb, (void *)pDB->vaddr, skb->len);
 	if (skb->len < ETH_ZLEN) {
-		for (i=skb->len; i<ETH_ZLEN; i++) {
+		for (i = skb->len; i < ETH_ZLEN; i++) {
 			((char *)pDB->vaddr)[i] = 0;
 		}
 		ptxd->len = ETH_ZLEN;
-	}
-	else
+	} else
 		ptxd->len = skb->len;
 
 	ps->tx_packets++;
@@ -976,9 +973,11 @@ static int au1000_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	struct au1000_private *aup = netdev_priv(dev);
 
-	if (!netif_running(dev)) return -EINVAL;
+	if (!netif_running(dev))
+		return -EINVAL;
 
-	if (!aup->phy_dev) return -EINVAL; // PHY not controllable
+	if (!aup->phy_dev)
+		return -EINVAL; /* PHY not controllable */
 
 	return phy_mii_ioctl(aup->phy_dev, if_mii(rq), cmd);
 }
@@ -997,7 +996,7 @@ static const struct net_device_ops au1000_netdev_ops = {
 
 static int __devinit au1000_probe(struct platform_device *pdev)
 {
-	static unsigned version_printed = 0;
+	static unsigned version_printed;
 	struct au1000_private *aup = NULL;
 	struct au1000_eth_platform_data *pd;
 	struct net_device *dev = NULL;
@@ -1140,7 +1139,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 	if (aup->mii_bus->irq == NULL)
 		goto err_out;
 
-	for(i = 0; i < PHY_MAX_ADDR; ++i)
+	for (i = 0; i < PHY_MAX_ADDR; ++i)
 		aup->mii_bus->irq[i] = PHY_POLL;
 	/* if known, set corresponding PHY IRQs */
 	if (aup->phy_static_config)
diff --git a/drivers/net/au1000_eth.h b/drivers/net/au1000_eth.h
index f9d29a2..344c600 100644
--- a/drivers/net/au1000_eth.h
+++ b/drivers/net/au1000_eth.h
@@ -35,7 +35,7 @@
 #define NUM_TX_BUFFS 4
 #define MAX_BUF_SIZE 2048
 
-#define ETH_TX_TIMEOUT HZ/4
+#define ETH_TX_TIMEOUT (HZ/4)
 #define MAC_MIN_PKT_SIZE 64
 
 #define MULTICAST_FILTER_LIMIT 64

^ permalink raw reply related

* [PATCH 5/7] au1000-eth: implement set/get_msglevel
From: Florian Fainelli @ 2010-04-07  8:09 UTC (permalink / raw)
  To: netdev; +Cc: David Miller

{set,get}_msglevel is required to use netif_{err,dbg} macros.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/au1000_eth.c b/drivers/net/au1000_eth.c
index 49f56b4..0c59d4c 100644
--- a/drivers/net/au1000_eth.c
+++ b/drivers/net/au1000_eth.c
@@ -75,6 +75,10 @@ static int au1000_debug = 5;
 static int au1000_debug = 3;
 #endif
 
+#define AU1000_DEF_MSG_ENABLE	(NETIF_MSG_DRV	| \
+				NETIF_MSG_PROBE	| \
+				NETIF_MSG_LINK)
+
 #define DRV_NAME	"au1000_eth"
 #define DRV_VERSION	"1.6"
 #define DRV_AUTHOR	"Pete Popov <ppopov@embeddedalley.com>"
@@ -583,11 +587,25 @@ au1000_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
 	info->regdump_len = 0;
 }
 
+static void au1000_set_msglevel(struct net_device *dev, u32 value)
+{
+	struct au1000_private *aup = netdev_priv(dev);
+	aup->msg_enable = value;
+}
+
+static u32 au1000_get_msglevel(struct net_device *dev)
+{
+	struct au1000_private *aup = netdev_priv(dev);
+	return aup->msg_enable;
+}
+
 static const struct ethtool_ops au1000_ethtool_ops = {
 	.get_settings = au1000_get_settings,
 	.set_settings = au1000_set_settings,
 	.get_drvinfo = au1000_get_drvinfo,
 	.get_link = ethtool_op_get_link,
+	.get_msglevel = au1000_get_msglevel,
+	.set_msglevel = au1000_set_msglevel,
 };
 
 
@@ -1050,6 +1068,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 	aup = netdev_priv(dev);
 
 	spin_lock_init(&aup->lock);
+	aup->msg_enable = (au1000_debug < 4 ? AU1000_DEF_MSG_ENABLE : au1000_debug);
 
 	/* Allocate the data buffers */
 	/* Snooping works fine with eth on all au1xxx */
diff --git a/drivers/net/au1000_eth.h b/drivers/net/au1000_eth.h
index 344c600..d06ec00 100644
--- a/drivers/net/au1000_eth.h
+++ b/drivers/net/au1000_eth.h
@@ -125,4 +125,6 @@ struct au1000_private {
 	dma_addr_t dma_addr;      /* dma address of rx/tx buffers       */
 
 	spinlock_t lock;       /* Serialise access to device */
+
+	u32 msg_enable;
 };

^ permalink raw reply related

* [PATCH 6/7] au1000-eth: Use (dev|netdev|netif)_<level> macro helpers
From: Florian Fainelli @ 2010-04-07  8:09 UTC (permalink / raw)
  To: netdev; +Cc: David Miller


Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/au1000_eth.c b/drivers/net/au1000_eth.c
index 0c59d4c..5afca6c 100644
--- a/drivers/net/au1000_eth.c
+++ b/drivers/net/au1000_eth.c
@@ -187,8 +187,7 @@ static int au1000_mdio_read(struct net_device *dev, int phy_addr, int reg)
 	while (*mii_control_reg & MAC_MII_BUSY) {
 		mdelay(1);
 		if (--timedout == 0) {
-			printk(KERN_ERR "%s: read_MII busy timeout!!\n",
-					dev->name);
+			netdev_err(dev, "read_MII busy timeout!!\n");
 			return -1;
 		}
 	}
@@ -202,8 +201,7 @@ static int au1000_mdio_read(struct net_device *dev, int phy_addr, int reg)
 	while (*mii_control_reg & MAC_MII_BUSY) {
 		mdelay(1);
 		if (--timedout == 0) {
-			printk(KERN_ERR "%s: mdio_read busy timeout!!\n",
-					dev->name);
+			netdev_err(dev, "mdio_read busy timeout!!\n");
 			return -1;
 		}
 	}
@@ -222,8 +220,7 @@ static void au1000_mdio_write(struct net_device *dev, int phy_addr,
 	while (*mii_control_reg & MAC_MII_BUSY) {
 		mdelay(1);
 		if (--timedout == 0) {
-			printk(KERN_ERR "%s: mdio_write busy timeout!!\n",
-					dev->name);
+			netdev_err(dev, "mdio_write busy timeout!!\n");
 			return;
 		}
 	}
@@ -270,8 +267,7 @@ static void au1000_hard_stop(struct net_device *dev)
 {
 	struct au1000_private *aup = netdev_priv(dev);
 
-	if (au1000_debug > 4)
-		printk(KERN_INFO "%s: hard stop\n", dev->name);
+	netif_dbg(aup, drv, dev, "hard stop\n");
 
 	aup->mac->control &= ~(MAC_RX_ENABLE | MAC_TX_ENABLE);
 	au_sync_delay(10);
@@ -281,8 +277,7 @@ static void au1000_enable_rx_tx(struct net_device *dev)
 {
 	struct au1000_private *aup = netdev_priv(dev);
 
-	if (au1000_debug > 4)
-		printk(KERN_INFO "%s: enable_rx_tx\n", dev->name);
+	netif_dbg(aup, hw, dev, "enable_rx_tx\n");
 
 	aup->mac->control |= (MAC_RX_ENABLE | MAC_TX_ENABLE);
 	au_sync_delay(10);
@@ -309,9 +304,8 @@ au1000_adjust_link(struct net_device *dev)
 		case SPEED_100:
 			break;
 		default:
-			printk(KERN_WARNING
-			       "%s: Speed (%d) is not 10/100 ???\n",
-			       dev->name, phydev->speed);
+			netdev_warn(dev, "Speed (%d) is not 10/100 ???\n",
+							phydev->speed);
 			break;
 		}
 
@@ -359,11 +353,11 @@ au1000_adjust_link(struct net_device *dev)
 
 	if (status_change) {
 		if (phydev->link)
-			printk(KERN_INFO "%s: link up (%d/%s)\n",
-			       dev->name, phydev->speed,
+			netdev_info(dev, "link up (%d/%s)\n",
+			       phydev->speed,
 			       DUPLEX_FULL == phydev->duplex ? "Full" : "Half");
 		else
-			printk(KERN_INFO "%s: link down\n", dev->name);
+			netdev_info(dev, "link down\n");
 	}
 }
 
@@ -378,8 +372,7 @@ static int au1000_mii_probe (struct net_device *dev)
 		if (aup->phy_addr)
 			phydev = aup->mii_bus->phy_map[aup->phy_addr];
 		else
-			printk (KERN_INFO DRV_NAME ":%s: using PHY-less setup\n",
-				dev->name);
+			netdev_info(dev, "using PHY-less setup\n");
 		return 0;
 	} else {
 		int phy_addr;
@@ -396,7 +389,7 @@ static int au1000_mii_probe (struct net_device *dev)
 			/* try harder to find a PHY */
 			if (!phydev && (aup->mac_id == 1)) {
 				/* no PHY found, maybe we have a dual PHY? */
-				printk (KERN_INFO DRV_NAME ": no PHY found on MAC1, "
+				dev_info(&dev->dev, ": no PHY found on MAC1, "
 					"let's see if it's attached to MAC0...\n");
 
 				/* find the first (lowest address) non-attached PHY on
@@ -422,7 +415,7 @@ static int au1000_mii_probe (struct net_device *dev)
 	}
 
 	if (!phydev) {
-		printk (KERN_ERR DRV_NAME ":%s: no PHY found\n", dev->name);
+		netdev_err(dev, "no PHY found\n");
 		return -1;
 	}
 
@@ -433,7 +426,7 @@ static int au1000_mii_probe (struct net_device *dev)
 			0, PHY_INTERFACE_MODE_MII);
 
 	if (IS_ERR(phydev)) {
-		printk(KERN_ERR "%s: Could not attach to PHY\n", dev->name);
+		netdev_err(dev, "Could not attach to PHY\n");
 		return PTR_ERR(phydev);
 	}
 
@@ -454,8 +447,8 @@ static int au1000_mii_probe (struct net_device *dev)
 	aup->old_duplex = -1;
 	aup->phy_dev = phydev;
 
-	printk(KERN_INFO "%s: attached PHY driver [%s] "
-	       "(mii_bus:phy_addr=%s, irq=%d)\n", dev->name,
+	netdev_info(dev, "attached PHY driver [%s] "
+	       "(mii_bus:phy_addr=%s, irq=%d)\n",
 	       phydev->drv->name, dev_name(&phydev->dev), phydev->irq);
 
 	return 0;
@@ -517,9 +510,8 @@ static void au1000_reset_mac(struct net_device *dev)
 	struct au1000_private *const aup = netdev_priv(dev);
 	unsigned long flags;
 
-	if (au1000_debug > 4)
-		printk(KERN_INFO "%s: reset mac, aup %x\n",
-		       dev->name, (unsigned)aup);
+	netif_dbg(aup, hw, dev, "reset mac, aup %x\n",
+					(unsigned)aup);
 
 	spin_lock_irqsave(&aup->lock, flags);
 
@@ -625,8 +617,7 @@ static int au1000_init(struct net_device *dev)
 	int i;
 	u32 control;
 
-	if (au1000_debug > 4)
-		printk("%s: au1000_init\n", dev->name);
+	netif_dbg(aup, hw, dev, "au1000_init\n");
 
 	/* bring the device out of reset */
 	au1000_enable_mac(dev, 1);
@@ -703,8 +694,7 @@ static int au1000_rx(struct net_device *dev)
 	db_dest_t *pDB;
 	u32	frmlen;
 
-	if (au1000_debug > 5)
-		printk("%s: au1000_rx head %d\n", dev->name, aup->rx_head);
+	netif_dbg(aup, rx_status, dev, "au1000_rx head %d\n", aup->rx_head);
 
 	prxd = aup->rx_dma_ring[aup->rx_head];
 	buff_stat = prxd->buff_stat;
@@ -719,9 +709,7 @@ static int au1000_rx(struct net_device *dev)
 			frmlen -= 4; /* Remove FCS */
 			skb = dev_alloc_skb(frmlen + 2);
 			if (skb == NULL) {
-				printk(KERN_ERR
-				       "%s: Memory squeeze, dropping packet.\n",
-				       dev->name);
+				netdev_err(dev, "Memory squeeze, dropping packet.\n");
 				dev->stats.rx_dropped++;
 				continue;
 			}
@@ -833,20 +821,18 @@ static int au1000_open(struct net_device *dev)
 	int retval;
 	struct au1000_private *aup = netdev_priv(dev);
 
-	if (au1000_debug > 4)
-		printk("%s: open: dev=%p\n", dev->name, dev);
+	netif_dbg(aup, drv, dev, "open: dev=%p\n", dev);
 
 	retval = request_irq(dev->irq, au1000_interrupt, 0,
 					dev->name, dev);
 	if (retval) {
-		printk(KERN_ERR "%s: unable to get IRQ %d\n",
-				dev->name, dev->irq);
+		netdev_err(dev, "unable to get IRQ %d\n", dev->irq);
 		return retval;
 	}
 
 	retval = au1000_init(dev);
 	if (retval) {
-		printk(KERN_ERR "%s: error in au1000_init\n", dev->name);
+		netdev_err(dev, "error in au1000_init\n");
 		free_irq(dev->irq, dev);
 		return retval;
 	}
@@ -859,8 +845,7 @@ static int au1000_open(struct net_device *dev)
 
 	netif_start_queue(dev);
 
-	if (au1000_debug > 4)
-		printk("%s: open: Initialization done.\n", dev->name);
+	netif_dbg(aup, drv, dev, "open: Initialization done.\n");
 
 	return 0;
 }
@@ -870,8 +855,7 @@ static int au1000_close(struct net_device *dev)
 	unsigned long flags;
 	struct au1000_private *const aup = netdev_priv(dev);
 
-	if (au1000_debug > 4)
-		printk("%s: close: dev=%p\n", dev->name, dev);
+	netif_dbg(aup, drv, dev, "close: dev=%p\n", dev);
 
 	if (aup->phy_dev)
 		phy_stop(aup->phy_dev);
@@ -902,9 +886,8 @@ static netdev_tx_t au1000_tx(struct sk_buff *skb, struct net_device *dev)
 	db_dest_t *pDB;
 	int i;
 
-	if (au1000_debug > 5)
-		printk("%s: tx: aup %x len=%d, data=%p, head %d\n",
-				dev->name, (unsigned)aup, skb->len,
+	netif_dbg(aup, tx_queued, dev, "tx: aup %x len=%d, data=%p, head %d\n",
+				(unsigned)aup, skb->len,
 				skb->data, aup->tx_head);
 
 	ptxd = aup->tx_dma_ring[aup->tx_head];
@@ -951,7 +934,7 @@ static netdev_tx_t au1000_tx(struct sk_buff *skb, struct net_device *dev)
  */
 static void au1000_tx_timeout(struct net_device *dev)
 {
-	printk(KERN_ERR "%s: au1000_tx_timeout: dev=%p\n", dev->name, dev);
+	netdev_err(dev, "au1000_tx_timeout: dev=%p\n", dev);
 	au1000_reset_mac(dev);
 	au1000_init(dev);
 	dev->trans_start = jiffies;
@@ -962,8 +945,7 @@ static void au1000_multicast_list(struct net_device *dev)
 {
 	struct au1000_private *aup = netdev_priv(dev);
 
-	if (au1000_debug > 4)
-		printk("%s: au1000_multicast_list: flags=%x\n", dev->name, dev->flags);
+	netif_dbg(aup, drv, dev, "au1000_multicast_list: flags=%x\n", dev->flags);
 
 	if (dev->flags & IFF_PROMISC) {			/* Set promiscuous. */
 		aup->mac->control |= MAC_PROMISCUOUS;
@@ -971,7 +953,7 @@ static void au1000_multicast_list(struct net_device *dev)
 			   netdev_mc_count(dev) > MULTICAST_FILTER_LIMIT) {
 		aup->mac->control |= MAC_PASS_ALL_MULTI;
 		aup->mac->control &= ~MAC_PROMISCUOUS;
-		printk(KERN_INFO "%s: Pass all multicast\n", dev->name);
+		netdev_info(dev, "Pass all multicast\n");
 	} else {
 		struct netdev_hw_addr *ha;
 		u32 mc_filter[2];	/* Multicast hash filter */
@@ -1025,40 +1007,40 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 
 	base = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!base) {
-		printk(KERN_ERR DRV_NAME ": failed to retrieve base register\n");
+		dev_err(&pdev->dev, "failed to retrieve base register\n");
 		err = -ENODEV;
 		goto out;
 	}
 
 	macen = platform_get_resource(pdev, IORESOURCE_MEM, 1);
 	if (!macen) {
-		printk(KERN_ERR DRV_NAME ": failed to retrieve MAC Enable register\n");
+		dev_err(&pdev->dev, "failed to retrieve MAC Enable register\n");
 		err = -ENODEV;
 		goto out;
 	}
 
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
-		printk(KERN_ERR DRV_NAME ": failed to retrieve IRQ\n");
+		dev_err(&pdev->dev, "failed to retrieve IRQ\n");
 		err = -ENODEV;
 		goto out;
 	}
 
 	if (!request_mem_region(base->start, resource_size(base), pdev->name)) {
-		printk(KERN_ERR DRV_NAME ": failed to request memory region for base registers\n");
+		dev_err(&pdev->dev, "failed to request memory region for base registers\n");
 		err = -ENXIO;
 		goto out;
 	}
 
 	if (!request_mem_region(macen->start, resource_size(macen), pdev->name)) {
-		printk(KERN_ERR DRV_NAME ": failed to request memory region for MAC enable register\n");
+		dev_err(&pdev->dev, "failed to request memory region for MAC enable register\n");
 		err = -ENXIO;
 		goto err_request;
 	}
 
 	dev = alloc_etherdev(sizeof(struct au1000_private));
 	if (!dev) {
-		printk(KERN_ERR "%s: alloc_etherdev failed\n", DRV_NAME);
+		dev_err(&pdev->dev, "alloc_etherdev failed\n");
 		err = -ENOMEM;
 		goto err_alloc;
 	}
@@ -1076,7 +1058,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 						(NUM_TX_BUFFS + NUM_RX_BUFFS),
 						&aup->dma_addr,	0);
 	if (!aup->vaddr) {
-		printk(KERN_ERR DRV_NAME ": failed to allocate data buffers\n");
+		dev_err(&pdev->dev, "failed to allocate data buffers\n");
 		err = -ENOMEM;
 		goto err_vaddr;
 	}
@@ -1084,7 +1066,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 	/* aup->mac is the base address of the MAC's registers */
 	aup->mac = (volatile mac_reg_t *)ioremap_nocache(base->start, resource_size(base));
 	if (!aup->mac) {
-		printk(KERN_ERR DRV_NAME ": failed to ioremap MAC registers\n");
+		dev_err(&pdev->dev, "failed to ioremap MAC registers\n");
 		err = -ENXIO;
 		goto err_remap1;
 	}
@@ -1092,7 +1074,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
         /* Setup some variables for quick register address access */
 	aup->enable = (volatile u32 *)ioremap_nocache(macen->start, resource_size(macen));
 	if (!aup->enable) {
-		printk(KERN_ERR DRV_NAME ": failed to ioremap MAC enable register\n");
+		dev_err(&pdev->dev, "failed to ioremap MAC enable register\n");
 		err = -ENXIO;
 		goto err_remap2;
 	}
@@ -1102,8 +1084,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 		if (prom_get_ethernet_addr(ethaddr) == 0)
 			memcpy(au1000_mac_addr, ethaddr, sizeof(au1000_mac_addr));
 		else {
-			printk(KERN_INFO "%s: No MAC address found\n",
-					 dev->name);
+			netdev_info(dev, "No MAC address found\n");
 				/* Use the hard coded MAC addresses */
 		}
 
@@ -1123,7 +1104,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 
 	pd = pdev->dev.platform_data;
 	if (!pd) {
-		printk(KERN_INFO DRV_NAME ": no platform_data passed, PHY search on MAC0\n");
+		dev_info(&pdev->dev, "no platform_data passed, PHY search on MAC0\n");
 		aup->phy1_search_mac0 = 1;
 	} else {
 		aup->phy_static_config = pd->phy_static_config;
@@ -1135,7 +1116,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 	}
 
 	if (aup->phy_busid && aup->phy_busid > 0) {
-		printk(KERN_ERR DRV_NAME ": MAC0-associated PHY attached 2nd MACs MII"
+		dev_err(&pdev->dev, "MAC0-associated PHY attached 2nd MACs MII"
 				"bus not supported yet\n");
 		err = -ENODEV;
 		goto err_mdiobus_alloc;
@@ -1143,7 +1124,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 
 	aup->mii_bus = mdiobus_alloc();
 	if (aup->mii_bus == NULL) {
-		printk(KERN_ERR DRV_NAME ": failed to allocate mdiobus structure\n");
+		dev_err(&pdev->dev, "failed to allocate mdiobus structure\n");
 		err = -ENOMEM;
 		goto err_mdiobus_alloc;
 	}
@@ -1167,7 +1148,7 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 
 	err = mdiobus_register(aup->mii_bus);
 	if (err) {
-		printk(KERN_ERR DRV_NAME " failed to register MDIO bus\n");
+		dev_err(&pdev->dev, "failed to register MDIO bus\n");
 		goto err_mdiobus_reg;
 	}
 
@@ -1218,13 +1199,12 @@ static int __devinit au1000_probe(struct platform_device *pdev)
 
 	err = register_netdev(dev);
 	if (err) {
-		printk(KERN_ERR DRV_NAME "%s: Cannot register net device, aborting.\n",
-					dev->name);
+		netdev_err(dev, "Cannot register net device, aborting.\n");
 		goto err_out;
 	}
 
-	printk("%s: Au1xx0 Ethernet found at 0x%lx, irq %d\n",
-			dev->name, (unsigned long)base->start, irq);
+	netdev_info(dev, "Au1xx0 Ethernet found at 0x%lx, irq %d\n",
+			(unsigned long)base->start, irq);
 	if (version_printed++ == 0)
 		printk("%s version %s %s\n", DRV_NAME, DRV_VERSION, DRV_AUTHOR);
 

^ permalink raw reply related

* [PATCH 7/7] au1000-eth: bump to 1.7
From: Florian Fainelli @ 2010-04-07  8:09 UTC (permalink / raw)
  To: netdev; +Cc: David Miller


Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/au1000_eth.c b/drivers/net/au1000_eth.c
index 5afca6c..7abb2c8 100644
--- a/drivers/net/au1000_eth.c
+++ b/drivers/net/au1000_eth.c
@@ -80,7 +80,7 @@ static int au1000_debug = 3;
 				NETIF_MSG_LINK)
 
 #define DRV_NAME	"au1000_eth"
-#define DRV_VERSION	"1.6"
+#define DRV_VERSION	"1.7"
 #define DRV_AUTHOR	"Pete Popov <ppopov@embeddedalley.com>"
 #define DRV_DESC	"Au1xxx on-chip Ethernet driver"
 

^ permalink raw reply related

* Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.
From: Michael S. Tsirkin @ 2010-04-07  8:18 UTC (permalink / raw)
  To: Xin, Xiaohui
  Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mingo@elte.hu, jdike@addtoit.com
In-Reply-To: <97F6D3BD476C464182C1B7BABF0B0AF5C17B5EB2@shzsmsx502.ccr.corp.intel.com>

On Wed, Apr 07, 2010 at 09:36:36AM +0800, Xin, Xiaohui wrote:
> Michael,
> > > >>>> For the write logging, do you have a function in hand that we can
> > > >>>> recompute the log? If that, I think I can use it to recompute the
> > > >>>>log info when the logging is suddenly enabled.
> > > >>>> For the outstanding requests, do you mean all the user buffers have
> > > >>>>submitted before the logging ioctl changed? That may be a lot, and
> > > >> >>some of them are still in NIC ring descriptors. Waiting them to be
> > > >>>>finished may be need some time. I think when logging ioctl changed,
> > > >> >>then the logging is changed just after that is also reasonable.
> 
> > > >>>The key point is that after loggin ioctl returns, any
> > > >>>subsequent change to memory must be logged. It does not
> > > >>>matter when was the request submitted, otherwise we will
> > > >>>get memory corruption on migration.
> 
> > > >>The change to memory happens when vhost_add_used_and_signal(), right?
> > > >>So after ioctl returns, just recompute the log info to the events in the async queue,
> > > >>is ok. Since the ioctl and write log operations are all protected by vq->mutex.
> 
> > >>> Thanks
> > >> >Xiaohui
> 
> > >>Yes, I think this will work.
> 
> >> Thanks, so do you have the function to recompute the log info in your hand that I can
> >>use? I have weakly remembered that you have noticed it before some time.
> 
> >Doesn't just rerunning vhost_get_vq_desc work?
> 
> Am I missing something here?
> The vhost_get_vq_desc() looks in vq, and finds the first available buffers, and converts it
> to an iovec. I think the first available buffer is not the buffers in the async queue, so I
> think rerunning vhost_get_vq_desc() cannot work.
> 
> Thanks
> Xiaohui

Right, but we can move the head back, so we'll find the same buffers
again, or add a variant of vhost_get_vq_desc that will process
descriptors already consumed.

> > > > Thanks
> > > > Xiaohui
> > > >
> > > >  drivers/vhost/net.c   |  189 +++++++++++++++++++++++++++++++++++++++++++++++--
> > > >  drivers/vhost/vhost.h |   10 +++
> > > >  2 files changed, 192 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > > index 22d5fef..2aafd90 100644
> > > > --- a/drivers/vhost/net.c
> > > > +++ b/drivers/vhost/net.c
> > > > @@ -17,11 +17,13 @@
> > > >  #include <linux/workqueue.h>
> > > >  #include <linux/rcupdate.h>
> > > >  #include <linux/file.h>
> > > > +#include <linux/aio.h>
> > > >
> > > >  #include <linux/net.h>
> > > >  #include <linux/if_packet.h>
> > > >  #include <linux/if_arp.h>
> > > >  #include <linux/if_tun.h>
> > > > +#include <linux/mpassthru.h>
> > > >
> > > >  #include <net/sock.h>
> > > >
> > > > @@ -47,6 +49,7 @@ struct vhost_net {
> > > >   struct vhost_dev dev;
> > > >   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> > > >   struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > > > + struct kmem_cache       *cache;
> > > >   /* Tells us whether we are polling a socket for TX.
> > > >    * We only do this when socket buffer fills up.
> > > >    * Protected by tx vq lock. */
> > > > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
> > > >   net->tx_poll_state = VHOST_NET_POLL_STARTED;
> > > >  }
> > > >
> > > > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > > > +{
> > > > + struct kiocb *iocb = NULL;
> > > > + unsigned long flags;
> > > > +
> > > > + spin_lock_irqsave(&vq->notify_lock, flags);
> > > > + if (!list_empty(&vq->notifier)) {
> > > > +         iocb = list_first_entry(&vq->notifier,
> > > > +                         struct kiocb, ki_list);
> > > > +         list_del(&iocb->ki_list);
> > > > + }
> > > > + spin_unlock_irqrestore(&vq->notify_lock, flags);
> > > > + return iocb;
> > > > +}
> > > > +
> > > > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > > > +                                 struct vhost_virtqueue *vq)
> > > > +{
> > > > + struct kiocb *iocb = NULL;
> > > > + struct vhost_log *vq_log = NULL;
> > > > + int rx_total_len = 0;
> > > > + int log, size;
> > > > +
> > > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > > +         return;
> > > > +
> > > > + if (vq->receiver)
> > > > +         vq->receiver(vq);
> > > > +
> > > > + vq_log = unlikely(vhost_has_feature(
> > > > +                         &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > > +         vhost_add_used_and_signal(&net->dev, vq,
> > > > +                         iocb->ki_pos, iocb->ki_nbytes);
> > > > +         log = (int)iocb->ki_user_data;
> > > > +         size = iocb->ki_nbytes;
> > > > +         rx_total_len += iocb->ki_nbytes;
> > > > +
> > > > +         if (iocb->ki_dtor)
> > > > +                 iocb->ki_dtor(iocb);
> > > > +         kmem_cache_free(net->cache, iocb);
> > > > +
> > > > +         if (unlikely(vq_log))
> > > > +                 vhost_log_write(vq, vq_log, log, size);
> > > > +         if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > > > +                 vhost_poll_queue(&vq->poll);
> > > > +                 break;
> > > > +         }
> > > > + }
> > > > +}
> > > > +
> > > > +static void handle_async_tx_events_notify(struct vhost_net *net,
> > > > +                                 struct vhost_virtqueue *vq)
> > > > +{
> > > > + struct kiocb *iocb = NULL;
> > > > + int tx_total_len = 0;
> > > > +
> > > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > > +         return;
> > > > +
> > > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > > +         vhost_add_used_and_signal(&net->dev, vq,
> > > > +                         iocb->ki_pos, 0);
> > > > +         tx_total_len += iocb->ki_nbytes;
> > > > +
> > > > +         if (iocb->ki_dtor)
> > > > +                 iocb->ki_dtor(iocb);
> > > > +
> > > > +         kmem_cache_free(net->cache, iocb);
> > > > +         if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> > > > +                 vhost_poll_queue(&vq->poll);
> > > > +                 break;
> > > > +         }
> > > > + }
> > > > +}
> > > > +
> > > >  /* Expects to be always run from workqueue - which acts as
> > > >   * read-size critical section for our kind of RCU. */
> > > >  static void handle_tx(struct vhost_net *net)
> > > >  {
> > > >   struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> > > > + struct kiocb *iocb = NULL;
> > > >   unsigned head, out, in, s;
> > > >   struct msghdr msg = {
> > > >           .msg_name = NULL,
> > > > @@ -124,6 +204,8 @@ static void handle_tx(struct vhost_net *net)
> > > >           tx_poll_stop(net);
> > > >   hdr_size = vq->hdr_size;
> > > >
> > > > + handle_async_tx_events_notify(net, vq);
> > > > +
> > > >   for (;;) {
> > > >           head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > >                                    ARRAY_SIZE(vq->iov),
> > > > @@ -151,6 +233,15 @@ static void handle_tx(struct vhost_net *net)
> > > >           /* Skip header. TODO: support TSO. */
> > > >           s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
> > > >           msg.msg_iovlen = out;
> > > > +
> > > > +         if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > > +                 iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > > > +                 if (!iocb)
> > > > +                         break;
> > > > +                 iocb->ki_pos = head;
> > > > +                 iocb->private = (void *)vq;
> > > > +         }
> > > > +
> > > >           len = iov_length(vq->iov, out);
> > > >           /* Sanity check */
> > > >           if (!len) {
> > > > @@ -160,12 +251,16 @@ static void handle_tx(struct vhost_net *net)
> > > >                   break;
> > > >           }
> > > >           /* TODO: Check specific error and bomb out unless ENOBUFS? */
> > > > -         err = sock->ops->sendmsg(NULL, sock, &msg, len);
> > > > +         err = sock->ops->sendmsg(iocb, sock, &msg, len);
> > > >           if (unlikely(err < 0)) {
> > > >                   vhost_discard_vq_desc(vq);
> > > >                   tx_poll_start(net, sock);
> > > >                   break;
> > > >           }
> > > > +
> > > > +         if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > > > +                 continue;
> > > > +
> > > >           if (err != len)
> > > >                   pr_err("Truncated TX packet: "
> > > >                          " len %d != %zd\n", err, len);
> > > > @@ -177,6 +272,8 @@ static void handle_tx(struct vhost_net *net)
> > > >           }
> > > >   }
> > > >
> > > > + handle_async_tx_events_notify(net, vq);
> > > > +
> > > >   mutex_unlock(&vq->mutex);
> > > >   unuse_mm(net->dev.mm);
> > > >  }
> > > > @@ -186,6 +283,7 @@ static void handle_tx(struct vhost_net *net)
> > > >  static void handle_rx(struct vhost_net *net)
> > > >  {
> > > >   struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > > > + struct kiocb *iocb = NULL;
> > > >   unsigned head, out, in, log, s;
> > > >   struct vhost_log *vq_log;
> > > >   struct msghdr msg = {
> > > > @@ -206,7 +304,8 @@ static void handle_rx(struct vhost_net *net)
> > > >   int err;
> > > >   size_t hdr_size;
> > > >   struct socket *sock = rcu_dereference(vq->private_data);
> > > > - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > > > + if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> > > > +                 vq->link_state == VHOST_VQ_LINK_SYNC))
> > > >           return;
> > > >
> > > >   use_mm(net->dev.mm);
> > > > @@ -214,9 +313,18 @@ static void handle_rx(struct vhost_net *net)
> > > >   vhost_disable_notify(vq);
> > > >   hdr_size = vq->hdr_size;
> > > >
> > > > - vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > > > + /* In async cases, for write logging, the simple way is to get
> > > > +  * the log info always, and really logging is decided later.
> > > > +  * Thus, when logging enabled, we can get log, and when logging
> > > > +  * disabled, we can get log disabled accordingly.
> > > > +  */
> > > > +
> > > > + vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
> > > > +         (vq->link_state == VHOST_VQ_LINK_ASYNC) ?
> > > >           vq->log : NULL;
> > > >
> > > > + handle_async_rx_events_notify(net, vq);
> > > > +
> > > >   for (;;) {
> > > >           head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > >                                    ARRAY_SIZE(vq->iov),
> > > > @@ -245,6 +353,14 @@ static void handle_rx(struct vhost_net *net)
> > > >           s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> > > >           msg.msg_iovlen = in;
> > > >           len = iov_length(vq->iov, in);
> > > > +         if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > > +                 iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > > > +                 if (!iocb)
> > > > +                         break;
> > > > +                 iocb->private = vq;
> > > > +                 iocb->ki_pos = head;
> > > > +                 iocb->ki_user_data = log;
> > > > +         }
> > > >           /* Sanity check */
> > > >           if (!len) {
> > > >                   vq_err(vq, "Unexpected header len for RX: "
> > > > @@ -252,13 +368,18 @@ static void handle_rx(struct vhost_net *net)
> > > >                          iov_length(vq->hdr, s), hdr_size);
> > > >                   break;
> > > >           }
> > > > -         err = sock->ops->recvmsg(NULL, sock, &msg,
> > > > +
> > > > +         err = sock->ops->recvmsg(iocb, sock, &msg,
> > > >                                    len, MSG_DONTWAIT | MSG_TRUNC);
> > > >           /* TODO: Check specific error and bomb out unless EAGAIN? */
> > > >           if (err < 0) {
> > > >                   vhost_discard_vq_desc(vq);
> > > >                   break;
> > > >           }
> > > > +
> > > > +         if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > > > +                 continue;
> > > > +
> > > >           /* TODO: Should check and handle checksum. */
> > > >           if (err > len) {
> > > >                   pr_err("Discarded truncated rx packet: "
> > > > @@ -284,10 +405,13 @@ static void handle_rx(struct vhost_net *net)
> > > >           }
> > > >   }
> > > >
> > > > + handle_async_rx_events_notify(net, vq);
> > > > +
> > > >   mutex_unlock(&vq->mutex);
> > > >   unuse_mm(net->dev.mm);
> > > >  }
> > > >
> > > > +
> > > >  static void handle_tx_kick(struct work_struct *work)
> > > >  {
> > > >   struct vhost_virtqueue *vq;
> > > > @@ -338,6 +462,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
> > > >   vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
> > > >   vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
> > > >   n->tx_poll_state = VHOST_NET_POLL_DISABLED;
> > > > + n->cache = NULL;
> > > >   return 0;
> > > >  }
> > > >
> > > > @@ -398,6 +523,17 @@ static void vhost_net_flush(struct vhost_net *n)
> > > >   vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
> > > >  }
> > > >
> > > > +static void vhost_notifier_cleanup(struct vhost_net *n)
> > > > +{
> > > > + struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
> > > > + struct kiocb *iocb = NULL;
> > > > + if (n->cache) {
> > > > +         while ((iocb = notify_dequeue(vq)) != NULL)
> > > > +                 kmem_cache_free(n->cache, iocb);
> > > > +         kmem_cache_destroy(n->cache);
> > > > + }
> > > > +}
> > > > +
> > > >  static int vhost_net_release(struct inode *inode, struct file *f)
> > > >  {
> > > >   struct vhost_net *n = f->private_data;
> > > > @@ -414,6 +550,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
> > > >   /* We do an extra flush before freeing memory,
> > > >    * since jobs can re-queue themselves. */
> > > >   vhost_net_flush(n);
> > > > + vhost_notifier_cleanup(n);
> > > >   kfree(n);
> > > >   return 0;
> > > >  }
> > > > @@ -462,7 +599,19 @@ static struct socket *get_tun_socket(int fd)
> > > >   return sock;
> > > >  }
> > > >
> > > > -static struct socket *get_socket(int fd)
> > > > +static struct socket *get_mp_socket(int fd)
> > > > +{
> > > > + struct file *file = fget(fd);
> > > > + struct socket *sock;
> > > > + if (!file)
> > > > +         return ERR_PTR(-EBADF);
> > > > + sock = mp_get_socket(file);
> > > > + if (IS_ERR(sock))
> > > > +         fput(file);
> > > > + return sock;
> > > > +}
> > > > +
> > > > +static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
> > > >  {
> > > >   struct socket *sock;
> > > >   if (fd == -1)
> > > > @@ -473,9 +622,31 @@ static struct socket *get_socket(int fd)
> > > >   sock = get_tun_socket(fd);
> > > >   if (!IS_ERR(sock))
> > > >           return sock;
> > > > + sock = get_mp_socket(fd);
> > > > + if (!IS_ERR(sock)) {
> > > > +         vq->link_state = VHOST_VQ_LINK_ASYNC;
> > > > +         return sock;
> > > > + }
> > > >   return ERR_PTR(-ENOTSOCK);
> > > >  }
> > > >
> > > > +static void vhost_init_link_state(struct vhost_net *n, int index)
> > > > +{
> > > > + struct vhost_virtqueue *vq = n->vqs + index;
> > > > +
> > > > + WARN_ON(!mutex_is_locked(&vq->mutex));
> > > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > > +         vq->receiver = NULL;
> > > > +         INIT_LIST_HEAD(&vq->notifier);
> > > > +         spin_lock_init(&vq->notify_lock);
> > > > +         if (!n->cache) {
> > > > +                 n->cache = kmem_cache_create("vhost_kiocb",
> > > > +                                 sizeof(struct kiocb), 0,
> > > > +                                 SLAB_HWCACHE_ALIGN, NULL);
> > > > +         }
> > > > + }
> > > > +}
> > > > +
> > > >  static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > >  {
> > > >   struct socket *sock, *oldsock;
> > > > @@ -493,12 +664,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > >   }
> > > >   vq = n->vqs + index;
> > > >   mutex_lock(&vq->mutex);
> > > > - sock = get_socket(fd);
> > > > + vq->link_state = VHOST_VQ_LINK_SYNC;
> > > > + sock = get_socket(vq, fd);
> > > >   if (IS_ERR(sock)) {
> > > >           r = PTR_ERR(sock);
> > > >           goto err;
> > > >   }
> > > >
> > > > + vhost_init_link_state(n, index);
> > > > +
> > > >   /* start polling new socket */
> > > >   oldsock = vq->private_data;
> > > >   if (sock == oldsock)
> > > > @@ -507,8 +681,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > >   vhost_net_disable_vq(n, vq);
> > > >   rcu_assign_pointer(vq->private_data, sock);
> > > >   vhost_net_enable_vq(n, vq);
> > > > - mutex_unlock(&vq->mutex);
> > > >  done:
> > > > + mutex_unlock(&vq->mutex);
> > > >   mutex_unlock(&n->dev.mutex);
> > > >   if (oldsock) {
> > > >           vhost_net_flush_vq(n, index);
> > > > @@ -516,6 +690,7 @@ done:
> > > >   }
> > > >   return r;
> > > >  err:
> > > > + mutex_unlock(&vq->mutex);
> > > >   mutex_unlock(&n->dev.mutex);
> > > >   return r;
> > > >  }
> > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > index d1f0453..cffe39a 100644
> > > > --- a/drivers/vhost/vhost.h
> > > > +++ b/drivers/vhost/vhost.h
> > > > @@ -43,6 +43,11 @@ struct vhost_log {
> > > >   u64 len;
> > > >  };
> > > >
> > > > +enum vhost_vq_link_state {
> > > > + VHOST_VQ_LINK_SYNC =    0,
> > > > + VHOST_VQ_LINK_ASYNC =   1,
> > > > +};
> > > > +
> > > >  /* The virtqueue structure describes a queue attached to a device. */
> > > >  struct vhost_virtqueue {
> > > >   struct vhost_dev *dev;
> > > > @@ -96,6 +101,11 @@ struct vhost_virtqueue {
> > > >   /* Log write descriptors */
> > > >   void __user *log_base;
> > > >   struct vhost_log log[VHOST_NET_MAX_SG];
> > > > + /*Differiate async socket for 0-copy from normal*/
> > > > + enum vhost_vq_link_state link_state;
> > > > + struct list_head notifier;
> > > > + spinlock_t notify_lock;
> > > > + void (*receiver)(struct vhost_virtqueue *);
> > > >  };
> > > >
> > > >  struct vhost_dev {
> > > > --
> > > > 1.5.4.4

^ permalink raw reply

* Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
From: Michael S. Tsirkin @ 2010-04-07  8:15 UTC (permalink / raw)
  To: Xin, Xiaohui
  Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mingo@elte.hu,
	jdike@c2.user-mode-linux.org, yzhao81@gmail.com
In-Reply-To: <97F6D3BD476C464182C1B7BABF0B0AF5C183234A@shzsmsx502.ccr.corp.intel.com>

On Wed, Apr 07, 2010 at 10:41:08AM +0800, Xin, Xiaohui wrote:
> Michael,
>  
> >> Qemu needs a userspace write, is that a synchronous one or
> >>asynchronous one?
> 
> >It's a synchronous non-blocking write.
> Sorry, why the Qemu live migration needs the device have a userspace write?
> how does the write operation work? And why a read operation is not cared here?
> 
> Thanks
> Xiaohui

Roughly, with ethernet bridges, moving a device from one location in
the network to another makes forwarding tables incorrect (or incomplete),
until outgoing traffic from the device causes these tables
to be updated. Since there's no guarantee that guest
will generate outgoing traffic, after migration qemu sends out several
dummy packets itself.

-- 
MST

^ permalink raw reply

* Re: patch to improve x.25 throughput negotiation
From: John Hughes @ 2010-04-07  8:39 UTC (permalink / raw)
  To: andrew hendry; +Cc: netdev
In-Reply-To: <v2jd45a3acc1004060509u88f32fa1va0c374154806535b@mail.gmail.com>

andrew hendry wrote:
> I have reproduced a few ways.
> 1. X25_MASK_THROUGHPUT on the x25_subscript_struct, then call
> SIOCX25SSUBSCRIP, then call SIOCX25FACILITIES without setting the
> throughput field. Call connect.
> 2. No subscrip setting, call SIOCX25FACILITIES without setting the
> throughput field. Call connect.
> 3. No subcrip, no facilities ioctl, call connect.
>
> The patch removes the bad facility and makes the router accept the
> call for the above cases.
> I don't currently have a setup to test both direction throughput negotiation.
>   
Summary: works for you (as far as you were able to test it).

Great news!


^ permalink raw reply

* Re:[PATCH 1/3] A device for zero-copy based on KVM virtio-net.
From: xiaohui.xin @ 2010-04-07  9:00 UTC (permalink / raw)
  To: mst; +Cc: netdev, kvm, linux-kernel, mingo, jdike, Xin Xiaohui
In-Reply-To: <20100407081532.GB9550@redhat.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

---

Michael,
Thanks a lot for the explanation. I have drafted a patch for the qemu write
after I looked into tun driver. Does it do in right way?

Thanks
Xiaohui

 drivers/vhost/mpassthru.c |   45 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index e9449ac..1cde097 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -1065,6 +1065,49 @@ static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
 	return mask;
 }
 
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct mp_struct *mp = mp_get(file->private_data);
+	struct sock *sk = mp->socket.sk;
+	struct sk_buff *skb;
+	int len, err;
+	ssize_t result;
+
+	if (!mp)
+		return -EBADFD;
+
+	/* currently, async is not supported */
+	if (!is_sync_kiocb(iocb))
+		return -EFAULT;
+
+	len = iov_length(iov, count);
+	skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+				  file->f_flags & O_NONBLOCK, &err);
+
+	if (!skb)
+		return -EFAULT;
+
+	skb_reserve(skb, NET_IP_ALIGN);
+	skb_put(skb, len);
+
+	if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+		kfree_skb(skb);
+		return -EFAULT;
+	}
+	skb_set_network_header(skb, ETH_HLEN);
+	skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
+	skb->dev = mp->dev;
+
+	dev_queue_xmit(skb);
+	mp->dev->stats.tx_packets++;
+	mp->dev->stats.tx_bytes += len;
+
+	mp_put(mp);
+	return result;
+}
+
 static int mp_chr_close(struct inode *inode, struct file *file)
 {
 	struct mp_file *mfile = file->private_data;
@@ -1084,6 +1127,8 @@ static int mp_chr_close(struct inode *inode, struct file *file)
 static const struct file_operations mp_fops = {
 	.owner  = THIS_MODULE,
 	.llseek = no_llseek,
+	.write  = do_sync_write,
+	.aio_write = mp_chr_aio_write,
 	.poll   = mp_chr_poll,
 	.unlocked_ioctl = mp_chr_ioctl,
 	.open   = mp_chr_open,
-- 
1.5.4.4


^ permalink raw reply related

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Zhang, Yanmin @ 2010-04-07  9:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Christoph Lameter, netdev, Tejun Heo, Pekka Enberg, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <1270622352.2091.702.camel@edumazet-laptop>

On Wed, 2010-04-07 at 08:39 +0200, Eric Dumazet wrote:
> Le mercredi 07 avril 2010 à 10:34 +0800, Zhang, Yanmin a écrit :
> 
> > I collected retired instruction, dtlb miss and LLC miss.
> > Below is data of LLC miss.
> > 
> > Kernel 2.6.33:
> > # Samples: 11639436896 LLC-load-misses
> > #
> > # Overhead          Command                                           Shared Object  Symbol
> > # ........  ...............  ......................................................  ......
> > #
> >     20.94%        hackbench  [kernel.kallsyms]                                       [k] copy_user_generic_string
> >     14.56%        hackbench  [kernel.kallsyms]                                       [k] unix_stream_recvmsg
> >     12.88%        hackbench  [kernel.kallsyms]                                       [k] kfree
> >      7.37%        hackbench  [kernel.kallsyms]                                       [k] kmem_cache_free
> >      7.18%        hackbench  [kernel.kallsyms]                                       [k] kmem_cache_alloc_node
> >      6.78%        hackbench  [kernel.kallsyms]                                       [k] kfree_skb
> >      6.27%        hackbench  [kernel.kallsyms]                                       [k] __kmalloc_node_track_caller
> >      2.73%        hackbench  [kernel.kallsyms]                                       [k] __slab_free
> >      2.21%        hackbench  [kernel.kallsyms]                                       [k] get_partial_node
> >      2.01%        hackbench  [kernel.kallsyms]                                       [k] _raw_spin_lock
> >      1.59%        hackbench  [kernel.kallsyms]                                       [k] schedule
> >      1.27%        hackbench  hackbench                                               [.] receiver
> >      0.99%        hackbench  libpthread-2.9.so                                       [.] __read
> >      0.87%        hackbench  [kernel.kallsyms]                                       [k] unix_stream_sendmsg
> > 
> > 
> > 
> > 
> > Kernel 2.6.34-rc3:
> > # Samples: 13079611308 LLC-load-misses
> > #
> > # Overhead          Command                                                         Shared Object  Symbol
> > # ........  ...............  ....................................................................  ......
> > #
> >     18.55%        hackbench  [kernel.kallsyms]                                                     [k] copy_user_generic_str
> > ing
> >     13.19%        hackbench  [kernel.kallsyms]                                                     [k] unix_stream_recvmsg
> >     11.62%        hackbench  [kernel.kallsyms]                                                     [k] kfree
> >      8.54%        hackbench  [kernel.kallsyms]                                                     [k] kmem_cache_free
> >      7.88%        hackbench  [kernel.kallsyms]                                                     [k] __kmalloc_node_track_
> > caller
> >      6.54%        hackbench  [kernel.kallsyms]                                                     [k] kmem_cache_alloc_node
> >      5.94%        hackbench  [kernel.kallsyms]                                                     [k] kfree_skb
> >      3.48%        hackbench  [kernel.kallsyms]                                                     [k] __slab_free
> >      2.15%        hackbench  [kernel.kallsyms]                                                     [k] _raw_spin_lock
> >      1.83%        hackbench  [kernel.kallsyms]                                                     [k] schedule
> >      1.82%        hackbench  [kernel.kallsyms]                                                     [k] get_partial_node
> >      1.59%        hackbench  hackbench                                                             [.] receiver
> >      1.37%        hackbench  libpthread-2.9.so                                                     [.] __read
> > 
> > 
> 
> Please check values of /proc/sys/net/core/rmem_default
> and /proc/sys/net/core/wmem_default on your machines.
> 
> Their values can also change hackbench results, because increasing
> wmem_default allows af_unix senders to consume much more skbs and stress
> slab allocators (__slab_free), way beyond slub_min_order can tune them.
> 
> When 2000 senders are running (and 2000 receivers), we might consume
> something like 2000 * 100.000 bytes of kernel memory for skbs. TLB
> trashing is expected, because all these skbs can span many 2MB pages.
> Maybe some node imbalance happens too.
It's a good pointer. rmem_default and wmem_default are about 116k on my machine.
I changed them to 52K and it seems there is no improvement.

> 
> 
> 
> You could try to boot your machine with less ram per node and check :
> 
> # cat /proc/buddyinfo 
> Node 0, zone      DMA      2      1      2      2      1      1      1      0      1      1      3 
> Node 0, zone    DMA32    219    298    143    584    145     57     44     41     31     26    517 
> Node 1, zone    DMA32      4      1     17      1      0      3      2      2      2      2    123 
> Node 1, zone   Normal    126    169     83      8      7      5     59     59     49     28    459 
> 
> 
> One experiment on your Nehalem machine would be to change hackbench so
> that each group (20 senders/ 20 receivers) run on a particular NUMA
> node.
I expect process scheduler to work well in scheduling different groups
to different nodes.

I suspected dynamic percpu data didn't take care of NUMA, but kernel dump shows
it does take care of NUMA.

> 
> x86info -c ->
> 
> CPU #1
> EFamily: 0 EModel: 1 Family: 6 Model: 26 Stepping: 5
> CPU Model: Core i7 (Nehalem)
> Processor name string: Intel(R) Xeon(R) CPU           X5570  @ 2.93GHz
> Type: 0 (Original OEM)	Brand: 0 (Unsupported)
> Number of cores per physical package=8
> Number of logical processors per socket=16
> Number of logical processors per core=2
> APIC ID: 0x10	Package: 0  Core: 1   SMT ID 0
> Cache info
>  L1 Instruction cache: 32KB, 4-way associative. 64 byte line size.
>  L1 Data cache: 32KB, 8-way associative. 64 byte line size.
>  L2 (MLC): 256KB, 8-way associative. 64 byte line size.
> TLB info
>  Data TLB: 4KB pages, 4-way associative, 64 entries
>  64 byte prefetching.
> Found unknown cache descriptors: 55 5a b2 ca e4 
> 
> 



^ permalink raw reply

* Re: [PATCH 0/7] au1000-eth cleanups
From: David Miller @ 2010-04-07  9:09 UTC (permalink / raw)
  To: florian; +Cc: netdev
In-Reply-To: <201004071008.49457.florian@openwrt.org>

From: Florian Fainelli <florian@openwrt.org>
Date: Wed, 7 Apr 2010 10:08:49 +0200

> Hi David,
> 
> Here comes a serie of 7 cleanups and small enhancements for the Au1000 
> ethernet driver.

All applied to net-next-2.6, thanks Florian.

^ permalink raw reply

* Re: [PATCH 1/4] flow: virtualize flow cache entry methods
From: David Miller @ 2010-04-07  9:15 UTC (permalink / raw)
  To: timo.teras; +Cc: herbert, netdev
In-Reply-To: <4BBB3684.8070807@iki.fi>

From: Timo Teräs <timo.teras@iki.fi>
Date: Tue, 06 Apr 2010 16:26:28 +0300

> As noticed in review of 2/4, this needs to be fixed by calling
> flow object delete() unconditionally if genid is outdated. I'll
> repost with this fixed for next iteration.

I went over the current versions of these patches and
the general scheme looks fine to me.

Please resubmit a new set of patches with the final fixes to the genid
delete handlign.

Thanks a lot for doing this work!

^ permalink raw reply

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Eric Dumazet @ 2010-04-07  9:20 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: Christoph Lameter, netdev, Tejun Heo, Pekka Enberg, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <1270631267.2078.380.camel@ymzhang.sh.intel.com>

Le mercredi 07 avril 2010 à 17:07 +0800, Zhang, Yanmin a écrit :
> > 
> > One experiment on your Nehalem machine would be to change hackbench so
> > that each group (20 senders/ 20 receivers) run on a particular NUMA
> > node.
> I expect process scheduler to work well in scheduling different groups
> to different nodes.
> 
> I suspected dynamic percpu data didn't take care of NUMA, but kernel dump shows
> it does take care of NUMA.
> 

hackbench allocates all unix sockets on one single node, then
forks/spans its children.

Thats huge node imbalance.

You can see this with lsof on a running hackbench :


# lsof -p 14802
COMMAND     PID USER   FD   TYPE             DEVICE    SIZE     NODE NAME
hackbench 14802 root  cwd    DIR              104,7    4096 12927240 /data/src/linux-2.6
hackbench 14802 root  rtd    DIR              104,2    4096        2 /
hackbench 14802 root  txt    REG              104,2   17524   697317 /usr/bin/hackbench
hackbench 14802 root  mem    REG              104,2  112212   558042 /lib/ld-2.3.4.so
hackbench 14802 root  mem    REG              104,2 1547588   558043 /lib/tls/libc-2.3.4.so
hackbench 14802 root  mem    REG              104,2  107928   557058 /lib/tls/libpthread-2.3.4.so
hackbench 14802 root  mem    REG                0,0                0 [heap] (stat: No such file or directory)
hackbench 14802 root    0u   CHR              136,0                3 /dev/pts/0
hackbench 14802 root    1u   CHR              136,0                3 /dev/pts/0
hackbench 14802 root    2u   CHR              136,0                3 /dev/pts/0
hackbench 14802 root    3u  unix 0xffff8800ac0da100            28939 socket
hackbench 14802 root    4u  unix 0xffff8800ac0da400            28940 socket
hackbench 14802 root    5u  unix 0xffff8800ac0da700            28941 socket
hackbench 14802 root    6u  unix 0xffff8800ac0daa00            28942 socket
hackbench 14802 root    8u  unix 0xffff8800aeac1800            28984 socket
hackbench 14802 root    9u  unix 0xffff8800aeac1e00            28986 socket
hackbench 14802 root   10u  unix 0xffff8800aeac2400            28988 socket
hackbench 14802 root   11u  unix 0xffff8800aeac2a00            28990 socket
hackbench 14802 root   12u  unix 0xffff8800aeac3000            28992 socket
hackbench 14802 root   13u  unix 0xffff8800aeac3600            28994 socket
hackbench 14802 root   14u  unix 0xffff8800aeac3c00            28996 socket
hackbench 14802 root   15u  unix 0xffff8800aeac4200            28998 socket
hackbench 14802 root   16u  unix 0xffff8800aeac4800            29000 socket
hackbench 14802 root   17u  unix 0xffff8800aeac4e00            29002 socket
hackbench 14802 root   18u  unix 0xffff8800aeac5400            29004 socket
hackbench 14802 root   19u  unix 0xffff8800aeac5a00            29006 socket
hackbench 14802 root   20u  unix 0xffff8800aeac6000            29008 socket
hackbench 14802 root   21u  unix 0xffff8800aeac6600            29010 socket
hackbench 14802 root   22u  unix 0xffff8800aeac6c00            29012 socket
hackbench 14802 root   23u  unix 0xffff8800aeac7200            29014 socket
hackbench 14802 root   24u  unix 0xffff8800aeac0f00            29016 socket
hackbench 14802 root   25u  unix 0xffff8800aeac0900            29018 socket
hackbench 14802 root   26u  unix 0xffff8800aeac7b00            29020 socket
hackbench 14802 root   27u  unix 0xffff8800aeac7500            29022 socket

All sockets structures (where all _hot_ locks reside) are on a single node.

^ permalink raw reply

* [patch] stmmac: use resource_size()
From: Dan Carpenter @ 2010-04-07  9:35 UTC (permalink / raw)
  To: Giuseppe Cavallaro
  Cc: Joe Perches, Tejun Heo, Christoph Lameter, netdev,
	kernel-janitors, David S. Miller

Resource size should be calculated as end - start + 1 because we start
counting at zero.  I changed the code to resource_size() to do the 
calculation.

Signed-off-by: Dan Carpenter <error27@gmail.com>

diff --git a/drivers/net/stmmac/stmmac_main.c b/drivers/net/stmmac/stmmac_main.c
index a214a16..4111a85 100644
--- a/drivers/net/stmmac/stmmac_main.c
+++ b/drivers/net/stmmac/stmmac_main.c
@@ -1686,7 +1686,7 @@ static int stmmac_dvr_probe(struct platform_device *pdev)
 	}
 	pr_info("done!\n");
 
-	if (!request_mem_region(res->start, (res->end - res->start),
+	if (!request_mem_region(res->start, resource_size(res),
 				pdev->name)) {
 		pr_err("%s: ERROR: memory allocation failed"
 		       "cannot get the I/O addr 0x%x\n",
@@ -1695,9 +1695,9 @@ static int stmmac_dvr_probe(struct platform_device *pdev)
 		goto out;
 	}
 
-	addr = ioremap(res->start, (res->end - res->start));
+	addr = ioremap(res->start, resource_size(res));
 	if (!addr) {
-		pr_err("%s: ERROR: memory mapping failed \n", __func__);
+		pr_err("%s: ERROR: memory mapping failed\n", __func__);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -1775,7 +1775,7 @@ static int stmmac_dvr_probe(struct platform_device *pdev)
 out:
 	if (ret < 0) {
 		platform_set_drvdata(pdev, NULL);
-		release_mem_region(res->start, (res->end - res->start));
+		release_mem_region(res->start, resource_size(res));
 		if (addr != NULL)
 			iounmap(addr);
 	}
@@ -1813,7 +1813,7 @@ static int stmmac_dvr_remove(struct platform_device *pdev)
 
 	iounmap((void *)ndev->base_addr);
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	release_mem_region(res->start, (res->end - res->start));
+	release_mem_region(res->start, resource_size(res));
 
 	free_netdev(ndev);
 

^ permalink raw reply related

* Re: [PATCH v3 0/4] xfrm: add x86 CONFIG_COMPAT support
From: David Miller @ 2010-04-07 10:08 UTC (permalink / raw)
  To: fw; +Cc: netdev, johannes
In-Reply-To: <1270506431-25578-1-git-send-email-fw@strlen.de>

From: Florian Westphal <fw@strlen.de>
Date: Tue,  6 Apr 2010 00:27:07 +0200

> This has the consequence that compat support is restricted to
> applications that use sendmsg() to talk to the kernel (e.g. iproute2
> would work); because write() will not go through the socket compat
> layer (and thus, the MSG_CMSG_COMPAT won't be set on messages sent
> by means of write() ).
> 
> I sent a patch that solved this by adding a sys_compat_write syscall
> and a ->compat_aio_write() to struct file_operations to the
> vfs mailing list, but that patch was ignored by the vfs people,
> and the x86 folks did not exactly like the idea either.
> 
> So this leaves three alternatives:
> 1 - drop the whole idea and keep the current status.
> 2 - Add new structure definitions (with new numbering) that would work
>     everywhere, keep the old ones for backwards compatibility (This
>     was suggested by Arnd Bergmann).
> 3 - apply this patch set and tell userspace to move the sendmsg() when
>     they want to work with xfrm on x86_64 with 32 bit userland.
> 
> Other than that, I am out of ideas.

So do we know of any xfrm netlink apps that do not use sendmsg()?

I doubt there are any, and if that's true for the most part, then
option #3 seems the best.

We simply can't ignore all of the 32-bit xfrm netlink app binaries out
there and pretend they don't exist.  So #2 doesn't make any sense to
me at all.

And #1, not trying to make it work at all, to me is just as bad
as #2.

If we find a non-trivial number of apps using plain write() then
we might have to consider championing your vfs patch to the
lkml and vfs folks again.  I'll help if this is needed.

Please do some quick research on xfrm netlink sendmsg() vs.
write() usage and based upon your findings we'll act.

Thanks for keeping this work alive.

^ permalink raw reply

* [RFC PATCH net-next 1/4] gianfar: Added stub support for SIOCSHWTSTAMP
From: Manfred Rudigier @ 2010-04-07  9:46 UTC (permalink / raw)
  To: 'sandeep.kumar@freescale.com'
  Cc: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'

This ioctl command is required for enabling hardware time stamping support
for network packets, see Documentation/networking/timestamping.txt. At the
moment nothing will be done for all requests that enable time stamping and
thus ERANGE will be returned.

Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>
---
 drivers/net/gianfar.c |   36 ++++++++++++++++++++++++++++++++++++
 1 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 080d1ce..309bab0 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -82,6 +82,7 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/in.h>
+#include <linux/net_tstamp.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -772,6 +773,38 @@ err_grp_init:
 	return err;
 }
 
+static int gfar_hwtstamp_ioctl(struct net_device *netdev,
+			struct ifreq *ifr, int cmd)
+{
+	struct hwtstamp_config config;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	/* reserved for future extensions */
+	if (config.flags)
+		return -EINVAL;
+
+	switch (config.tx_type) {
+	case HWTSTAMP_TX_OFF:
+		break;
+	case HWTSTAMP_TX_ON:
+		return -ERANGE;
+	default:
+		return -ERANGE;
+	}
+
+	switch (config.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
+		-EFAULT : 0;
+}
+
 /* Ioctl MII Interface */
 static int gfar_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
@@ -780,6 +813,9 @@ static int gfar_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!netif_running(dev))
 		return -EINVAL;
 
+	if (cmd == SIOCSHWTSTAMP)
+		return gfar_hwtstamp_ioctl(dev, rq, cmd);
+
 	if (!priv->phydev)
 		return -ENODEV;
 
-- 
1.6.3.3

^ permalink raw reply related

* [RFC PATCH net-next 2/4] gianfar: Added timer feature for eTSEC
From: Manfred Rudigier @ 2010-04-07  9:46 UTC (permalink / raw)
  To: 'sandeep.kumar@freescale.com'
  Cc: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'

The timer clock module is an intrinsic feature of every eTSEC. It supports
hardware time stamping of all incoming and outgoing network packets. This
patch checks if the underlying hardware is an eTSEC and adds the
new FSL_GIANFAR_DEV_HAS_TIMER flag to the device flags. This flag is then
used in the SIOCSHWTSTAMP ioctl command to determine if HW time stamping
support is available.

Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>
---
 drivers/net/gianfar.c |   12 +++++++++++-
 drivers/net/gianfar.h |    3 +++
 2 files changed, 14 insertions(+), 1 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 309bab0..41e7726 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -743,7 +743,8 @@ static int gfar_of_init(struct of_device *ofdev, struct net_device **pdev)
 			FSL_GIANFAR_DEV_HAS_CSUM |
 			FSL_GIANFAR_DEV_HAS_VLAN |
 			FSL_GIANFAR_DEV_HAS_MAGIC_PACKET |
-			FSL_GIANFAR_DEV_HAS_EXTENDED_HASH;
+			FSL_GIANFAR_DEV_HAS_EXTENDED_HASH |
+			FSL_GIANFAR_DEV_HAS_TIMER;
 
 	ctype = of_get_property(np, "phy-connection-type", NULL);
 
@@ -777,6 +778,7 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 			struct ifreq *ifr, int cmd)
 {
 	struct hwtstamp_config config;
+	struct gfar_private *priv = netdev_priv(netdev);
 
 	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
 		return -EFAULT;
@@ -787,8 +789,12 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 
 	switch (config.tx_type) {
 	case HWTSTAMP_TX_OFF:
+		priv->hwts_tx_en = 0;
 		break;
 	case HWTSTAMP_TX_ON:
+		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
+			return -ERANGE;
+		priv->hwts_tx_en = 1;
 		return -ERANGE;
 	default:
 		return -ERANGE;
@@ -796,8 +802,12 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 
 	switch (config.rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
+		priv->hwts_rx_en = 0;
 		break;
 	default:
+		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
+			return -ERANGE;
+		priv->hwts_rx_en = 1;
 		return -ERANGE;
 	}
 
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 17d25e7..380ea48 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -885,6 +885,7 @@ struct gfar {
 #define FSL_GIANFAR_DEV_HAS_MAGIC_PACKET	0x00000100
 #define FSL_GIANFAR_DEV_HAS_BD_STASHING		0x00000200
 #define FSL_GIANFAR_DEV_HAS_BUF_STASHING	0x00000400
+#define FSL_GIANFAR_DEV_HAS_TIMER		0x00000800
 
 #if (MAXGROUPS == 2)
 #define DEFAULT_MAPPING 	0xAA
@@ -1084,6 +1085,8 @@ struct gfar_private {
 		extended_hash:1,
 		bd_stash_en:1,
 		rx_filer_enable:1,
+		hwts_tx_en:1, /* HW time stamping enabled for TX packets */
+		hwts_rx_en:1, /* HW time stamping enabled for RX packets */
 		wol_en:1; /* Wake-on-LAN enabled */
 	unsigned short padding;
 
-- 
1.6.3.3

^ permalink raw reply related

* [RFC PATCH net-next 3/4] gianfar: Added raw hardware receive time stamp generation
From: Manfred Rudigier @ 2010-04-07  9:46 UTC (permalink / raw)
  To: 'sandeep.kumar@freescale.com'
  Cc: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'

This patch configures the eTSEC to insert time stamps into all received
packets as padding alignment bytes. During the clean_rx_ring operation
these raw time stamps are extracted and copied into the
skb_shared_hwtstamps struct of the skb if required.

Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>
---
 drivers/net/gianfar.c |   30 +++++++++++++++++++++++++-----
 drivers/net/gianfar.h |    1 +
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 41e7726..9119879 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -378,6 +378,13 @@ static void gfar_init_mac(struct net_device *ndev)
 		rctrl |= RCTRL_PADDING(priv->padding);
 	}
 
+	/* Insert receive time stamps into padding alignment bytes */
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER) {
+		rctrl &= ~RCTRL_PAL_MASK;
+		rctrl |= RCTRL_PRSDEP_INIT | RCTRL_TS_ENABLE | RCTRL_PADDING(8);
+		priv->padding = 8;
+	}
+
 	/* keep vlan related bits if it's enabled */
 	if (priv->vlgrp) {
 		rctrl |= RCTRL_VLEX | RCTRL_PRSDEP_INIT;
@@ -502,7 +509,8 @@ void unlock_tx_qs(struct gfar_private *priv)
 /* Returns 1 if incoming frames use an FCB */
 static inline int gfar_uses_fcb(struct gfar_private *priv)
 {
-	return priv->vlgrp || priv->rx_csum_enable;
+	return priv->vlgrp || priv->rx_csum_enable ||
+		(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER);
 }
 
 static void free_tx_pointers(struct gfar_private *priv)
@@ -808,7 +816,8 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
 			return -ERANGE;
 		priv->hwts_rx_en = 1;
-		return -ERANGE;
+		config.rx_filter = HWTSTAMP_FILTER_ALL;
+		break;
 	}
 
 	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
@@ -1028,7 +1037,8 @@ static int gfar_probe(struct of_device *ofdev,
 	else
 		priv->padding = 0;
 
-	if (dev->features & NETIF_F_IP_CSUM)
+	if (dev->features & NETIF_F_IP_CSUM ||
+			priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER)
 		dev->hard_header_len += GMAC_FCB_LEN;
 
 	/* Program the isrg regs only if number of grps > 1 */
@@ -2520,6 +2530,17 @@ static int gfar_process_frame(struct net_device *dev, struct sk_buff *skb,
 		skb_pull(skb, amount_pull);
 	}
 
+	/* Get receive timestamp from the skb */
+	if (priv->hwts_rx_en) {
+		struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+		u64 *ns = (u64 *) skb->data;
+		memset(shhwtstamps, 0, sizeof(*shhwtstamps));
+		shhwtstamps->hwtstamp = ns_to_ktime(*ns);
+	}
+
+	if (priv->padding)
+		skb_pull(skb, priv->padding);
+
 	if (priv->rx_csum_enable)
 		gfar_rx_checksum(skb, fcb);
 
@@ -2556,8 +2577,7 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit)
 	bdp = rx_queue->cur_rx;
 	base = rx_queue->rx_bd_base;
 
-	amount_pull = (gfar_uses_fcb(priv) ? GMAC_FCB_LEN : 0) +
-		priv->padding;
+	amount_pull = (gfar_uses_fcb(priv) ? GMAC_FCB_LEN : 0);
 
 	while (!((bdp->status & RXBD_EMPTY) || (--rx_work_limit < 0))) {
 		struct sk_buff *newskb;
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 380ea48..cba2756 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -262,6 +262,7 @@ extern const char gfar_driver_version[];
 
 #define next_bd(bdp, base, ring_size) skip_bd(bdp, 1, base, ring_size)
 
+#define RCTRL_TS_ENABLE 	0x01000000
 #define RCTRL_PAL_MASK		0x001f0000
 #define RCTRL_VLEX		0x00002000
 #define RCTRL_FILREN		0x00001000
-- 
1.6.3.3

^ permalink raw reply related

* [RFC PATCH net-next 0/4] timestamping support for gianfar
From: Manfred Rudigier @ 2010-04-07  9:45 UTC (permalink / raw)
  To: 'sandeep.kumar@freescale.com'
  Cc: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'

Hello,

this patch series adds support for hardware time stamping to gianfar. It uses
the new SO_TIMESTAMPING infrastructure to deliver raw hardware timestamps to
user space applications.

Freescale CPUs with an eTSEC are able to time stamp all incoming network packets
and can also time stamp transmit packets when instructed. The time stamps are
generated by the eTSEC timer clock module which is running either from an
external oscillator or internal clock. 

The submitted patches do not initialize the timer clock module since the
oscillator frequency might be different from board to board. Thus the user 
must configure the timer clock module by hand at the moment - otherwise no
time stamps will be reported. Below is a simple example code which 
shows how to configure the timer clock module on the P2020DS/RDB. It can be
used to quickly try out the patches.

Testing was done with the time stamping program from Patrick Ohly which can
be found in the kernel sources under Documentation/networking/timestamping.
I have verified the functionality on the MPC8313RDB, P2020DS and P2020RDB 
board with the latest net-2.6 kernel. Send and receive time stamps could be 
retrieved on all eTSEC ports.

Comments and suggestions are welcome.

Thanks,
Manfred

/**
 * @file etsec_tmr.c
 *
 * This simple kernel module demonstrates how to initialize the eTSEC timer
 * clock module for hardware timestamping on the P2020. It uses the eTSEC
 * internal clock (300Mhz) as clock source and programs the timer clock module
 * to count in nanoseconds. The timer resolution is 5ns. Further it configures
 * the eTSEC to insert transmit time stamps into the packet data after sending.
 *
 * For testing the timestamping.c program from the Linux kernel sources under
 * Documentation/networking/timestamping can be used. Time stamps will not be
 * reported until this module has been loaded.
 *
 * Usage example:
 *
 * [root@p2020ds root]# insmod etsec_tmr.ko
 * [root@p2020ds root]# ./timestamping eth0 SOF_TIMESTAMPING_TX_HARDWARE
 * SOF_TIMESTAMPING_RX_HARDWARE SOF_TIMESTAMPING_RAW_HARDWARE
 *
 * Copyright (C) 2010 OMICRON electronics
 * Author: Manfred Rudigier <manfred.rudigier@omicron.at>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
MODULE_LICENSE("GPL");

#define TMR_BASE   0xffe24e00 /* Timer base address of P2020 */
#define REG_SIZE   0x000000b0 /* Timer register size */
#define TMR_CTRL   0x00000000 /* Timer control register */
#define TMR_ADD    0x00000020 /* Timer drift compensation addend register */
#define TMR_PRSC   0x00000028 /* Timer prescale register */

static void* regs;

static int etsec_tmr_init(void)
{
	printk(KERN_INFO "etsec tmr init\n");
	if (!request_mem_region(TMR_BASE, REG_SIZE, "etsec_tmr")) {
		printk(KERN_ERR "request_mem_region failed");
		return -1;
	}
	regs = ioremap(TMR_BASE, REG_SIZE);
	if (!regs) {
		printk(KERN_ERR "ioremap failed");
		release_mem_region(TMR_BASE, REG_SIZE);
		return -1;
	}

	out_be32(regs + TMR_ADD, 0xaaaaaaab);
	out_be32(regs + TMR_PRSC, 200);
	out_be32(regs + TMR_CTRL, 0x00058005);
	return 0;
}

static void etsec_tmr_exit(void)
{
	out_be32(regs + TMR_CTRL, 0x00010001);
	iounmap(regs);
	release_mem_region(TMR_BASE, REG_SIZE);
	printk(KERN_INFO "etsec tmr release\n");
}

module_init(etsec_tmr_init);
module_exit(etsec_tmr_exit);

^ permalink raw reply

* [RFC PATCH net-next 4/4] gianfar: Added raw hardware transmit time stamp generation
From: Manfred Rudigier @ 2010-04-07  9:46 UTC (permalink / raw)
  To: 'sandeep.kumar@freescale.com'
  Cc: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'

This patch configures the eTSEC to time stamps outgoing packets that have
the skb_shared_tx->hardware flag set. The eTSEC is configured to write the
time stamps back to memory after the frame is transmitted. During the
clean_tx_ring operation these time stamps will be extracted and copied
into the skb_shared_hwtstamps struct of the skb if required.

Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>
---
 drivers/net/gianfar.c |  108 ++++++++++++++++++++++++++++++++++++++++---------
 drivers/net/gianfar.h |    2 +-
 2 files changed, 90 insertions(+), 20 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 9119879..becc3f3 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -803,7 +803,7 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
 			return -ERANGE;
 		priv->hwts_tx_en = 1;
-		return -ERANGE;
+		break;
 	default:
 		return -ERANGE;
 	}
@@ -1982,23 +1982,29 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct netdev_queue *txq;
 	struct gfar __iomem *regs = NULL;
 	struct txfcb *fcb = NULL;
-	struct txbd8 *txbdp, *txbdp_start, *base;
+	struct txbd8 *txbdp, *txbdp_start, *base, *txbdp_tstamp = NULL;
 	u32 lstatus;
-	int i, rq = 0;
+	int i, rq = 0, do_tstamp = 0;
 	u32 bufaddr;
 	unsigned long flags;
-	unsigned int nr_frags, length;
-
+	unsigned int nr_frags, nr_txbds, length;
+	union skb_shared_tx *shtx;
 
 	rq = skb->queue_mapping;
 	tx_queue = priv->tx_queue[rq];
 	txq = netdev_get_tx_queue(dev, rq);
 	base = tx_queue->tx_bd_base;
 	regs = tx_queue->grp->regs;
+	shtx = skb_tx(skb);
+
+	/* check if time stamp should be generated */
+	if (unlikely(shtx->hardware && priv->hwts_tx_en))
+		do_tstamp = 1;
 
 	/* make space for additional header when fcb is needed */
 	if (((skb->ip_summed == CHECKSUM_PARTIAL) ||
-			(priv->vlgrp && vlan_tx_tag_present(skb))) &&
+			(priv->vlgrp && vlan_tx_tag_present(skb)) ||
+			unlikely(do_tstamp)) &&
 			(skb_headroom(skb) < GMAC_FCB_LEN)) {
 		struct sk_buff *skb_new;
 
@@ -2015,8 +2021,14 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* total number of fragments in the SKB */
 	nr_frags = skb_shinfo(skb)->nr_frags;
 
+	/* calculate the required number of TxBDs for this skb */
+	if (unlikely(do_tstamp))
+		nr_txbds = nr_frags + 2;
+	else
+		nr_txbds = nr_frags + 1;
+
 	/* check if there is space to queue this packet */
-	if ((nr_frags+1) > tx_queue->num_txbdfree) {
+	if (nr_txbds > tx_queue->num_txbdfree) {
 		/* no space, stop the queue */
 		netif_tx_stop_queue(txq);
 		dev->stats.tx_fifo_errors++;
@@ -2028,9 +2040,19 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	txq->tx_packets ++;
 
 	txbdp = txbdp_start = tx_queue->cur_tx;
+	lstatus = txbdp->lstatus;
+
+	/* Time stamp insertion requires one additional TxBD */
+	if (unlikely(do_tstamp))
+		txbdp_tstamp = txbdp = next_txbd(txbdp, base,
+				tx_queue->tx_ring_size);
 
 	if (nr_frags == 0) {
-		lstatus = txbdp->lstatus | BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT);
+		if (unlikely(do_tstamp))
+			txbdp_tstamp->lstatus |= BD_LFLAG(TXBD_LAST |
+					TXBD_INTERRUPT);
+		else
+			lstatus |= BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT);
 	} else {
 		/* Place the fragment addresses and lengths into the TxBDs */
 		for (i = 0; i < nr_frags; i++) {
@@ -2076,11 +2098,32 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		gfar_tx_vlan(skb, fcb);
 	}
 
-	/* setup the TxBD length and buffer pointer for the first BD */
+	/* Setup tx hardware time stamping if requested */
+	if (unlikely(do_tstamp)) {
+		shtx->in_progress = 1;
+		if (fcb == NULL)
+			fcb = gfar_add_fcb(skb);
+		fcb->ptp = 1;
+		lstatus |= BD_LFLAG(TXBD_TOE);
+	}
+
 	txbdp_start->bufPtr = dma_map_single(&priv->ofdev->dev, skb->data,
 			skb_headlen(skb), DMA_TO_DEVICE);
 
-	lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | skb_headlen(skb);
+	/*
+	 * If time stamping is requested one additional TxBD must be set up. The
+	 * first TxBD points to the FCB and must have a data length of
+	 * GMAC_FCB_LEN. The second TxBD points to the actual frame data with
+	 * the full frame length.
+	 */
+	if (unlikely(do_tstamp)) {
+		txbdp_tstamp->bufPtr = txbdp_start->bufPtr + GMAC_FCB_LEN;
+		txbdp_tstamp->lstatus |= BD_LFLAG(TXBD_READY) |
+				(skb_headlen(skb) - GMAC_FCB_LEN);
+		lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | GMAC_FCB_LEN;
+	} else {
+		lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | skb_headlen(skb);
+	}
 
 	/*
 	 * We can work in parallel with gfar_clean_tx_ring(), except
@@ -2120,7 +2163,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	tx_queue->cur_tx = next_txbd(txbdp, base, tx_queue->tx_ring_size);
 
 	/* reduce TxBD free count */
-	tx_queue->num_txbdfree -= (nr_frags + 1);
+	tx_queue->num_txbdfree -= (nr_txbds);
 
 	dev->trans_start = jiffies;
 
@@ -2311,16 +2354,18 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 	struct net_device *dev = tx_queue->dev;
 	struct gfar_private *priv = netdev_priv(dev);
 	struct gfar_priv_rx_q *rx_queue = NULL;
-	struct txbd8 *bdp;
+	struct txbd8 *bdp, *next = NULL;
 	struct txbd8 *lbdp = NULL;
 	struct txbd8 *base = tx_queue->tx_bd_base;
 	struct sk_buff *skb;
 	int skb_dirtytx;
 	int tx_ring_size = tx_queue->tx_ring_size;
-	int frags = 0;
+	int frags = 0, nr_txbds = 0;
 	int i;
 	int howmany = 0;
 	u32 lstatus;
+	size_t buflen;
+	union skb_shared_tx *shtx;
 
 	rx_queue = priv->rx_queue[tx_queue->qindex];
 	bdp = tx_queue->dirty_tx;
@@ -2330,7 +2375,18 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 		unsigned long flags;
 
 		frags = skb_shinfo(skb)->nr_frags;
-		lbdp = skip_txbd(bdp, frags, base, tx_ring_size);
+
+		/*
+		 * When time stamping, one additional TxBD must be freed.
+		 * Also, we need to dma_unmap_single() the TxPAL.
+		 */
+		shtx = skb_tx(skb);
+		if (unlikely(shtx->in_progress))
+			nr_txbds = frags + 2;
+		else
+			nr_txbds = frags + 1;
+
+		lbdp = skip_txbd(bdp, nr_txbds - 1, base, tx_ring_size);
 
 		lstatus = lbdp->lstatus;
 
@@ -2339,10 +2395,24 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 				(lstatus & BD_LENGTH_MASK))
 			break;
 
-		dma_unmap_single(&priv->ofdev->dev,
-				bdp->bufPtr,
-				bdp->length,
-				DMA_TO_DEVICE);
+		if (unlikely(shtx->in_progress)) {
+			next = next_txbd(bdp, base, tx_ring_size);
+			buflen = next->length + GMAC_FCB_LEN;
+		} else
+			buflen = bdp->length;
+
+		dma_unmap_single(&priv->ofdev->dev, bdp->bufPtr,
+				buflen, DMA_TO_DEVICE);
+
+		if (unlikely(shtx->in_progress)) {
+			struct skb_shared_hwtstamps shhwtstamps;
+			u64 *ns = (u64*) (((u32)skb->data + 0x10) & ~0x7);
+			memset(&shhwtstamps, 0, sizeof(shhwtstamps));
+			shhwtstamps.hwtstamp = ns_to_ktime(*ns);
+			skb_tstamp_tx(skb, &shhwtstamps);
+			bdp->lstatus &= BD_LFLAG(TXBD_WRAP);
+			bdp = next;
+		}
 
 		bdp->lstatus &= BD_LFLAG(TXBD_WRAP);
 		bdp = next_txbd(bdp, base, tx_ring_size);
@@ -2374,7 +2444,7 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 
 		howmany++;
 		spin_lock_irqsave(&tx_queue->txlock, flags);
-		tx_queue->num_txbdfree += frags + 1;
+		tx_queue->num_txbdfree += nr_txbds;
 		spin_unlock_irqrestore(&tx_queue->txlock, flags);
 	}
 
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index cba2756..baed05c 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -540,7 +540,7 @@ struct txbd8
 
 struct txfcb {
 	u8	flags;
-	u8	reserved;
+	u8	ptp;    /* Flag to enable tx timestamping */
 	u8	l4os;	/* Level 4 Header Offset */
 	u8	l3os; 	/* Level 3 Header Offset */
 	u16	phcs;	/* Pseudo-header Checksum */
-- 
1.6.3.3

^ permalink raw reply related

* Re: [RFC PATCH net-next 1/4] gianfar: Added stub support for SIOCSHWTSTAMP
From: David Miller @ 2010-04-07 10:26 UTC (permalink / raw)
  To: Manfred.Rudigier; +Cc: sandeep.kumar, netdev, linuxppc-dev
In-Reply-To: <95DC1AA8EC908B48939B72CF375AA5E311A9F4DF@alice.at.omicron.at>

From: Manfred Rudigier <Manfred.Rudigier@omicron.at>
Date: Wed, 7 Apr 2010 11:46:08 +0200

> This ioctl command is required for enabling hardware time stamping support
> for network packets, see Documentation/networking/timestamping.txt. At the
> moment nothing will be done for all requests that enable time stamping and
> thus ERANGE will be returned.
> 
> Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>

This is completely pointless.

Something sane should happen so that every driver doesn't
need to add this stub code just to return -ERANGE.

^ permalink raw reply

* [PATCH 0/4] caching bundles, iteration 5
From: Timo Teras @ 2010-04-07 10:30 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

Changes:
- ops->delete() is now called if flow_cache_genid was changed to
  ensure that resolver does not use stale data
- removed bumping of policy->genid when inserting new policy since
  flow_cache_genid ensures everything is regenerated (thanks Herbert!)
- added unlikely/likely to flow_cache_lookup to favor fast path
  (cache hit)
- added herbert's ack to 1/4 

Compiles, boots and VPN goes up on my test box. Earlier iterations
tested to stay up 3+ days without noticing leaks or other problems.

Timo Teras (4):
  flow: virtualize flow cache entry methods
  xfrm: cache bundles instead of policies for outgoing flows
  xfrm: remove policy garbage collection
  flow: delayed deletion of flow cache entries

 include/net/flow.h      |   23 ++-
 include/net/xfrm.h      |   12 +-
 net/core/flow.c         |  212 ++++++++-----
 net/ipv4/xfrm4_policy.c |   22 --
 net/ipv6/xfrm6_policy.c |   31 --
 net/xfrm/xfrm_policy.c  |  820 +++++++++++++++++++++++++----------------------
 6 files changed, 591 insertions(+), 529 deletions(-)


^ permalink raw reply

* [PATCH 1/4] flow: virtualize flow cache entry methods
From: Timo Teras @ 2010-04-07 10:30 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270636207-20933-1-git-send-email-timo.teras@iki.fi>

This allows to validate the cached object before returning it.
It also allows to destruct object properly, if the last reference
was held in flow cache. This is also a prepartion for caching
bundles in the flow cache.

In return for virtualizing the methods, we save on:
- not having to regenerate the whole flow cache on policy removal:
  each flow matching a killed policy gets refreshed as the getter
  function notices it smartly.
- we do not have to call flow_cache_flush from policy gc, since the
  flow cache now properly deletes the object if it had any references

Signed-off-by: Timo Teras <timo.teras@iki.fi>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/net/flow.h     |   23 +++++++--
 include/net/xfrm.h     |    2 +
 net/core/flow.c        |  128 +++++++++++++++++++++++++----------------------
 net/xfrm/xfrm_policy.c |  112 ++++++++++++++++++++++++++++--------------
 4 files changed, 163 insertions(+), 102 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..bb08692 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -86,11 +86,26 @@ struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
+struct flow_cache_ops;
+
+struct flow_cache_object {
+	const struct flow_cache_ops *ops;
+};
+
+struct flow_cache_ops {
+	struct flow_cache_object *(*get)(struct flow_cache_object *);
+	int (*check)(struct flow_cache_object *);
+	void (*delete)(struct flow_cache_object *);
+};
+
+typedef struct flow_cache_object *(*flow_resolve_t)(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, struct flow_cache_object *oldobj, void *ctx);
+
+extern struct flow_cache_object *flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, flow_resolve_t resolver, void *ctx);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..35396e2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -19,6 +19,7 @@
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/flow.h>
 
 #include <linux/interrupt.h>
 
@@ -481,6 +482,7 @@ struct xfrm_policy {
 	atomic_t		refcnt;
 	struct timer_list	timer;
 
+	struct flow_cache_object flo;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
diff --git a/net/core/flow.c b/net/core/flow.c
index 1d27ca6..521df52 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,17 +26,16 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	struct flow_cache_entry		*next;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_object	*object;
 };
 
 struct flow_cache_percpu {
-	struct flow_cache_entry **	hash_table;
+	struct flow_cache_entry		**hash_table;
 	int				hash_count;
 	u32				hash_rnd;
 	int				hash_rnd_recalc;
@@ -44,7 +43,7 @@ struct flow_cache_percpu {
 };
 
 struct flow_flush_info {
-	struct flow_cache *		cache;
+	struct flow_cache		*cache;
 	atomic_t			cpuleft;
 	struct completion		completion;
 };
@@ -52,7 +51,7 @@ struct flow_flush_info {
 struct flow_cache {
 	u32				hash_shift;
 	unsigned long			order;
-	struct flow_cache_percpu *	percpu;
+	struct flow_cache_percpu	*percpu;
 	struct notifier_block		hotcpu_notifier;
 	int				low_watermark;
 	int				high_watermark;
@@ -78,12 +77,21 @@ static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->object && !fle->object->ops->check(fle->object))
+		return 0;
+	return 1;
+}
+
 static void flow_entry_kill(struct flow_cache *fc,
 			    struct flow_cache_percpu *fcp,
 			    struct flow_cache_entry *fle)
 {
 	if (fle->object)
-		atomic_dec(fle->object_ref);
+		fle->object->ops->delete(fle->object);
 	kmem_cache_free(flow_cachep, fle);
 	fcp->hash_count--;
 }
@@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 	int i;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		int k = 0;
+		int saved = 0;
 
 		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
 		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(fc, fcp, fle);
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+				flp = &fle->next;
+			} else {
+				*flp = fle->next;
+				flow_entry_kill(fc, fcp, fle);
+			}
 		}
 	}
 }
@@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
+		  flow_resolve_t resolver, void *ctx)
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_object *flo;
 	unsigned int hash;
 
 	local_bh_disable();
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
+	flo = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
 	if (!fcp->hash_table)
@@ -185,27 +198,17 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 
 	if (fcp->hash_rnd_recalc)
 		flow_new_hash_rnd(fc, fcp);
-	hash = flow_hash_code(fc, fcp, key);
 
+	hash = flow_hash_code(fc, fcp, key);
 	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+		    flow_key_compare(key, &fle->key) == 0)
 			break;
-		}
 	}
 
-	if (!fle) {
+	if (unlikely(!fle)) {
 		if (fcp->hash_count > fc->high_watermark)
 			flow_cache_shrink(fc, fcp);
 
@@ -219,33 +222,39 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			fle->object = NULL;
 			fcp->hash_count++;
 		}
+	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+		flo = fle->object;
+		if (!flo)
+			goto ret_object;
+		flo = flo->ops->get(flo);
+		if (flo)
+			goto ret_object;
+	} else if (fle->object) {
+	        flo = fle->object;
+	        flo->ops->delete(flo);
+	        fle->object = NULL;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
-		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	flo = NULL;
+	if (fle) {
+		flo = fle->object;
+		fle->object = NULL;
+	}
+	flo = resolver(net, key, family, dir, flo, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (!IS_ERR(flo))
+			fle->object = flo;
+		else
+			fle->genid--;
+	} else {
+		if (flo && !IS_ERR(flo))
+			flo->ops->delete(flo);
 	}
+ret_object:
+	local_bh_enable();
+	return flo;
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
@@ -261,13 +270,12 @@ static void flow_cache_flush_tasklet(unsigned long data)
 
 		fle = fcp->hash_table[i];
 		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+			if (flow_entry_valid(fle))
 				continue;
 
+			if (fle->object)
+				fle->object->ops->delete(fle->object);
 			fle->object = NULL;
-			atomic_dec(fle->object_ref);
 		}
 	}
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 82789cf..7722bae 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -216,6 +216,35 @@ expired:
 	xfrm_pol_put(xp);
 }
 
+static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	if (unlikely(pol->walk.dead))
+		flo = NULL;
+	else
+		xfrm_pol_hold(pol);
+
+	return flo;
+}
+
+static int xfrm_policy_flo_check(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	return !pol->walk.dead;
+}
+
+static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
+{
+	xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
+}
+
+static const struct flow_cache_ops xfrm_policy_fc_ops = {
+	.get = xfrm_policy_flo_get,
+	.check = xfrm_policy_flo_check,
+	.delete = xfrm_policy_flo_delete,
+};
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -236,6 +265,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 		atomic_set(&policy->refcnt, 1);
 		setup_timer(&policy->timer, xfrm_policy_timer,
 				(unsigned long)policy);
+		policy->flo.ops = &xfrm_policy_fc_ops;
 	}
 	return policy;
 }
@@ -269,9 +299,6 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
@@ -661,10 +688,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -703,10 +728,8 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
@@ -822,7 +845,6 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -976,32 +998,35 @@ fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct flow_cache_object *
+xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
+		   u8 dir, struct flow_cache_object *old_obj, void *ctx)
 {
 	struct xfrm_policy *pol;
-	int err = 0;
+
+	if (old_obj)
+		xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
 
 #ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
 #endif
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
+	return NULL;
+
+found:
+	/* Resolver returns two references:
+	 * one for cache and one for caller of flow_cache_lookup() */
+	xfrm_pol_hold(pol);
+
+	return &pol->flo;
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1091,8 +1116,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
-		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1578,18 +1601,24 @@ restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_object *flo;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
+		flo = flow_cache_lookup(net, fl, dst_orig->ops->family,
+					dir, xfrm_policy_lookup, NULL);
+		err = PTR_ERR(flo);
+		if (IS_ERR(flo)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 			goto dropdst;
 		}
+		if (flo)
+			policy = container_of(flo, struct xfrm_policy, flo);
+		else
+			policy = NULL;
 	}
 
 	if (!policy)
@@ -1939,9 +1968,16 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
+	if (!pol) {
+		struct flow_cache_object *flo;
+
+		flo = flow_cache_lookup(net, &fl, family, fl_dir,
+					xfrm_policy_lookup, NULL);
+		if (IS_ERR_OR_NULL(flo))
+			pol = ERR_CAST(flo);
+		else
+			pol = container_of(flo, struct xfrm_policy, flo);
+	}
 
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 2/4] xfrm: cache bundles instead of policies for outgoing flows
From: Timo Teras @ 2010-04-07 10:30 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270636207-20933-1-git-send-email-timo.teras@iki.fi>

__xfrm_lookup() is called for each packet transmitted out of
system. The xfrm_find_bundle() does a linear search which can
kill system performance depending on how many bundles are
required per policy.

This modifies __xfrm_lookup() to store bundles directly in
the flow cache. If we did not get a hit, we just create a new
bundle instead of doing slow search. This means that we can now
get multiple xfrm_dst's for same flow (on per-cpu basis).

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 include/net/xfrm.h      |   10 +-
 net/ipv4/xfrm4_policy.c |   22 --
 net/ipv6/xfrm6_policy.c |   31 --
 net/xfrm/xfrm_policy.c  |  711 +++++++++++++++++++++++++----------------------
 4 files changed, 383 insertions(+), 391 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 35396e2..625dd61 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -267,7 +267,6 @@ struct xfrm_policy_afinfo {
 					       xfrm_address_t *saddr,
 					       xfrm_address_t *daddr);
 	int			(*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr);
-	struct dst_entry	*(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy);
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
@@ -483,13 +482,13 @@ struct xfrm_policy {
 	struct timer_list	timer;
 
 	struct flow_cache_object flo;
+	atomic_t		genid;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
 	struct xfrm_selector	selector;
 	struct xfrm_lifetime_cfg lft;
 	struct xfrm_lifetime_cur curlft;
-	struct dst_entry       *bundles;
 	struct xfrm_policy_walk_entry walk;
 	u8			type;
 	u8			action;
@@ -879,11 +878,15 @@ struct xfrm_dst {
 		struct rt6_info		rt6;
 	} u;
 	struct dst_entry *route;
+	struct flow_cache_object flo;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	int num_pols, num_xfrms;
 #ifdef CONFIG_XFRM_SUB_POLICY
 	struct flowi *origin;
 	struct xfrm_selector *partner;
 #endif
-	u32 genid;
+	u32 xfrm_genid;
+	u32 policy_genid;
 	u32 route_mtu_cached;
 	u32 child_mtu_cached;
 	u32 route_cookie;
@@ -893,6 +896,7 @@ struct xfrm_dst {
 #ifdef CONFIG_XFRM
 static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 {
+	xfrm_pols_put(xdst->pols, xdst->num_pols);
 	dst_release(xdst->route);
 	if (likely(xdst->u.dst.xfrm))
 		xfrm_state_put(xdst->u.dst.xfrm);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e4a1483..1705476 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -59,27 +59,6 @@ static int xfrm4_get_saddr(struct net *net,
 	return 0;
 }
 
-static struct dst_entry *
-__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
-{
-	struct dst_entry *dst;
-
-	read_lock_bh(&policy->lock);
-	for (dst = policy->bundles; dst; dst = dst->next) {
-		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-		if (xdst->u.rt.fl.oif == fl->oif &&	/*XXX*/
-		    xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
-		    xdst->u.rt.fl.fl4_src == fl->fl4_src &&
-		    xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
-		    xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
-			dst_clone(dst);
-			break;
-		}
-	}
-	read_unlock_bh(&policy->lock);
-	return dst;
-}
-
 static int xfrm4_get_tos(struct flowi *fl)
 {
 	return fl->fl4_tos;
@@ -259,7 +238,6 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.dst_ops =		&xfrm4_dst_ops,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
-	.find_bundle = 		__xfrm4_find_bundle,
 	.decode_session =	_decode_session4,
 	.get_tos =		xfrm4_get_tos,
 	.init_path =		xfrm4_init_path,
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ae18165..8c452fd 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -67,36 +67,6 @@ static int xfrm6_get_saddr(struct net *net,
 	return 0;
 }
 
-static struct dst_entry *
-__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
-{
-	struct dst_entry *dst;
-
-	/* Still not clear if we should set fl->fl6_{src,dst}... */
-	read_lock_bh(&policy->lock);
-	for (dst = policy->bundles; dst; dst = dst->next) {
-		struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
-		struct in6_addr fl_dst_prefix, fl_src_prefix;
-
-		ipv6_addr_prefix(&fl_dst_prefix,
-				 &fl->fl6_dst,
-				 xdst->u.rt6.rt6i_dst.plen);
-		ipv6_addr_prefix(&fl_src_prefix,
-				 &fl->fl6_src,
-				 xdst->u.rt6.rt6i_src.plen);
-		if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) &&
-		    ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) &&
-		    xfrm_bundle_ok(policy, xdst, fl, AF_INET6,
-				   (xdst->u.rt6.rt6i_dst.plen != 128 ||
-				    xdst->u.rt6.rt6i_src.plen != 128))) {
-			dst_clone(dst);
-			break;
-		}
-	}
-	read_unlock_bh(&policy->lock);
-	return dst;
-}
-
 static int xfrm6_get_tos(struct flowi *fl)
 {
 	return 0;
@@ -291,7 +261,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.dst_ops =		&xfrm6_dst_ops,
 	.dst_lookup =		xfrm6_dst_lookup,
 	.get_saddr = 		xfrm6_get_saddr,
-	.find_bundle =		__xfrm6_find_bundle,
 	.decode_session =	_decode_session6,
 	.get_tos =		xfrm6_get_tos,
 	.init_path =		xfrm6_init_path,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7722bae..06ccc71 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -37,6 +37,8 @@
 DEFINE_MUTEX(xfrm_cfg_mutex);
 EXPORT_SYMBOL(xfrm_cfg_mutex);
 
+static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock);
+static struct dst_entry *xfrm_policy_sk_bundles;
 static DEFINE_RWLOCK(xfrm_policy_lock);
 
 static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
@@ -50,6 +52,7 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
 static void xfrm_init_pmtu(struct dst_entry *dst);
+static int stale_bundle(struct dst_entry *dst);
 
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
@@ -277,8 +280,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 {
 	BUG_ON(!policy->walk.dead);
 
-	BUG_ON(policy->bundles);
-
 	if (del_timer(&policy->timer))
 		BUG();
 
@@ -289,12 +290,7 @@ EXPORT_SYMBOL(xfrm_policy_destroy);
 
 static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 {
-	struct dst_entry *dst;
-
-	while ((dst = policy->bundles) != NULL) {
-		policy->bundles = dst->next;
-		dst_free(dst);
-	}
+	atomic_inc(&policy->genid);
 
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
@@ -572,7 +568,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	struct xfrm_policy *delpol;
 	struct hlist_head *chain;
 	struct hlist_node *entry, *newpos;
-	struct dst_entry *gc_list;
 	u32 mark = policy->mark.v & policy->mark.m;
 
 	write_lock_bh(&xfrm_policy_lock);
@@ -622,34 +617,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	else if (xfrm_bydst_should_resize(net, dir, NULL))
 		schedule_work(&net->xfrm.policy_hash_work);
 
-	read_lock_bh(&xfrm_policy_lock);
-	gc_list = NULL;
-	entry = &policy->bydst;
-	hlist_for_each_entry_continue(policy, entry, bydst) {
-		struct dst_entry *dst;
-
-		write_lock(&policy->lock);
-		dst = policy->bundles;
-		if (dst) {
-			struct dst_entry *tail = dst;
-			while (tail->next)
-				tail = tail->next;
-			tail->next = gc_list;
-			gc_list = dst;
-
-			policy->bundles = NULL;
-		}
-		write_unlock(&policy->lock);
-	}
-	read_unlock_bh(&xfrm_policy_lock);
-
-	while (gc_list) {
-		struct dst_entry *dst = gc_list;
-
-		gc_list = dst->next;
-		dst_free(dst);
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
@@ -998,6 +965,19 @@ fail:
 	return ret;
 }
 
+static struct xfrm_policy *
+__xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_policy *pol;
+
+	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+	if (pol != NULL)
+		return pol;
+#endif
+	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+}
+
 static struct flow_cache_object *
 xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
 		   u8 dir, struct flow_cache_object *old_obj, void *ctx)
@@ -1007,21 +987,10 @@ xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
 	if (old_obj)
 		xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
 
-#ifdef CONFIG_XFRM_SUB_POLICY
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol))
+	pol = __xfrm_policy_lookup(net, fl, family, dir);
+	if (IS_ERR_OR_NULL(pol))
 		return ERR_CAST(pol);
-	if (pol)
-		goto found;
-#endif
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol))
-		return ERR_CAST(pol);
-	if (pol)
-		goto found;
-	return NULL;
 
-found:
 	/* Resolver returns two references:
 	 * one for cache and one for caller of flow_cache_lookup() */
 	xfrm_pol_hold(pol);
@@ -1313,18 +1282,6 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
  * still valid.
  */
 
-static struct dst_entry *
-xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
-{
-	struct dst_entry *x;
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	if (unlikely(afinfo == NULL))
-		return ERR_PTR(-EINVAL);
-	x = afinfo->find_bundle(fl, policy);
-	xfrm_policy_put_afinfo(afinfo);
-	return x;
-}
-
 static inline int xfrm_get_tos(struct flowi *fl, int family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
@@ -1340,6 +1297,54 @@ static inline int xfrm_get_tos(struct flowi *fl, int family)
 	return tos;
 }
 
+static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
+{
+	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	if (xdst->route == NULL) {
+		/* Dummy bundle - if it has xfrms we were not
+		 * able to build bundle as template resolution failed.
+		 * It means we need to try again resolving. */
+		if (xdst->num_xfrms > 0)
+			return NULL;
+	} else {
+		/* Real bundle */
+		if (stale_bundle(dst))
+			return NULL;
+	}
+
+	dst_hold(dst);
+	return flo;
+}
+
+static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
+{
+	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	if (!xdst->route)
+		return 0;
+	if (stale_bundle(dst))
+		return 0;
+
+	return 1;
+}
+
+static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
+{
+	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	dst_free(dst);
+}
+
+static const struct flow_cache_ops xfrm_bundle_fc_ops = {
+	.get = xfrm_bundle_flo_get,
+	.check = xfrm_bundle_flo_check,
+	.delete = xfrm_bundle_flo_delete,
+};
+
 static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
@@ -1362,9 +1367,10 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 		BUG();
 	}
 	xdst = dst_alloc(dst_ops) ?: ERR_PTR(-ENOBUFS);
-
 	xfrm_policy_put_afinfo(afinfo);
 
+	xdst->flo.ops = &xfrm_bundle_fc_ops;
+
 	return xdst;
 }
 
@@ -1402,6 +1408,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	return err;
 }
 
+
 /* Allocate chain of dst_entry's, attach known xfrm's, calculate
  * all the metrics... Shortly, bundle a bundle.
  */
@@ -1465,7 +1472,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 			dst_hold(dst);
 
 		dst1->xfrm = xfrm[i];
-		xdst->genid = xfrm[i]->genid;
+		xdst->xfrm_genid = xfrm[i]->genid;
 
 		dst1->obsolete = -1;
 		dst1->flags |= DST_HOST;
@@ -1558,7 +1565,186 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
 #endif
 }
 
-static int stale_bundle(struct dst_entry *dst);
+static int xfrm_expand_policies(struct flowi *fl, u16 family,
+				struct xfrm_policy **pols,
+				int *num_pols, int *num_xfrms)
+{
+	int i;
+
+	if (*num_pols == 0 || !pols[0]) {
+		*num_pols = 0;
+		*num_xfrms = 0;
+		return 0;
+	}
+	if (IS_ERR(pols[0]))
+		return PTR_ERR(pols[0]);
+
+	*num_xfrms = pols[0]->xfrm_nr;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW &&
+	    pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
+						    XFRM_POLICY_TYPE_MAIN,
+						    fl, family,
+						    XFRM_POLICY_OUT);
+		if (pols[1]) {
+			if (IS_ERR(pols[1])) {
+				xfrm_pols_put(pols, *num_pols);
+				return PTR_ERR(pols[1]);
+			}
+			(*num_pols) ++;
+			(*num_xfrms) += pols[1]->xfrm_nr;
+		}
+	}
+#endif
+	for (i = 0; i < *num_pols; i++) {
+		if (pols[i]->action != XFRM_POLICY_ALLOW) {
+			*num_xfrms = -1;
+			break;
+		}
+	}
+
+	return 0;
+
+}
+
+static struct xfrm_dst *
+xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
+			       struct flowi *fl, u16 family,
+			       struct dst_entry *dst_orig)
+{
+	struct net *net = xp_net(pols[0]);
+	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+	struct dst_entry *dst;
+	struct xfrm_dst *xdst;
+	int err;
+
+	/* Try to instantiate a bundle */
+	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
+	if (err < 0) {
+		if (err != -EAGAIN)
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		return ERR_PTR(err);
+	}
+
+	dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
+	if (IS_ERR(dst)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
+		return ERR_CAST(dst);
+	}
+
+	xdst = (struct xfrm_dst *)dst;
+	xdst->num_xfrms = err;
+	if (num_pols > 1)
+		err = xfrm_dst_update_parent(dst, &pols[1]->selector);
+	else
+		err = xfrm_dst_update_origin(dst, fl);
+	if (unlikely(err)) {
+		dst_free(dst);
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
+		return ERR_PTR(err);
+	}
+
+	xdst->num_pols = num_pols;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+	xdst->policy_genid = atomic_read(&pols[0]->genid);
+
+	return xdst;
+}
+
+static struct flow_cache_object *
+xfrm_bundle_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir,
+		   struct flow_cache_object *oldflo, void *ctx)
+{
+	struct dst_entry *dst_orig = (struct dst_entry *)ctx;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	struct xfrm_dst *xdst, *new_xdst;
+	int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
+
+	/* Check if the policies from old bundle are usable */
+	xdst = NULL;
+	if (oldflo) {
+		xdst = container_of(oldflo, struct xfrm_dst, flo);
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		pol_dead = 0;
+		for (i = 0; i < num_pols; i++) {
+			pols[i] = xdst->pols[i];
+			pol_dead |= pols[i]->walk.dead;
+		}
+		if (pol_dead) {
+			dst_free(&xdst->u.dst);
+			xdst = NULL;
+			num_pols = 0;
+			num_xfrms = 0;
+			oldflo = NULL;
+		}
+	}
+
+	/* Resolve policies to use if we couldn't get them from
+	 * previous cache entry */
+	if (xdst == NULL) {
+		num_pols = 1;
+		pols[0] = __xfrm_policy_lookup(net, fl, family, dir);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
+			goto inc_error;
+		if (num_pols == 0)
+			return NULL;
+		if (num_xfrms <= 0)
+			goto make_dummy_bundle;
+	}
+
+	new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig);
+	if (IS_ERR(new_xdst)) {
+		err = PTR_ERR(new_xdst);
+		if (err != -EAGAIN)
+			goto error;
+		if (oldflo == NULL)
+			goto make_dummy_bundle;
+		dst_hold(&xdst->u.dst);
+		return oldflo;
+	}
+
+	/* Kill the previous bundle */
+	if (xdst) {
+		/* The policies were stolen for newly generated bundle */
+		xdst->num_pols = 0;
+		dst_free(&xdst->u.dst);
+	}
+
+	/* Flow cache does not have reference, it dst_free()'s,
+	 * but we do need to return one reference for original caller */
+	dst_hold(&new_xdst->u.dst);
+	return &new_xdst->flo;
+
+make_dummy_bundle:
+	/* We found policies, but there's no bundles to instantiate:
+	 * either because the policy blocks, has no transformations or
+	 * we could not build template (no xfrm_states).*/
+	xdst = xfrm_alloc_dst(net, family);
+	if (IS_ERR(xdst)) {
+		xfrm_pols_put(pols, num_pols);
+		return ERR_CAST(xdst);
+	}
+	xdst->num_pols = num_pols;
+	xdst->num_xfrms = num_xfrms;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+
+	dst_hold(&xdst->u.dst);
+	return &xdst->flo;
+
+inc_error:
+	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+error:
+	if (xdst != NULL)
+		dst_free(&xdst->u.dst);
+	else
+		xfrm_pols_put(pols, num_pols);
+	return ERR_PTR(err);
+}
 
 /* Main function: finds/creates a bundle for given flow.
  *
@@ -1568,248 +1754,152 @@ static int stale_bundle(struct dst_entry *dst);
 int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
 		  struct sock *sk, int flags)
 {
-	struct xfrm_policy *policy;
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
-	int npols;
-	int pol_dead;
-	int xfrm_nr;
-	int pi;
-	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
-	struct dst_entry *dst, *dst_orig = *dst_p;
-	int nx = 0;
-	int err;
-	u32 genid;
-	u16 family;
+	struct flow_cache_object *flo;
+	struct xfrm_dst *xdst;
+	struct dst_entry *dst, *dst_orig = *dst_p, *route;
+	u16 family = dst_orig->ops->family;
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+	int i, err, num_pols, num_xfrms, drop_pols = 0;
 
 restart:
-	genid = atomic_read(&flow_cache_genid);
-	policy = NULL;
-	for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
-		pols[pi] = NULL;
-	npols = 0;
-	pol_dead = 0;
-	xfrm_nr = 0;
+	dst = NULL;
+	xdst = NULL;
+	route = NULL;
 
 	if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
-		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		num_pols = 1;
+		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
 			goto dropdst;
+
+		if (num_pols) {
+			if (num_xfrms <= 0) {
+				drop_pols = num_pols;
+				goto no_transform;
+			}
+
+			xdst = xfrm_resolve_and_create_bundle(
+					pols, num_pols, fl,
+					family, dst_orig);
+			if (IS_ERR(xdst)) {
+				xfrm_pols_put(pols, num_pols);
+				err = PTR_ERR(xdst);
+				goto dropdst;
+			}
+
+			spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+			xdst->u.dst.next = xfrm_policy_sk_bundles;
+			xfrm_policy_sk_bundles = &xdst->u.dst;
+			spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
+
+			route = xdst->route;
 		}
 	}
 
-	if (!policy) {
-		struct flow_cache_object *flo;
-
+	if (xdst == NULL) {
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		flo = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					dir, xfrm_policy_lookup, NULL);
-		err = PTR_ERR(flo);
+		flo = flow_cache_lookup(net, fl, family, dir,
+					xfrm_bundle_lookup, dst_orig);
+		if (flo == NULL)
+			goto nopol;
 		if (IS_ERR(flo)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+			err = PTR_ERR(flo);
 			goto dropdst;
 		}
-		if (flo)
-			policy = container_of(flo, struct xfrm_policy, flo);
-		else
-			policy = NULL;
+		xdst = container_of(flo, struct xfrm_dst, flo);
+
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols);
+		route = xdst->route;
+	}
+
+	dst = &xdst->u.dst;
+	if (route == NULL && num_xfrms > 0) {
+		/* The only case when xfrm_bundle_lookup() returns a
+		 * bundle with null route, is when the template could
+		 * not be resolved. It means policies are there, but
+		 * bundle could not be created, since we don't yet
+		 * have the xfrm_state's. We need to wait for KM to
+		 * negotiate new SA's or bail out with error.*/
+		if (net->xfrm.sysctl_larval_drop) {
+			/* EREMOTE tells the caller to generate
+			 * a one-shot blackhole route. */
+			dst_release(dst);
+			xfrm_pols_put(pols, num_pols);
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+			return -EREMOTE;
+		}
+		if (flags & XFRM_LOOKUP_WAIT) {
+			DECLARE_WAITQUEUE(wait, current);
+
+			add_wait_queue(&net->xfrm.km_waitq, &wait);
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+			set_current_state(TASK_RUNNING);
+			remove_wait_queue(&net->xfrm.km_waitq, &wait);
+
+			if (!signal_pending(current)) {
+				dst_release(dst);
+				goto restart;
+			}
+
+			err = -ERESTART;
+		} else
+			err = -EAGAIN;
+
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+		goto error;
 	}
 
-	if (!policy)
+no_transform:
+	if (num_pols == 0)
 		goto nopol;
 
-	family = dst_orig->ops->family;
-	pols[0] = policy;
-	npols ++;
-	xfrm_nr += pols[0]->xfrm_nr;
-
-	err = -ENOENT;
-	if ((flags & XFRM_LOOKUP_ICMP) && !(policy->flags & XFRM_POLICY_ICMP))
+	if ((flags & XFRM_LOOKUP_ICMP) &&
+	    !(pols[0]->flags & XFRM_POLICY_ICMP)) {
+		err = -ENOENT;
 		goto error;
+	}
 
-	policy->curlft.use_time = get_seconds();
+	for (i = 0; i < num_pols; i++)
+		pols[i]->curlft.use_time = get_seconds();
 
-	switch (policy->action) {
-	default:
-	case XFRM_POLICY_BLOCK:
+	if (num_xfrms < 0) {
 		/* Prohibit the flow */
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
 		err = -EPERM;
 		goto error;
-
-	case XFRM_POLICY_ALLOW:
-#ifndef CONFIG_XFRM_SUB_POLICY
-		if (policy->xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pol_put(policy);
-			return 0;
-		}
-#endif
-
-		/* Try to find matching bundle.
-		 *
-		 * LATER: help from flow cache. It is optional, this
-		 * is required only for output policy.
-		 */
-		dst = xfrm_find_bundle(fl, policy, family);
-		if (IS_ERR(dst)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			err = PTR_ERR(dst);
-			goto error;
-		}
-
-		if (dst)
-			break;
-
-#ifdef CONFIG_XFRM_SUB_POLICY
-		if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
-			pols[1] = xfrm_policy_lookup_bytype(net,
-							    XFRM_POLICY_TYPE_MAIN,
-							    fl, family,
-							    XFRM_POLICY_OUT);
-			if (pols[1]) {
-				if (IS_ERR(pols[1])) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
-					err = PTR_ERR(pols[1]);
-					goto error;
-				}
-				if (pols[1]->action == XFRM_POLICY_BLOCK) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
-					err = -EPERM;
-					goto error;
-				}
-				npols ++;
-				xfrm_nr += pols[1]->xfrm_nr;
-			}
-		}
-
-		/*
-		 * Because neither flowi nor bundle information knows about
-		 * transformation template size. On more than one policy usage
-		 * we can realize whether all of them is bypass or not after
-		 * they are searched. See above not-transformed bypass
-		 * is surrounded by non-sub policy configuration, too.
-		 */
-		if (xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
-#endif
-		nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
-
-		if (unlikely(nx<0)) {
-			err = nx;
-			if (err == -EAGAIN && net->xfrm.sysctl_larval_drop) {
-				/* EREMOTE tells the caller to generate
-				 * a one-shot blackhole route.
-				 */
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-				xfrm_pol_put(policy);
-				return -EREMOTE;
-			}
-			if (err == -EAGAIN && (flags & XFRM_LOOKUP_WAIT)) {
-				DECLARE_WAITQUEUE(wait, current);
-
-				add_wait_queue(&net->xfrm.km_waitq, &wait);
-				set_current_state(TASK_INTERRUPTIBLE);
-				schedule();
-				set_current_state(TASK_RUNNING);
-				remove_wait_queue(&net->xfrm.km_waitq, &wait);
-
-				nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
-
-				if (nx == -EAGAIN && signal_pending(current)) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-					err = -ERESTART;
-					goto error;
-				}
-				if (nx == -EAGAIN ||
-				    genid != atomic_read(&flow_cache_genid)) {
-					xfrm_pols_put(pols, npols);
-					goto restart;
-				}
-				err = nx;
-			}
-			if (err < 0) {
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-				goto error;
-			}
-		}
-		if (nx == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
-		dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig);
-		err = PTR_ERR(dst);
-		if (IS_ERR(dst)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
-			goto error;
-		}
-
-		for (pi = 0; pi < npols; pi++)
-			pol_dead |= pols[pi]->walk.dead;
-
-		write_lock_bh(&policy->lock);
-		if (unlikely(pol_dead || stale_bundle(dst))) {
-			/* Wow! While we worked on resolving, this
-			 * policy has gone. Retry. It is not paranoia,
-			 * we just cannot enlist new bundle to dead object.
-			 * We can't enlist stable bundles either.
-			 */
-			write_unlock_bh(&policy->lock);
-			dst_free(dst);
-
-			if (pol_dead)
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLDEAD);
-			else
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			err = -EHOSTUNREACH;
-			goto error;
-		}
-
-		if (npols > 1)
-			err = xfrm_dst_update_parent(dst, &pols[1]->selector);
-		else
-			err = xfrm_dst_update_origin(dst, fl);
-		if (unlikely(err)) {
-			write_unlock_bh(&policy->lock);
-			dst_free(dst);
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			goto error;
-		}
-
-		dst->next = policy->bundles;
-		policy->bundles = dst;
-		dst_hold(dst);
-		write_unlock_bh(&policy->lock);
+	} else if (num_xfrms > 0) {
+		/* Flow transformed */
+		*dst_p = dst;
+		dst_release(dst_orig);
+	} else {
+		/* Flow passes untransformed */
+		dst_release(dst);
 	}
-	*dst_p = dst;
-	dst_release(dst_orig);
-	xfrm_pols_put(pols, npols);
+ok:
+	xfrm_pols_put(pols, drop_pols);
 	return 0;
 
+nopol:
+	if (!(flags & XFRM_LOOKUP_ICMP))
+		goto ok;
+	err = -ENOENT;
 error:
-	xfrm_pols_put(pols, npols);
+	dst_release(dst);
 dropdst:
 	dst_release(dst_orig);
 	*dst_p = NULL;
+	xfrm_pols_put(pols, drop_pols);
 	return err;
-
-nopol:
-	err = -ENOENT;
-	if (flags & XFRM_LOOKUP_ICMP)
-		goto dropdst;
-	return 0;
 }
 EXPORT_SYMBOL(__xfrm_lookup);
 
@@ -2161,71 +2251,24 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
 	return dst;
 }
 
-static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p)
-{
-	struct dst_entry *dst, **dstp;
-
-	write_lock(&pol->lock);
-	dstp = &pol->bundles;
-	while ((dst=*dstp) != NULL) {
-		if (func(dst)) {
-			*dstp = dst->next;
-			dst->next = *gc_list_p;
-			*gc_list_p = dst;
-		} else {
-			dstp = &dst->next;
-		}
-	}
-	write_unlock(&pol->lock);
-}
-
-static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *))
+static void __xfrm_garbage_collect(struct net *net)
 {
-	struct dst_entry *gc_list = NULL;
-	int dir;
+	struct dst_entry *head, *next;
 
-	read_lock_bh(&xfrm_policy_lock);
-	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
-		struct xfrm_policy *pol;
-		struct hlist_node *entry;
-		struct hlist_head *table;
-		int i;
+	flow_cache_flush();
 
-		hlist_for_each_entry(pol, entry,
-				     &net->xfrm.policy_inexact[dir], bydst)
-			prune_one_bundle(pol, func, &gc_list);
+	spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+	head = xfrm_policy_sk_bundles;
+	xfrm_policy_sk_bundles = NULL;
+	spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
 
-		table = net->xfrm.policy_bydst[dir].table;
-		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
-			hlist_for_each_entry(pol, entry, table + i, bydst)
-				prune_one_bundle(pol, func, &gc_list);
-		}
-	}
-	read_unlock_bh(&xfrm_policy_lock);
-
-	while (gc_list) {
-		struct dst_entry *dst = gc_list;
-		gc_list = dst->next;
-		dst_free(dst);
+	while (head) {
+		next = head->next;
+		dst_free(head);
+		head = next;
 	}
 }
 
-static int unused_bundle(struct dst_entry *dst)
-{
-	return !atomic_read(&dst->__refcnt);
-}
-
-static void __xfrm_garbage_collect(struct net *net)
-{
-	xfrm_prune_bundles(net, unused_bundle);
-}
-
-static int xfrm_flush_bundles(struct net *net)
-{
-	xfrm_prune_bundles(net, stale_bundle);
-	return 0;
-}
-
 static void xfrm_init_pmtu(struct dst_entry *dst)
 {
 	do {
@@ -2283,7 +2326,9 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 			return 0;
 		if (dst->xfrm->km.state != XFRM_STATE_VALID)
 			return 0;
-		if (xdst->genid != dst->xfrm->genid)
+		if (xdst->xfrm_genid != dst->xfrm->genid)
+			return 0;
+		if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
 			return 0;
 
 		if (strict && fl &&
@@ -2448,7 +2493,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
 
 	switch (event) {
 	case NETDEV_DOWN:
-		xfrm_flush_bundles(dev_net(dev));
+		__xfrm_garbage_collect(dev_net(dev));
 	}
 	return NOTIFY_DONE;
 }
@@ -2780,7 +2825,6 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       struct xfrm_migrate *m, int num_migrate)
 {
 	struct xfrm_migrate *mp;
-	struct dst_entry *dst;
 	int i, j, n = 0;
 
 	write_lock_bh(&pol->lock);
@@ -2805,10 +2849,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       sizeof(pol->xfrm_vec[i].saddr));
 			pol->xfrm_vec[i].encap_family = mp->new_family;
 			/* flush bundles */
-			while ((dst = pol->bundles) != NULL) {
-				pol->bundles = dst->next;
-				dst_free(dst);
-			}
+			atomic_inc(&pol->genid);
 		}
 	}
 
-- 
1.6.3.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox