Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC PATCH net-next 2/4] gianfar: Added timer feature for eTSEC
From: Manfred Rudigier @ 2010-04-07  9:46 UTC (permalink / raw)
  To: 'sandeep.kumar@freescale.com'
  Cc: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'

The timer clock module is an intrinsic feature of every eTSEC. It supports
hardware time stamping of all incoming and outgoing network packets. This
patch checks if the underlying hardware is an eTSEC and adds the
new FSL_GIANFAR_DEV_HAS_TIMER flag to the device flags. This flag is then
used in the SIOCSHWTSTAMP ioctl command to determine if HW time stamping
support is available.

Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>
---
 drivers/net/gianfar.c |   12 +++++++++++-
 drivers/net/gianfar.h |    3 +++
 2 files changed, 14 insertions(+), 1 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 309bab0..41e7726 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -743,7 +743,8 @@ static int gfar_of_init(struct of_device *ofdev, struct net_device **pdev)
 			FSL_GIANFAR_DEV_HAS_CSUM |
 			FSL_GIANFAR_DEV_HAS_VLAN |
 			FSL_GIANFAR_DEV_HAS_MAGIC_PACKET |
-			FSL_GIANFAR_DEV_HAS_EXTENDED_HASH;
+			FSL_GIANFAR_DEV_HAS_EXTENDED_HASH |
+			FSL_GIANFAR_DEV_HAS_TIMER;
 
 	ctype = of_get_property(np, "phy-connection-type", NULL);
 
@@ -777,6 +778,7 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 			struct ifreq *ifr, int cmd)
 {
 	struct hwtstamp_config config;
+	struct gfar_private *priv = netdev_priv(netdev);
 
 	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
 		return -EFAULT;
@@ -787,8 +789,12 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 
 	switch (config.tx_type) {
 	case HWTSTAMP_TX_OFF:
+		priv->hwts_tx_en = 0;
 		break;
 	case HWTSTAMP_TX_ON:
+		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
+			return -ERANGE;
+		priv->hwts_tx_en = 1;
 		return -ERANGE;
 	default:
 		return -ERANGE;
@@ -796,8 +802,12 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 
 	switch (config.rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
+		priv->hwts_rx_en = 0;
 		break;
 	default:
+		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
+			return -ERANGE;
+		priv->hwts_rx_en = 1;
 		return -ERANGE;
 	}
 
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 17d25e7..380ea48 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -885,6 +885,7 @@ struct gfar {
 #define FSL_GIANFAR_DEV_HAS_MAGIC_PACKET	0x00000100
 #define FSL_GIANFAR_DEV_HAS_BD_STASHING		0x00000200
 #define FSL_GIANFAR_DEV_HAS_BUF_STASHING	0x00000400
+#define FSL_GIANFAR_DEV_HAS_TIMER		0x00000800
 
 #if (MAXGROUPS == 2)
 #define DEFAULT_MAPPING 	0xAA
@@ -1084,6 +1085,8 @@ struct gfar_private {
 		extended_hash:1,
 		bd_stash_en:1,
 		rx_filer_enable:1,
+		hwts_tx_en:1, /* HW time stamping enabled for TX packets */
+		hwts_rx_en:1, /* HW time stamping enabled for RX packets */
 		wol_en:1; /* Wake-on-LAN enabled */
 	unsigned short padding;
 
-- 
1.6.3.3

^ permalink raw reply related

* [RFC PATCH net-next 3/4] gianfar: Added raw hardware receive time stamp generation
From: Manfred Rudigier @ 2010-04-07  9:46 UTC (permalink / raw)
  To: 'sandeep.kumar@freescale.com'
  Cc: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'

This patch configures the eTSEC to insert time stamps into all received
packets as padding alignment bytes. During the clean_rx_ring operation
these raw time stamps are extracted and copied into the
skb_shared_hwtstamps struct of the skb if required.

Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>
---
 drivers/net/gianfar.c |   30 +++++++++++++++++++++++++-----
 drivers/net/gianfar.h |    1 +
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 41e7726..9119879 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -378,6 +378,13 @@ static void gfar_init_mac(struct net_device *ndev)
 		rctrl |= RCTRL_PADDING(priv->padding);
 	}
 
+	/* Insert receive time stamps into padding alignment bytes */
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER) {
+		rctrl &= ~RCTRL_PAL_MASK;
+		rctrl |= RCTRL_PRSDEP_INIT | RCTRL_TS_ENABLE | RCTRL_PADDING(8);
+		priv->padding = 8;
+	}
+
 	/* keep vlan related bits if it's enabled */
 	if (priv->vlgrp) {
 		rctrl |= RCTRL_VLEX | RCTRL_PRSDEP_INIT;
@@ -502,7 +509,8 @@ void unlock_tx_qs(struct gfar_private *priv)
 /* Returns 1 if incoming frames use an FCB */
 static inline int gfar_uses_fcb(struct gfar_private *priv)
 {
-	return priv->vlgrp || priv->rx_csum_enable;
+	return priv->vlgrp || priv->rx_csum_enable ||
+		(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER);
 }
 
 static void free_tx_pointers(struct gfar_private *priv)
@@ -808,7 +816,8 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
 			return -ERANGE;
 		priv->hwts_rx_en = 1;
-		return -ERANGE;
+		config.rx_filter = HWTSTAMP_FILTER_ALL;
+		break;
 	}
 
 	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
@@ -1028,7 +1037,8 @@ static int gfar_probe(struct of_device *ofdev,
 	else
 		priv->padding = 0;
 
-	if (dev->features & NETIF_F_IP_CSUM)
+	if (dev->features & NETIF_F_IP_CSUM ||
+			priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER)
 		dev->hard_header_len += GMAC_FCB_LEN;
 
 	/* Program the isrg regs only if number of grps > 1 */
@@ -2520,6 +2530,17 @@ static int gfar_process_frame(struct net_device *dev, struct sk_buff *skb,
 		skb_pull(skb, amount_pull);
 	}
 
+	/* Get receive timestamp from the skb */
+	if (priv->hwts_rx_en) {
+		struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+		u64 *ns = (u64 *) skb->data;
+		memset(shhwtstamps, 0, sizeof(*shhwtstamps));
+		shhwtstamps->hwtstamp = ns_to_ktime(*ns);
+	}
+
+	if (priv->padding)
+		skb_pull(skb, priv->padding);
+
 	if (priv->rx_csum_enable)
 		gfar_rx_checksum(skb, fcb);
 
@@ -2556,8 +2577,7 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit)
 	bdp = rx_queue->cur_rx;
 	base = rx_queue->rx_bd_base;
 
-	amount_pull = (gfar_uses_fcb(priv) ? GMAC_FCB_LEN : 0) +
-		priv->padding;
+	amount_pull = (gfar_uses_fcb(priv) ? GMAC_FCB_LEN : 0);
 
 	while (!((bdp->status & RXBD_EMPTY) || (--rx_work_limit < 0))) {
 		struct sk_buff *newskb;
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 380ea48..cba2756 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -262,6 +262,7 @@ extern const char gfar_driver_version[];
 
 #define next_bd(bdp, base, ring_size) skip_bd(bdp, 1, base, ring_size)
 
+#define RCTRL_TS_ENABLE 	0x01000000
 #define RCTRL_PAL_MASK		0x001f0000
 #define RCTRL_VLEX		0x00002000
 #define RCTRL_FILREN		0x00001000
-- 
1.6.3.3

^ permalink raw reply related

* [RFC PATCH net-next 0/4] timestamping support for gianfar
From: Manfred Rudigier @ 2010-04-07  9:45 UTC (permalink / raw)
  To: 'sandeep.kumar@freescale.com'
  Cc: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'

Hello,

this patch series adds support for hardware time stamping to gianfar. It uses
the new SO_TIMESTAMPING infrastructure to deliver raw hardware timestamps to
user space applications.

Freescale CPUs with an eTSEC are able to time stamp all incoming network packets
and can also time stamp transmit packets when instructed. The time stamps are
generated by the eTSEC timer clock module which is running either from an
external oscillator or internal clock. 

The submitted patches do not initialize the timer clock module since the
oscillator frequency might be different from board to board. Thus the user 
must configure the timer clock module by hand at the moment - otherwise no
time stamps will be reported. Below is a simple code example which
shows how to configure the timer clock module on the P2020DS/RDB. It can be
used to quickly try out the patches.

Testing was done with the time stamping program from Patrick Ohly which can
be found in the kernel sources under Documentation/networking/timestamping.
I have verified the functionality on the MPC8313RDB, P2020DS and P2020RDB 
board with the latest net-2.6 kernel. Send and receive time stamps could be 
retrieved on all eTSEC ports.

Comments and suggestions are welcome.

Thanks,
Manfred

/**
 * @file etsec_tmr.c
 *
 * This simple kernel module demonstrates how to initialize the eTSEC timer
 * clock module for hardware timestamping on the P2020. It uses the eTSEC
 * internal clock (300 MHz) as clock source and programs the timer clock module
 * to count in nanoseconds. The timer resolution is 5ns. Further it configures
 * the eTSEC to insert transmit time stamps into the packet data after sending.
 *
 * For testing, the timestamping.c program from the Linux kernel sources under
 * Documentation/networking/timestamping can be used. Time stamps will not be
 * reported until this module has been loaded.
 *
 * Usage example:
 *
 * [root@p2020ds root]# insmod etsec_tmr.ko
 * [root@p2020ds root]# ./timestamping eth0 SOF_TIMESTAMPING_TX_HARDWARE
 * SOF_TIMESTAMPING_RX_HARDWARE SOF_TIMESTAMPING_RAW_HARDWARE
 *
 * Copyright (C) 2010 OMICRON electronics
 * Author: Manfred Rudigier <manfred.rudigier@omicron.at>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/ioport.h>
#include <linux/mm.h>
#include <linux/module.h>
MODULE_LICENSE("GPL");

/*
 * Physical location of the timer register block and the byte offsets of the
 * registers this module writes, relative to TMR_BASE.
 */
#define TMR_BASE   0xffe24e00 /* Timer base address of P2020 */
#define REG_SIZE   0x000000b0 /* Timer register size */
#define TMR_CTRL   0x00000000 /* Timer control register */
#define TMR_ADD    0x00000020 /* Timer drift compensation addend register */
#define TMR_PRSC   0x00000028 /* Timer prescale register */

/*
 * Mapping of the timer register block: assigned from ioremap() in
 * etsec_tmr_init() and passed to iounmap() in etsec_tmr_exit().
 */
static void* regs;

/*
 * Module entry point: reserve and map the timer register block, then program
 * the addend, prescale and control registers.
 *
 * Returns 0 on success, -EBUSY if the register region cannot be reserved, or
 * -ENOMEM if it cannot be mapped. (A bare -1 would be reported to user space
 * as -EPERM, which is misleading for both failures.)
 */
static int etsec_tmr_init(void)
{
	printk(KERN_INFO "etsec tmr init\n");
	if (!request_mem_region(TMR_BASE, REG_SIZE, "etsec_tmr")) {
		printk(KERN_ERR "request_mem_region failed\n");
		return -EBUSY;
	}
	regs = ioremap(TMR_BASE, REG_SIZE);
	if (!regs) {
		printk(KERN_ERR "ioremap failed\n");
		/* Undo the reservation so a later load attempt can succeed. */
		release_mem_region(TMR_BASE, REG_SIZE);
		return -ENOMEM;
	}

	/*
	 * Board-specific values for the P2020 setup described in the file
	 * header (internal clock source, counting in nanoseconds).
	 * NOTE(review): the individual bit meanings are not visible here;
	 * confirm against the eTSEC reference manual before reusing them on
	 * another board.
	 */
	out_be32(regs + TMR_ADD, 0xaaaaaaab);
	out_be32(regs + TMR_PRSC, 200);
	out_be32(regs + TMR_CTRL, 0x00058005);
	return 0;
}

/*
 * Module exit point: write a final value to the timer control register, then
 * drop the mapping and the region reservation taken in etsec_tmr_init().
 */
static void etsec_tmr_exit(void)
{
	/*
	 * NOTE(review): presumably this value stops or resets the timer;
	 * confirm against the eTSEC reference manual.
	 */
	out_be32(regs + TMR_CTRL, 0x00010001);
	/* The register write above must happen before the mapping goes away. */
	iounmap(regs);
	release_mem_region(TMR_BASE, REG_SIZE);
	printk(KERN_INFO "etsec tmr release\n");
}

/* Register the load and unload handlers defined above with the module loader. */
module_init(etsec_tmr_init);
module_exit(etsec_tmr_exit);

^ permalink raw reply

* [RFC PATCH net-next 4/4] gianfar: Added raw hardware transmit time stamp generation
From: Manfred Rudigier @ 2010-04-07  9:46 UTC (permalink / raw)
  To: 'sandeep.kumar@freescale.com'
  Cc: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'

This patch configures the eTSEC to time stamp outgoing packets that have
the skb_shared_tx->hardware flag set. The eTSEC is configured to write the
time stamps back to memory after the frame is transmitted. During the
clean_tx_ring operation these time stamps will be extracted and copied
into the skb_shared_hwtstamps struct of the skb if required.

Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>
---
 drivers/net/gianfar.c |  108 ++++++++++++++++++++++++++++++++++++++++---------
 drivers/net/gianfar.h |    2 +-
 2 files changed, 90 insertions(+), 20 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 9119879..becc3f3 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -803,7 +803,7 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
 			return -ERANGE;
 		priv->hwts_tx_en = 1;
-		return -ERANGE;
+		break;
 	default:
 		return -ERANGE;
 	}
@@ -1982,23 +1982,29 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct netdev_queue *txq;
 	struct gfar __iomem *regs = NULL;
 	struct txfcb *fcb = NULL;
-	struct txbd8 *txbdp, *txbdp_start, *base;
+	struct txbd8 *txbdp, *txbdp_start, *base, *txbdp_tstamp = NULL;
 	u32 lstatus;
-	int i, rq = 0;
+	int i, rq = 0, do_tstamp = 0;
 	u32 bufaddr;
 	unsigned long flags;
-	unsigned int nr_frags, length;
-
+	unsigned int nr_frags, nr_txbds, length;
+	union skb_shared_tx *shtx;
 
 	rq = skb->queue_mapping;
 	tx_queue = priv->tx_queue[rq];
 	txq = netdev_get_tx_queue(dev, rq);
 	base = tx_queue->tx_bd_base;
 	regs = tx_queue->grp->regs;
+	shtx = skb_tx(skb);
+
+	/* check if time stamp should be generated */
+	if (unlikely(shtx->hardware && priv->hwts_tx_en))
+		do_tstamp = 1;
 
 	/* make space for additional header when fcb is needed */
 	if (((skb->ip_summed == CHECKSUM_PARTIAL) ||
-			(priv->vlgrp && vlan_tx_tag_present(skb))) &&
+			(priv->vlgrp && vlan_tx_tag_present(skb)) ||
+			unlikely(do_tstamp)) &&
 			(skb_headroom(skb) < GMAC_FCB_LEN)) {
 		struct sk_buff *skb_new;
 
@@ -2015,8 +2021,14 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* total number of fragments in the SKB */
 	nr_frags = skb_shinfo(skb)->nr_frags;
 
+	/* calculate the required number of TxBDs for this skb */
+	if (unlikely(do_tstamp))
+		nr_txbds = nr_frags + 2;
+	else
+		nr_txbds = nr_frags + 1;
+
 	/* check if there is space to queue this packet */
-	if ((nr_frags+1) > tx_queue->num_txbdfree) {
+	if (nr_txbds > tx_queue->num_txbdfree) {
 		/* no space, stop the queue */
 		netif_tx_stop_queue(txq);
 		dev->stats.tx_fifo_errors++;
@@ -2028,9 +2040,19 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	txq->tx_packets ++;
 
 	txbdp = txbdp_start = tx_queue->cur_tx;
+	lstatus = txbdp->lstatus;
+
+	/* Time stamp insertion requires one additional TxBD */
+	if (unlikely(do_tstamp))
+		txbdp_tstamp = txbdp = next_txbd(txbdp, base,
+				tx_queue->tx_ring_size);
 
 	if (nr_frags == 0) {
-		lstatus = txbdp->lstatus | BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT);
+		if (unlikely(do_tstamp))
+			txbdp_tstamp->lstatus |= BD_LFLAG(TXBD_LAST |
+					TXBD_INTERRUPT);
+		else
+			lstatus |= BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT);
 	} else {
 		/* Place the fragment addresses and lengths into the TxBDs */
 		for (i = 0; i < nr_frags; i++) {
@@ -2076,11 +2098,32 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		gfar_tx_vlan(skb, fcb);
 	}
 
-	/* setup the TxBD length and buffer pointer for the first BD */
+	/* Setup tx hardware time stamping if requested */
+	if (unlikely(do_tstamp)) {
+		shtx->in_progress = 1;
+		if (fcb == NULL)
+			fcb = gfar_add_fcb(skb);
+		fcb->ptp = 1;
+		lstatus |= BD_LFLAG(TXBD_TOE);
+	}
+
 	txbdp_start->bufPtr = dma_map_single(&priv->ofdev->dev, skb->data,
 			skb_headlen(skb), DMA_TO_DEVICE);
 
-	lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | skb_headlen(skb);
+	/*
+	 * If time stamping is requested one additional TxBD must be set up. The
+	 * first TxBD points to the FCB and must have a data length of
+	 * GMAC_FCB_LEN. The second TxBD points to the actual frame data with
+	 * the full frame length.
+	 */
+	if (unlikely(do_tstamp)) {
+		txbdp_tstamp->bufPtr = txbdp_start->bufPtr + GMAC_FCB_LEN;
+		txbdp_tstamp->lstatus |= BD_LFLAG(TXBD_READY) |
+				(skb_headlen(skb) - GMAC_FCB_LEN);
+		lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | GMAC_FCB_LEN;
+	} else {
+		lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | skb_headlen(skb);
+	}
 
 	/*
 	 * We can work in parallel with gfar_clean_tx_ring(), except
@@ -2120,7 +2163,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	tx_queue->cur_tx = next_txbd(txbdp, base, tx_queue->tx_ring_size);
 
 	/* reduce TxBD free count */
-	tx_queue->num_txbdfree -= (nr_frags + 1);
+	tx_queue->num_txbdfree -= (nr_txbds);
 
 	dev->trans_start = jiffies;
 
@@ -2311,16 +2354,18 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 	struct net_device *dev = tx_queue->dev;
 	struct gfar_private *priv = netdev_priv(dev);
 	struct gfar_priv_rx_q *rx_queue = NULL;
-	struct txbd8 *bdp;
+	struct txbd8 *bdp, *next = NULL;
 	struct txbd8 *lbdp = NULL;
 	struct txbd8 *base = tx_queue->tx_bd_base;
 	struct sk_buff *skb;
 	int skb_dirtytx;
 	int tx_ring_size = tx_queue->tx_ring_size;
-	int frags = 0;
+	int frags = 0, nr_txbds = 0;
 	int i;
 	int howmany = 0;
 	u32 lstatus;
+	size_t buflen;
+	union skb_shared_tx *shtx;
 
 	rx_queue = priv->rx_queue[tx_queue->qindex];
 	bdp = tx_queue->dirty_tx;
@@ -2330,7 +2375,18 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 		unsigned long flags;
 
 		frags = skb_shinfo(skb)->nr_frags;
-		lbdp = skip_txbd(bdp, frags, base, tx_ring_size);
+
+		/*
+		 * When time stamping, one additional TxBD must be freed.
+		 * Also, we need to dma_unmap_single() the TxPAL.
+		 */
+		shtx = skb_tx(skb);
+		if (unlikely(shtx->in_progress))
+			nr_txbds = frags + 2;
+		else
+			nr_txbds = frags + 1;
+
+		lbdp = skip_txbd(bdp, nr_txbds - 1, base, tx_ring_size);
 
 		lstatus = lbdp->lstatus;
 
@@ -2339,10 +2395,24 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 				(lstatus & BD_LENGTH_MASK))
 			break;
 
-		dma_unmap_single(&priv->ofdev->dev,
-				bdp->bufPtr,
-				bdp->length,
-				DMA_TO_DEVICE);
+		if (unlikely(shtx->in_progress)) {
+			next = next_txbd(bdp, base, tx_ring_size);
+			buflen = next->length + GMAC_FCB_LEN;
+		} else
+			buflen = bdp->length;
+
+		dma_unmap_single(&priv->ofdev->dev, bdp->bufPtr,
+				buflen, DMA_TO_DEVICE);
+
+		if (unlikely(shtx->in_progress)) {
+			struct skb_shared_hwtstamps shhwtstamps;
+			u64 *ns = (u64*) (((u32)skb->data + 0x10) & ~0x7);
+			memset(&shhwtstamps, 0, sizeof(shhwtstamps));
+			shhwtstamps.hwtstamp = ns_to_ktime(*ns);
+			skb_tstamp_tx(skb, &shhwtstamps);
+			bdp->lstatus &= BD_LFLAG(TXBD_WRAP);
+			bdp = next;
+		}
 
 		bdp->lstatus &= BD_LFLAG(TXBD_WRAP);
 		bdp = next_txbd(bdp, base, tx_ring_size);
@@ -2374,7 +2444,7 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 
 		howmany++;
 		spin_lock_irqsave(&tx_queue->txlock, flags);
-		tx_queue->num_txbdfree += frags + 1;
+		tx_queue->num_txbdfree += nr_txbds;
 		spin_unlock_irqrestore(&tx_queue->txlock, flags);
 	}
 
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index cba2756..baed05c 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -540,7 +540,7 @@ struct txbd8
 
 struct txfcb {
 	u8	flags;
-	u8	reserved;
+	u8	ptp;    /* Flag to enable tx timestamping */
 	u8	l4os;	/* Level 4 Header Offset */
 	u8	l3os; 	/* Level 3 Header Offset */
 	u16	phcs;	/* Pseudo-header Checksum */
-- 
1.6.3.3

^ permalink raw reply related

* Re: [RFC PATCH net-next 1/4] gianfar: Added stub support for SIOCSHWTSTAMP
From: David Miller @ 2010-04-07 10:26 UTC (permalink / raw)
  To: Manfred.Rudigier; +Cc: sandeep.kumar, netdev, linuxppc-dev
In-Reply-To: <95DC1AA8EC908B48939B72CF375AA5E311A9F4DF@alice.at.omicron.at>

From: Manfred Rudigier <Manfred.Rudigier@omicron.at>
Date: Wed, 7 Apr 2010 11:46:08 +0200

> This ioctl command is required for enabling hardware time stamping support
> for network packets, see Documentation/networking/timestamping.txt. At the
> moment nothing will be done for all requests that enable time stamping and
> thus ERANGE will be returned.
> 
> Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>

This is completely pointless.

Something sane should happen so that every driver doesn't
need to add this stub code just to return -ERANGE.

^ permalink raw reply

* [PATCH 0/4] caching bundles, iteration 5
From: Timo Teras @ 2010-04-07 10:30 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras

Changes:
- ops->delete() is now called if flow_cache_genid was changed to
  ensure that resolver does not use stale data
- removed bumping of policy->genid when inserting new policy since
  flow_cache_genid ensures everything is regenerated (thanks Herbert!)
- added unlikely/likely to flow_cache_lookup to favor fast path
  (cache hit)
- added Herbert's ack to 1/4

Compiles, boots and VPN goes up on my test box. Earlier iterations
tested to stay up 3+ days without noticing leaks or other problems.

Timo Teras (4):
  flow: virtualize flow cache entry methods
  xfrm: cache bundles instead of policies for outgoing flows
  xfrm: remove policy garbage collection
  flow: delayed deletion of flow cache entries

 include/net/flow.h      |   23 ++-
 include/net/xfrm.h      |   12 +-
 net/core/flow.c         |  212 ++++++++-----
 net/ipv4/xfrm4_policy.c |   22 --
 net/ipv6/xfrm6_policy.c |   31 --
 net/xfrm/xfrm_policy.c  |  820 +++++++++++++++++++++++++----------------------
 6 files changed, 591 insertions(+), 529 deletions(-)


^ permalink raw reply

* [PATCH 1/4] flow: virtualize flow cache entry methods
From: Timo Teras @ 2010-04-07 10:30 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270636207-20933-1-git-send-email-timo.teras@iki.fi>

This allows the cached object to be validated before it is returned.
It also allows the object to be destroyed properly if the last reference
was held in the flow cache. This is also preparation for caching
bundles in the flow cache.

In return for virtualizing the methods, we save on:
- not having to regenerate the whole flow cache on policy removal:
  each flow matching a killed policy gets refreshed as the getter
  function notices it smartly.
- we do not have to call flow_cache_flush from policy gc, since the
  flow cache now properly deletes the object if it had any references

Signed-off-by: Timo Teras <timo.teras@iki.fi>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/net/flow.h     |   23 +++++++--
 include/net/xfrm.h     |    2 +
 net/core/flow.c        |  128 +++++++++++++++++++++++++----------------------
 net/xfrm/xfrm_policy.c |  112 ++++++++++++++++++++++++++++--------------
 4 files changed, 163 insertions(+), 102 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..bb08692 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -86,11 +86,26 @@ struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
+struct flow_cache_ops;
+
+struct flow_cache_object {
+	const struct flow_cache_ops *ops;
+};
+
+struct flow_cache_ops {
+	struct flow_cache_object *(*get)(struct flow_cache_object *);
+	int (*check)(struct flow_cache_object *);
+	void (*delete)(struct flow_cache_object *);
+};
+
+typedef struct flow_cache_object *(*flow_resolve_t)(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, struct flow_cache_object *oldobj, void *ctx);
+
+extern struct flow_cache_object *flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, flow_resolve_t resolver, void *ctx);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..35396e2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -19,6 +19,7 @@
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/flow.h>
 
 #include <linux/interrupt.h>
 
@@ -481,6 +482,7 @@ struct xfrm_policy {
 	atomic_t		refcnt;
 	struct timer_list	timer;
 
+	struct flow_cache_object flo;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
diff --git a/net/core/flow.c b/net/core/flow.c
index 1d27ca6..521df52 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,17 +26,16 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	struct flow_cache_entry		*next;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_object	*object;
 };
 
 struct flow_cache_percpu {
-	struct flow_cache_entry **	hash_table;
+	struct flow_cache_entry		**hash_table;
 	int				hash_count;
 	u32				hash_rnd;
 	int				hash_rnd_recalc;
@@ -44,7 +43,7 @@ struct flow_cache_percpu {
 };
 
 struct flow_flush_info {
-	struct flow_cache *		cache;
+	struct flow_cache		*cache;
 	atomic_t			cpuleft;
 	struct completion		completion;
 };
@@ -52,7 +51,7 @@ struct flow_flush_info {
 struct flow_cache {
 	u32				hash_shift;
 	unsigned long			order;
-	struct flow_cache_percpu *	percpu;
+	struct flow_cache_percpu	*percpu;
 	struct notifier_block		hotcpu_notifier;
 	int				low_watermark;
 	int				high_watermark;
@@ -78,12 +77,21 @@ static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->object && !fle->object->ops->check(fle->object))
+		return 0;
+	return 1;
+}
+
 static void flow_entry_kill(struct flow_cache *fc,
 			    struct flow_cache_percpu *fcp,
 			    struct flow_cache_entry *fle)
 {
 	if (fle->object)
-		atomic_dec(fle->object_ref);
+		fle->object->ops->delete(fle->object);
 	kmem_cache_free(flow_cachep, fle);
 	fcp->hash_count--;
 }
@@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 	int i;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		int k = 0;
+		int saved = 0;
 
 		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
 		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(fc, fcp, fle);
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+				flp = &fle->next;
+			} else {
+				*flp = fle->next;
+				flow_entry_kill(fc, fcp, fle);
+			}
 		}
 	}
 }
@@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
+		  flow_resolve_t resolver, void *ctx)
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_object *flo;
 	unsigned int hash;
 
 	local_bh_disable();
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
+	flo = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
 	if (!fcp->hash_table)
@@ -185,27 +198,17 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 
 	if (fcp->hash_rnd_recalc)
 		flow_new_hash_rnd(fc, fcp);
-	hash = flow_hash_code(fc, fcp, key);
 
+	hash = flow_hash_code(fc, fcp, key);
 	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+		    flow_key_compare(key, &fle->key) == 0)
 			break;
-		}
 	}
 
-	if (!fle) {
+	if (unlikely(!fle)) {
 		if (fcp->hash_count > fc->high_watermark)
 			flow_cache_shrink(fc, fcp);
 
@@ -219,33 +222,39 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			fle->object = NULL;
 			fcp->hash_count++;
 		}
+	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+		flo = fle->object;
+		if (!flo)
+			goto ret_object;
+		flo = flo->ops->get(flo);
+		if (flo)
+			goto ret_object;
+	} else if (fle->object) {
+	        flo = fle->object;
+	        flo->ops->delete(flo);
+	        fle->object = NULL;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
-		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	flo = NULL;
+	if (fle) {
+		flo = fle->object;
+		fle->object = NULL;
+	}
+	flo = resolver(net, key, family, dir, flo, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (!IS_ERR(flo))
+			fle->object = flo;
+		else
+			fle->genid--;
+	} else {
+		if (flo && !IS_ERR(flo))
+			flo->ops->delete(flo);
 	}
+ret_object:
+	local_bh_enable();
+	return flo;
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
@@ -261,13 +270,12 @@ static void flow_cache_flush_tasklet(unsigned long data)
 
 		fle = fcp->hash_table[i];
 		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+			if (flow_entry_valid(fle))
 				continue;
 
+			if (fle->object)
+				fle->object->ops->delete(fle->object);
 			fle->object = NULL;
-			atomic_dec(fle->object_ref);
 		}
 	}
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 82789cf..7722bae 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -216,6 +216,35 @@ expired:
 	xfrm_pol_put(xp);
 }
 
+static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	if (unlikely(pol->walk.dead))
+		flo = NULL;
+	else
+		xfrm_pol_hold(pol);
+
+	return flo;
+}
+
+static int xfrm_policy_flo_check(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+	return !pol->walk.dead;
+}
+
+static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
+{
+	xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
+}
+
+static const struct flow_cache_ops xfrm_policy_fc_ops = {
+	.get = xfrm_policy_flo_get,
+	.check = xfrm_policy_flo_check,
+	.delete = xfrm_policy_flo_delete,
+};
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -236,6 +265,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 		atomic_set(&policy->refcnt, 1);
 		setup_timer(&policy->timer, xfrm_policy_timer,
 				(unsigned long)policy);
+		policy->flo.ops = &xfrm_policy_fc_ops;
 	}
 	return policy;
 }
@@ -269,9 +299,6 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
@@ -661,10 +688,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -703,10 +728,8 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
@@ -822,7 +845,6 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -976,32 +998,35 @@ fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct flow_cache_object *
+xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
+		   u8 dir, struct flow_cache_object *old_obj, void *ctx)
 {
 	struct xfrm_policy *pol;
-	int err = 0;
+
+	if (old_obj)
+		xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
 
 #ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
 #endif
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
+	return NULL;
+
+found:
+	/* Resolver returns two references:
+	 * one for cache and one for caller of flow_cache_lookup() */
+	xfrm_pol_hold(pol);
+
+	return &pol->flo;
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1091,8 +1116,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
-		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1578,18 +1601,24 @@ restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_object *flo;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
+		flo = flow_cache_lookup(net, fl, dst_orig->ops->family,
+					dir, xfrm_policy_lookup, NULL);
+		err = PTR_ERR(flo);
+		if (IS_ERR(flo)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 			goto dropdst;
 		}
+		if (flo)
+			policy = container_of(flo, struct xfrm_policy, flo);
+		else
+			policy = NULL;
 	}
 
 	if (!policy)
@@ -1939,9 +1968,16 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
+	if (!pol) {
+		struct flow_cache_object *flo;
+
+		flo = flow_cache_lookup(net, &fl, family, fl_dir,
+					xfrm_policy_lookup, NULL);
+		if (IS_ERR_OR_NULL(flo))
+			pol = ERR_CAST(flo);
+		else
+			pol = container_of(flo, struct xfrm_policy, flo);
+	}
 
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 2/4] xfrm: cache bundles instead of policies for outgoing flows
From: Timo Teras @ 2010-04-07 10:30 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270636207-20933-1-git-send-email-timo.teras@iki.fi>

__xfrm_lookup() is called for each packet transmitted out of the
system. xfrm_find_bundle() does a linear search, which can
kill system performance depending on how many bundles are
required per policy.

This modifies __xfrm_lookup() to store bundles directly in
the flow cache. If we do not get a hit, we just create a new
bundle instead of doing a slow search. This means that we can now
get multiple xfrm_dst's for the same flow (on a per-cpu basis).

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 include/net/xfrm.h      |   10 +-
 net/ipv4/xfrm4_policy.c |   22 --
 net/ipv6/xfrm6_policy.c |   31 --
 net/xfrm/xfrm_policy.c  |  711 +++++++++++++++++++++++++----------------------
 4 files changed, 383 insertions(+), 391 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 35396e2..625dd61 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -267,7 +267,6 @@ struct xfrm_policy_afinfo {
 					       xfrm_address_t *saddr,
 					       xfrm_address_t *daddr);
 	int			(*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr);
-	struct dst_entry	*(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy);
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
@@ -483,13 +482,13 @@ struct xfrm_policy {
 	struct timer_list	timer;
 
 	struct flow_cache_object flo;
+	atomic_t		genid;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
 	struct xfrm_selector	selector;
 	struct xfrm_lifetime_cfg lft;
 	struct xfrm_lifetime_cur curlft;
-	struct dst_entry       *bundles;
 	struct xfrm_policy_walk_entry walk;
 	u8			type;
 	u8			action;
@@ -879,11 +878,15 @@ struct xfrm_dst {
 		struct rt6_info		rt6;
 	} u;
 	struct dst_entry *route;
+	struct flow_cache_object flo;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	int num_pols, num_xfrms;
 #ifdef CONFIG_XFRM_SUB_POLICY
 	struct flowi *origin;
 	struct xfrm_selector *partner;
 #endif
-	u32 genid;
+	u32 xfrm_genid;
+	u32 policy_genid;
 	u32 route_mtu_cached;
 	u32 child_mtu_cached;
 	u32 route_cookie;
@@ -893,6 +896,7 @@ struct xfrm_dst {
 #ifdef CONFIG_XFRM
 static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 {
+	xfrm_pols_put(xdst->pols, xdst->num_pols);
 	dst_release(xdst->route);
 	if (likely(xdst->u.dst.xfrm))
 		xfrm_state_put(xdst->u.dst.xfrm);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e4a1483..1705476 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -59,27 +59,6 @@ static int xfrm4_get_saddr(struct net *net,
 	return 0;
 }
 
-static struct dst_entry *
-__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
-{
-	struct dst_entry *dst;
-
-	read_lock_bh(&policy->lock);
-	for (dst = policy->bundles; dst; dst = dst->next) {
-		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
-		if (xdst->u.rt.fl.oif == fl->oif &&	/*XXX*/
-		    xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
-		    xdst->u.rt.fl.fl4_src == fl->fl4_src &&
-		    xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
-		    xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
-			dst_clone(dst);
-			break;
-		}
-	}
-	read_unlock_bh(&policy->lock);
-	return dst;
-}
-
 static int xfrm4_get_tos(struct flowi *fl)
 {
 	return fl->fl4_tos;
@@ -259,7 +238,6 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.dst_ops =		&xfrm4_dst_ops,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.get_saddr =		xfrm4_get_saddr,
-	.find_bundle = 		__xfrm4_find_bundle,
 	.decode_session =	_decode_session4,
 	.get_tos =		xfrm4_get_tos,
 	.init_path =		xfrm4_init_path,
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ae18165..8c452fd 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -67,36 +67,6 @@ static int xfrm6_get_saddr(struct net *net,
 	return 0;
 }
 
-static struct dst_entry *
-__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
-{
-	struct dst_entry *dst;
-
-	/* Still not clear if we should set fl->fl6_{src,dst}... */
-	read_lock_bh(&policy->lock);
-	for (dst = policy->bundles; dst; dst = dst->next) {
-		struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
-		struct in6_addr fl_dst_prefix, fl_src_prefix;
-
-		ipv6_addr_prefix(&fl_dst_prefix,
-				 &fl->fl6_dst,
-				 xdst->u.rt6.rt6i_dst.plen);
-		ipv6_addr_prefix(&fl_src_prefix,
-				 &fl->fl6_src,
-				 xdst->u.rt6.rt6i_src.plen);
-		if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) &&
-		    ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) &&
-		    xfrm_bundle_ok(policy, xdst, fl, AF_INET6,
-				   (xdst->u.rt6.rt6i_dst.plen != 128 ||
-				    xdst->u.rt6.rt6i_src.plen != 128))) {
-			dst_clone(dst);
-			break;
-		}
-	}
-	read_unlock_bh(&policy->lock);
-	return dst;
-}
-
 static int xfrm6_get_tos(struct flowi *fl)
 {
 	return 0;
@@ -291,7 +261,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.dst_ops =		&xfrm6_dst_ops,
 	.dst_lookup =		xfrm6_dst_lookup,
 	.get_saddr = 		xfrm6_get_saddr,
-	.find_bundle =		__xfrm6_find_bundle,
 	.decode_session =	_decode_session6,
 	.get_tos =		xfrm6_get_tos,
 	.init_path =		xfrm6_init_path,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7722bae..06ccc71 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -37,6 +37,8 @@
 DEFINE_MUTEX(xfrm_cfg_mutex);
 EXPORT_SYMBOL(xfrm_cfg_mutex);
 
+static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock);
+static struct dst_entry *xfrm_policy_sk_bundles;
 static DEFINE_RWLOCK(xfrm_policy_lock);
 
 static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
@@ -50,6 +52,7 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
 static void xfrm_init_pmtu(struct dst_entry *dst);
+static int stale_bundle(struct dst_entry *dst);
 
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
@@ -277,8 +280,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 {
 	BUG_ON(!policy->walk.dead);
 
-	BUG_ON(policy->bundles);
-
 	if (del_timer(&policy->timer))
 		BUG();
 
@@ -289,12 +290,7 @@ EXPORT_SYMBOL(xfrm_policy_destroy);
 
 static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 {
-	struct dst_entry *dst;
-
-	while ((dst = policy->bundles) != NULL) {
-		policy->bundles = dst->next;
-		dst_free(dst);
-	}
+	atomic_inc(&policy->genid);
 
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
@@ -572,7 +568,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	struct xfrm_policy *delpol;
 	struct hlist_head *chain;
 	struct hlist_node *entry, *newpos;
-	struct dst_entry *gc_list;
 	u32 mark = policy->mark.v & policy->mark.m;
 
 	write_lock_bh(&xfrm_policy_lock);
@@ -622,34 +617,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	else if (xfrm_bydst_should_resize(net, dir, NULL))
 		schedule_work(&net->xfrm.policy_hash_work);
 
-	read_lock_bh(&xfrm_policy_lock);
-	gc_list = NULL;
-	entry = &policy->bydst;
-	hlist_for_each_entry_continue(policy, entry, bydst) {
-		struct dst_entry *dst;
-
-		write_lock(&policy->lock);
-		dst = policy->bundles;
-		if (dst) {
-			struct dst_entry *tail = dst;
-			while (tail->next)
-				tail = tail->next;
-			tail->next = gc_list;
-			gc_list = dst;
-
-			policy->bundles = NULL;
-		}
-		write_unlock(&policy->lock);
-	}
-	read_unlock_bh(&xfrm_policy_lock);
-
-	while (gc_list) {
-		struct dst_entry *dst = gc_list;
-
-		gc_list = dst->next;
-		dst_free(dst);
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
@@ -998,6 +965,19 @@ fail:
 	return ret;
 }
 
+static struct xfrm_policy *
+__xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_policy *pol;
+
+	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+	if (pol != NULL)
+		return pol;
+#endif
+	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+}
+
 static struct flow_cache_object *
 xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
 		   u8 dir, struct flow_cache_object *old_obj, void *ctx)
@@ -1007,21 +987,10 @@ xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
 	if (old_obj)
 		xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
 
-#ifdef CONFIG_XFRM_SUB_POLICY
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol))
+	pol = __xfrm_policy_lookup(net, fl, family, dir);
+	if (IS_ERR_OR_NULL(pol))
 		return ERR_CAST(pol);
-	if (pol)
-		goto found;
-#endif
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol))
-		return ERR_CAST(pol);
-	if (pol)
-		goto found;
-	return NULL;
 
-found:
 	/* Resolver returns two references:
 	 * one for cache and one for caller of flow_cache_lookup() */
 	xfrm_pol_hold(pol);
@@ -1313,18 +1282,6 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
  * still valid.
  */
 
-static struct dst_entry *
-xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
-{
-	struct dst_entry *x;
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	if (unlikely(afinfo == NULL))
-		return ERR_PTR(-EINVAL);
-	x = afinfo->find_bundle(fl, policy);
-	xfrm_policy_put_afinfo(afinfo);
-	return x;
-}
-
 static inline int xfrm_get_tos(struct flowi *fl, int family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
@@ -1340,6 +1297,54 @@ static inline int xfrm_get_tos(struct flowi *fl, int family)
 	return tos;
 }
 
+static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
+{
+	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	if (xdst->route == NULL) {
+		/* Dummy bundle - if it has xfrms we were not
+		 * able to build bundle as template resolution failed.
+		 * It means we need to try again resolving. */
+		if (xdst->num_xfrms > 0)
+			return NULL;
+	} else {
+		/* Real bundle */
+		if (stale_bundle(dst))
+			return NULL;
+	}
+
+	dst_hold(dst);
+	return flo;
+}
+
+static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
+{
+	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	if (!xdst->route)
+		return 0;
+	if (stale_bundle(dst))
+		return 0;
+
+	return 1;
+}
+
+static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
+{
+	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+	struct dst_entry *dst = &xdst->u.dst;
+
+	dst_free(dst);
+}
+
+static const struct flow_cache_ops xfrm_bundle_fc_ops = {
+	.get = xfrm_bundle_flo_get,
+	.check = xfrm_bundle_flo_check,
+	.delete = xfrm_bundle_flo_delete,
+};
+
 static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
@@ -1362,9 +1367,10 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 		BUG();
 	}
 	xdst = dst_alloc(dst_ops) ?: ERR_PTR(-ENOBUFS);
-
 	xfrm_policy_put_afinfo(afinfo);
 
+	xdst->flo.ops = &xfrm_bundle_fc_ops;
+
 	return xdst;
 }
 
@@ -1402,6 +1408,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	return err;
 }
 
+
 /* Allocate chain of dst_entry's, attach known xfrm's, calculate
  * all the metrics... Shortly, bundle a bundle.
  */
@@ -1465,7 +1472,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 			dst_hold(dst);
 
 		dst1->xfrm = xfrm[i];
-		xdst->genid = xfrm[i]->genid;
+		xdst->xfrm_genid = xfrm[i]->genid;
 
 		dst1->obsolete = -1;
 		dst1->flags |= DST_HOST;
@@ -1558,7 +1565,186 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
 #endif
 }
 
-static int stale_bundle(struct dst_entry *dst);
+static int xfrm_expand_policies(struct flowi *fl, u16 family,
+				struct xfrm_policy **pols,
+				int *num_pols, int *num_xfrms)
+{
+	int i;
+
+	if (*num_pols == 0 || !pols[0]) {
+		*num_pols = 0;
+		*num_xfrms = 0;
+		return 0;
+	}
+	if (IS_ERR(pols[0]))
+		return PTR_ERR(pols[0]);
+
+	*num_xfrms = pols[0]->xfrm_nr;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW &&
+	    pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
+						    XFRM_POLICY_TYPE_MAIN,
+						    fl, family,
+						    XFRM_POLICY_OUT);
+		if (pols[1]) {
+			if (IS_ERR(pols[1])) {
+				xfrm_pols_put(pols, *num_pols);
+				return PTR_ERR(pols[1]);
+			}
+			(*num_pols) ++;
+			(*num_xfrms) += pols[1]->xfrm_nr;
+		}
+	}
+#endif
+	for (i = 0; i < *num_pols; i++) {
+		if (pols[i]->action != XFRM_POLICY_ALLOW) {
+			*num_xfrms = -1;
+			break;
+		}
+	}
+
+	return 0;
+
+}
+
+static struct xfrm_dst *
+xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
+			       struct flowi *fl, u16 family,
+			       struct dst_entry *dst_orig)
+{
+	struct net *net = xp_net(pols[0]);
+	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+	struct dst_entry *dst;
+	struct xfrm_dst *xdst;
+	int err;
+
+	/* Try to instantiate a bundle */
+	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
+	if (err < 0) {
+		if (err != -EAGAIN)
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		return ERR_PTR(err);
+	}
+
+	dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
+	if (IS_ERR(dst)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
+		return ERR_CAST(dst);
+	}
+
+	xdst = (struct xfrm_dst *)dst;
+	xdst->num_xfrms = err;
+	if (num_pols > 1)
+		err = xfrm_dst_update_parent(dst, &pols[1]->selector);
+	else
+		err = xfrm_dst_update_origin(dst, fl);
+	if (unlikely(err)) {
+		dst_free(dst);
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
+		return ERR_PTR(err);
+	}
+
+	xdst->num_pols = num_pols;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+	xdst->policy_genid = atomic_read(&pols[0]->genid);
+
+	return xdst;
+}
+
+static struct flow_cache_object *
+xfrm_bundle_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir,
+		   struct flow_cache_object *oldflo, void *ctx)
+{
+	struct dst_entry *dst_orig = (struct dst_entry *)ctx;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	struct xfrm_dst *xdst, *new_xdst;
+	int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
+
+	/* Check if the policies from old bundle are usable */
+	xdst = NULL;
+	if (oldflo) {
+		xdst = container_of(oldflo, struct xfrm_dst, flo);
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		pol_dead = 0;
+		for (i = 0; i < num_pols; i++) {
+			pols[i] = xdst->pols[i];
+			pol_dead |= pols[i]->walk.dead;
+		}
+		if (pol_dead) {
+			dst_free(&xdst->u.dst);
+			xdst = NULL;
+			num_pols = 0;
+			num_xfrms = 0;
+			oldflo = NULL;
+		}
+	}
+
+	/* Resolve policies to use if we couldn't get them from
+	 * previous cache entry */
+	if (xdst == NULL) {
+		num_pols = 1;
+		pols[0] = __xfrm_policy_lookup(net, fl, family, dir);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
+			goto inc_error;
+		if (num_pols == 0)
+			return NULL;
+		if (num_xfrms <= 0)
+			goto make_dummy_bundle;
+	}
+
+	new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig);
+	if (IS_ERR(new_xdst)) {
+		err = PTR_ERR(new_xdst);
+		if (err != -EAGAIN)
+			goto error;
+		if (oldflo == NULL)
+			goto make_dummy_bundle;
+		dst_hold(&xdst->u.dst);
+		return oldflo;
+	}
+
+	/* Kill the previous bundle */
+	if (xdst) {
+		/* The policies were stolen for newly generated bundle */
+		xdst->num_pols = 0;
+		dst_free(&xdst->u.dst);
+	}
+
+	/* Flow cache does not have reference, it dst_free()'s,
+	 * but we do need to return one reference for original caller */
+	dst_hold(&new_xdst->u.dst);
+	return &new_xdst->flo;
+
+make_dummy_bundle:
+	/* We found policies, but there's no bundles to instantiate:
+	 * either because the policy blocks, has no transformations or
+	 * we could not build template (no xfrm_states).*/
+	xdst = xfrm_alloc_dst(net, family);
+	if (IS_ERR(xdst)) {
+		xfrm_pols_put(pols, num_pols);
+		return ERR_CAST(xdst);
+	}
+	xdst->num_pols = num_pols;
+	xdst->num_xfrms = num_xfrms;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+
+	dst_hold(&xdst->u.dst);
+	return &xdst->flo;
+
+inc_error:
+	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+error:
+	if (xdst != NULL)
+		dst_free(&xdst->u.dst);
+	else
+		xfrm_pols_put(pols, num_pols);
+	return ERR_PTR(err);
+}
 
 /* Main function: finds/creates a bundle for given flow.
  *
@@ -1568,248 +1754,152 @@ static int stale_bundle(struct dst_entry *dst);
 int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
 		  struct sock *sk, int flags)
 {
-	struct xfrm_policy *policy;
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
-	int npols;
-	int pol_dead;
-	int xfrm_nr;
-	int pi;
-	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
-	struct dst_entry *dst, *dst_orig = *dst_p;
-	int nx = 0;
-	int err;
-	u32 genid;
-	u16 family;
+	struct flow_cache_object *flo;
+	struct xfrm_dst *xdst;
+	struct dst_entry *dst, *dst_orig = *dst_p, *route;
+	u16 family = dst_orig->ops->family;
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+	int i, err, num_pols, num_xfrms, drop_pols = 0;
 
 restart:
-	genid = atomic_read(&flow_cache_genid);
-	policy = NULL;
-	for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
-		pols[pi] = NULL;
-	npols = 0;
-	pol_dead = 0;
-	xfrm_nr = 0;
+	dst = NULL;
+	xdst = NULL;
+	route = NULL;
 
 	if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
-		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		num_pols = 1;
+		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
 			goto dropdst;
+
+		if (num_pols) {
+			if (num_xfrms <= 0) {
+				drop_pols = num_pols;
+				goto no_transform;
+			}
+
+			xdst = xfrm_resolve_and_create_bundle(
+					pols, num_pols, fl,
+					family, dst_orig);
+			if (IS_ERR(xdst)) {
+				xfrm_pols_put(pols, num_pols);
+				err = PTR_ERR(xdst);
+				goto dropdst;
+			}
+
+			spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+			xdst->u.dst.next = xfrm_policy_sk_bundles;
+			xfrm_policy_sk_bundles = &xdst->u.dst;
+			spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
+
+			route = xdst->route;
 		}
 	}
 
-	if (!policy) {
-		struct flow_cache_object *flo;
-
+	if (xdst == NULL) {
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		flo = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					dir, xfrm_policy_lookup, NULL);
-		err = PTR_ERR(flo);
+		flo = flow_cache_lookup(net, fl, family, dir,
+					xfrm_bundle_lookup, dst_orig);
+		if (flo == NULL)
+			goto nopol;
 		if (IS_ERR(flo)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+			err = PTR_ERR(flo);
 			goto dropdst;
 		}
-		if (flo)
-			policy = container_of(flo, struct xfrm_policy, flo);
-		else
-			policy = NULL;
+		xdst = container_of(flo, struct xfrm_dst, flo);
+
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols);
+		route = xdst->route;
+	}
+
+	dst = &xdst->u.dst;
+	if (route == NULL && num_xfrms > 0) {
+		/* The only case when xfrm_bundle_lookup() returns a
+		 * bundle with null route, is when the template could
+		 * not be resolved. It means policies are there, but
+		 * bundle could not be created, since we don't yet
+		 * have the xfrm_state's. We need to wait for KM to
+		 * negotiate new SA's or bail out with error.*/
+		if (net->xfrm.sysctl_larval_drop) {
+			/* EREMOTE tells the caller to generate
+			 * a one-shot blackhole route. */
+			dst_release(dst);
+			xfrm_pols_put(pols, num_pols);
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+			return -EREMOTE;
+		}
+		if (flags & XFRM_LOOKUP_WAIT) {
+			DECLARE_WAITQUEUE(wait, current);
+
+			add_wait_queue(&net->xfrm.km_waitq, &wait);
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+			set_current_state(TASK_RUNNING);
+			remove_wait_queue(&net->xfrm.km_waitq, &wait);
+
+			if (!signal_pending(current)) {
+				dst_release(dst);
+				goto restart;
+			}
+
+			err = -ERESTART;
+		} else
+			err = -EAGAIN;
+
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+		goto error;
 	}
 
-	if (!policy)
+no_transform:
+	if (num_pols == 0)
 		goto nopol;
 
-	family = dst_orig->ops->family;
-	pols[0] = policy;
-	npols ++;
-	xfrm_nr += pols[0]->xfrm_nr;
-
-	err = -ENOENT;
-	if ((flags & XFRM_LOOKUP_ICMP) && !(policy->flags & XFRM_POLICY_ICMP))
+	if ((flags & XFRM_LOOKUP_ICMP) &&
+	    !(pols[0]->flags & XFRM_POLICY_ICMP)) {
+		err = -ENOENT;
 		goto error;
+	}
 
-	policy->curlft.use_time = get_seconds();
+	for (i = 0; i < num_pols; i++)
+		pols[i]->curlft.use_time = get_seconds();
 
-	switch (policy->action) {
-	default:
-	case XFRM_POLICY_BLOCK:
+	if (num_xfrms < 0) {
 		/* Prohibit the flow */
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
 		err = -EPERM;
 		goto error;
-
-	case XFRM_POLICY_ALLOW:
-#ifndef CONFIG_XFRM_SUB_POLICY
-		if (policy->xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pol_put(policy);
-			return 0;
-		}
-#endif
-
-		/* Try to find matching bundle.
-		 *
-		 * LATER: help from flow cache. It is optional, this
-		 * is required only for output policy.
-		 */
-		dst = xfrm_find_bundle(fl, policy, family);
-		if (IS_ERR(dst)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			err = PTR_ERR(dst);
-			goto error;
-		}
-
-		if (dst)
-			break;
-
-#ifdef CONFIG_XFRM_SUB_POLICY
-		if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
-			pols[1] = xfrm_policy_lookup_bytype(net,
-							    XFRM_POLICY_TYPE_MAIN,
-							    fl, family,
-							    XFRM_POLICY_OUT);
-			if (pols[1]) {
-				if (IS_ERR(pols[1])) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
-					err = PTR_ERR(pols[1]);
-					goto error;
-				}
-				if (pols[1]->action == XFRM_POLICY_BLOCK) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
-					err = -EPERM;
-					goto error;
-				}
-				npols ++;
-				xfrm_nr += pols[1]->xfrm_nr;
-			}
-		}
-
-		/*
-		 * Because neither flowi nor bundle information knows about
-		 * transformation template size. On more than one policy usage
-		 * we can realize whether all of them is bypass or not after
-		 * they are searched. See above not-transformed bypass
-		 * is surrounded by non-sub policy configuration, too.
-		 */
-		if (xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
-#endif
-		nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
-
-		if (unlikely(nx<0)) {
-			err = nx;
-			if (err == -EAGAIN && net->xfrm.sysctl_larval_drop) {
-				/* EREMOTE tells the caller to generate
-				 * a one-shot blackhole route.
-				 */
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-				xfrm_pol_put(policy);
-				return -EREMOTE;
-			}
-			if (err == -EAGAIN && (flags & XFRM_LOOKUP_WAIT)) {
-				DECLARE_WAITQUEUE(wait, current);
-
-				add_wait_queue(&net->xfrm.km_waitq, &wait);
-				set_current_state(TASK_INTERRUPTIBLE);
-				schedule();
-				set_current_state(TASK_RUNNING);
-				remove_wait_queue(&net->xfrm.km_waitq, &wait);
-
-				nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
-
-				if (nx == -EAGAIN && signal_pending(current)) {
-					XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-					err = -ERESTART;
-					goto error;
-				}
-				if (nx == -EAGAIN ||
-				    genid != atomic_read(&flow_cache_genid)) {
-					xfrm_pols_put(pols, npols);
-					goto restart;
-				}
-				err = nx;
-			}
-			if (err < 0) {
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
-				goto error;
-			}
-		}
-		if (nx == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
-		dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig);
-		err = PTR_ERR(dst);
-		if (IS_ERR(dst)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
-			goto error;
-		}
-
-		for (pi = 0; pi < npols; pi++)
-			pol_dead |= pols[pi]->walk.dead;
-
-		write_lock_bh(&policy->lock);
-		if (unlikely(pol_dead || stale_bundle(dst))) {
-			/* Wow! While we worked on resolving, this
-			 * policy has gone. Retry. It is not paranoia,
-			 * we just cannot enlist new bundle to dead object.
-			 * We can't enlist stable bundles either.
-			 */
-			write_unlock_bh(&policy->lock);
-			dst_free(dst);
-
-			if (pol_dead)
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLDEAD);
-			else
-				XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			err = -EHOSTUNREACH;
-			goto error;
-		}
-
-		if (npols > 1)
-			err = xfrm_dst_update_parent(dst, &pols[1]->selector);
-		else
-			err = xfrm_dst_update_origin(dst, fl);
-		if (unlikely(err)) {
-			write_unlock_bh(&policy->lock);
-			dst_free(dst);
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
-			goto error;
-		}
-
-		dst->next = policy->bundles;
-		policy->bundles = dst;
-		dst_hold(dst);
-		write_unlock_bh(&policy->lock);
+	} else if (num_xfrms > 0) {
+		/* Flow transformed */
+		*dst_p = dst;
+		dst_release(dst_orig);
+	} else {
+		/* Flow passes untransformed */
+		dst_release(dst);
 	}
-	*dst_p = dst;
-	dst_release(dst_orig);
-	xfrm_pols_put(pols, npols);
+ok:
+	xfrm_pols_put(pols, drop_pols);
 	return 0;
 
+nopol:
+	if (!(flags & XFRM_LOOKUP_ICMP))
+		goto ok;
+	err = -ENOENT;
 error:
-	xfrm_pols_put(pols, npols);
+	dst_release(dst);
 dropdst:
 	dst_release(dst_orig);
 	*dst_p = NULL;
+	xfrm_pols_put(pols, drop_pols);
 	return err;
-
-nopol:
-	err = -ENOENT;
-	if (flags & XFRM_LOOKUP_ICMP)
-		goto dropdst;
-	return 0;
 }
 EXPORT_SYMBOL(__xfrm_lookup);
 
@@ -2161,71 +2251,24 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
 	return dst;
 }
 
-static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p)
-{
-	struct dst_entry *dst, **dstp;
-
-	write_lock(&pol->lock);
-	dstp = &pol->bundles;
-	while ((dst=*dstp) != NULL) {
-		if (func(dst)) {
-			*dstp = dst->next;
-			dst->next = *gc_list_p;
-			*gc_list_p = dst;
-		} else {
-			dstp = &dst->next;
-		}
-	}
-	write_unlock(&pol->lock);
-}
-
-static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *))
+static void __xfrm_garbage_collect(struct net *net)
 {
-	struct dst_entry *gc_list = NULL;
-	int dir;
+	struct dst_entry *head, *next;
 
-	read_lock_bh(&xfrm_policy_lock);
-	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
-		struct xfrm_policy *pol;
-		struct hlist_node *entry;
-		struct hlist_head *table;
-		int i;
+	flow_cache_flush();
 
-		hlist_for_each_entry(pol, entry,
-				     &net->xfrm.policy_inexact[dir], bydst)
-			prune_one_bundle(pol, func, &gc_list);
+	spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+	head = xfrm_policy_sk_bundles;
+	xfrm_policy_sk_bundles = NULL;
+	spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
 
-		table = net->xfrm.policy_bydst[dir].table;
-		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
-			hlist_for_each_entry(pol, entry, table + i, bydst)
-				prune_one_bundle(pol, func, &gc_list);
-		}
-	}
-	read_unlock_bh(&xfrm_policy_lock);
-
-	while (gc_list) {
-		struct dst_entry *dst = gc_list;
-		gc_list = dst->next;
-		dst_free(dst);
+	while (head) {
+		next = head->next;
+		dst_free(head);
+		head = next;
 	}
 }
 
-static int unused_bundle(struct dst_entry *dst)
-{
-	return !atomic_read(&dst->__refcnt);
-}
-
-static void __xfrm_garbage_collect(struct net *net)
-{
-	xfrm_prune_bundles(net, unused_bundle);
-}
-
-static int xfrm_flush_bundles(struct net *net)
-{
-	xfrm_prune_bundles(net, stale_bundle);
-	return 0;
-}
-
 static void xfrm_init_pmtu(struct dst_entry *dst)
 {
 	do {
@@ -2283,7 +2326,9 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 			return 0;
 		if (dst->xfrm->km.state != XFRM_STATE_VALID)
 			return 0;
-		if (xdst->genid != dst->xfrm->genid)
+		if (xdst->xfrm_genid != dst->xfrm->genid)
+			return 0;
+		if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
 			return 0;
 
 		if (strict && fl &&
@@ -2448,7 +2493,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
 
 	switch (event) {
 	case NETDEV_DOWN:
-		xfrm_flush_bundles(dev_net(dev));
+		__xfrm_garbage_collect(dev_net(dev));
 	}
 	return NOTIFY_DONE;
 }
@@ -2780,7 +2825,6 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       struct xfrm_migrate *m, int num_migrate)
 {
 	struct xfrm_migrate *mp;
-	struct dst_entry *dst;
 	int i, j, n = 0;
 
 	write_lock_bh(&pol->lock);
@@ -2805,10 +2849,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       sizeof(pol->xfrm_vec[i].saddr));
 			pol->xfrm_vec[i].encap_family = mp->new_family;
 			/* flush bundles */
-			while ((dst = pol->bundles) != NULL) {
-				pol->bundles = dst->next;
-				dst_free(dst);
-			}
+			atomic_inc(&pol->genid);
 		}
 	}
 
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 3/4] xfrm: remove policy garbage collection
From: Timo Teras @ 2010-04-07 10:30 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270636207-20933-1-git-send-email-timo.teras@iki.fi>

Policies are now properly reference counted and destroyed from
all code paths. The delayed gc is just an overhead now and can
be removed.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 net/xfrm/xfrm_policy.c |   39 +++++----------------------------------
 1 files changed, 5 insertions(+), 34 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 06ccc71..7430ac2 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -46,9 +46,6 @@ static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
 
-static HLIST_HEAD(xfrm_policy_gc_list);
-static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
-
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
 static void xfrm_init_pmtu(struct dst_entry *dst);
@@ -288,32 +285,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
 }
 EXPORT_SYMBOL(xfrm_policy_destroy);
 
-static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
-{
-	atomic_inc(&policy->genid);
-
-	if (del_timer(&policy->timer))
-		atomic_dec(&policy->refcnt);
-
-	xfrm_pol_put(policy);
-}
-
-static void xfrm_policy_gc_task(struct work_struct *work)
-{
-	struct xfrm_policy *policy;
-	struct hlist_node *entry, *tmp;
-	struct hlist_head gc_list;
-
-	spin_lock_bh(&xfrm_policy_gc_lock);
-	gc_list.first = xfrm_policy_gc_list.first;
-	INIT_HLIST_HEAD(&xfrm_policy_gc_list);
-	spin_unlock_bh(&xfrm_policy_gc_lock);
-
-	hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst)
-		xfrm_policy_gc_kill(policy);
-}
-static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task);
-
 /* Rule must be locked. Release descentant resources, announce
  * entry dead. The rule must be unlinked from lists to the moment.
  */
@@ -322,11 +293,12 @@ static void xfrm_policy_kill(struct xfrm_policy *policy)
 {
 	policy->walk.dead = 1;
 
-	spin_lock_bh(&xfrm_policy_gc_lock);
-	hlist_add_head(&policy->bydst, &xfrm_policy_gc_list);
-	spin_unlock_bh(&xfrm_policy_gc_lock);
+	atomic_inc(&policy->genid);
 
-	schedule_work(&xfrm_policy_gc_work);
+	if (del_timer(&policy->timer))
+		xfrm_pol_put(policy);
+
+	xfrm_pol_put(policy);
 }
 
 static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
@@ -2599,7 +2571,6 @@ static void xfrm_policy_fini(struct net *net)
 	audit_info.sessionid = -1;
 	audit_info.secid = 0;
 	xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info);
-	flush_work(&xfrm_policy_gc_work);
 
 	WARN_ON(!list_empty(&net->xfrm.policy_all));
 
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH 4/4] flow: delayed deletion of flow cache entries
From: Timo Teras @ 2010-04-07 10:30 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Timo Teras
In-Reply-To: <1270636207-20933-1-git-send-email-timo.teras@iki.fi>

Speed up lookups by freeing flow cache entries later. After
virtualizing flow cache entry operations, the flow cache may now
end up calling policy or bundle destructor which can be slowish.

As gc_list is more effective with a doubly linked list, the flow cache
is converted to use the common hlist and list macros where appropriate.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 net/core/flow.c |  100 ++++++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/net/core/flow.c b/net/core/flow.c
index 521df52..1619006 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,7 +26,10 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry		*next;
+	union {
+		struct hlist_node	hlist;
+		struct list_head	gc_list;
+	} u;
 	u16				family;
 	u8				dir;
 	u32				genid;
@@ -35,7 +38,7 @@ struct flow_cache_entry {
 };
 
 struct flow_cache_percpu {
-	struct flow_cache_entry		**hash_table;
+	struct hlist_head		*hash_table;
 	int				hash_count;
 	u32				hash_rnd;
 	int				hash_rnd_recalc;
@@ -62,6 +65,9 @@ atomic_t flow_cache_genid = ATOMIC_INIT(0);
 static struct flow_cache flow_cache_global;
 static struct kmem_cache *flow_cachep;
 
+static DEFINE_SPINLOCK(flow_cache_gc_lock);
+static LIST_HEAD(flow_cache_gc_list);
+
 #define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
 #define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
@@ -86,38 +92,66 @@ static int flow_entry_valid(struct flow_cache_entry *fle)
 	return 1;
 }
 
-static void flow_entry_kill(struct flow_cache *fc,
-			    struct flow_cache_percpu *fcp,
-			    struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle)
 {
 	if (fle->object)
 		fle->object->ops->delete(fle->object);
 	kmem_cache_free(flow_cachep, fle);
-	fcp->hash_count--;
+}
+
+static void flow_cache_gc_task(struct work_struct *work)
+{
+	struct list_head gc_list;
+	struct flow_cache_entry *fce, *n;
+
+	INIT_LIST_HEAD(&gc_list);
+	spin_lock_bh(&flow_cache_gc_lock);
+	list_splice_tail_init(&flow_cache_gc_list, &gc_list);
+	spin_unlock_bh(&flow_cache_gc_lock);
+
+	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+		flow_entry_kill(fce);
+}
+static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
+
+static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
+				     int deleted, struct list_head *gc_list)
+{
+	if (deleted) {
+		fcp->hash_count -= deleted;
+		spin_lock_bh(&flow_cache_gc_lock);
+		list_splice_tail(gc_list, &flow_cache_gc_list);
+		spin_unlock_bh(&flow_cache_gc_lock);
+		schedule_work(&flow_cache_gc_work);
+	}
 }
 
 static void __flow_cache_shrink(struct flow_cache *fc,
 				struct flow_cache_percpu *fcp,
 				int shrink_to)
 {
-	struct flow_cache_entry *fle, **flp;
-	int i;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		int saved = 0;
 
-		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL) {
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
 			if (saved < shrink_to &&
 			    flow_entry_valid(fle)) {
 				saved++;
-				flp = &fle->next;
 			} else {
-				*flp = fle->next;
-				flow_entry_kill(fc, fcp, fle);
+				deleted++;
+				hlist_del(&fle->u.hlist);
+				list_add_tail(&fle->u.gc_list, &gc_list);
 			}
 		}
 	}
+
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
 }
 
 static void flow_cache_shrink(struct flow_cache *fc,
@@ -182,7 +216,8 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
-	struct flow_cache_entry *fle, **head;
+	struct flow_cache_entry *fle, *tfle;
+	struct hlist_node *entry;
 	struct flow_cache_object *flo;
 	unsigned int hash;
 
@@ -200,12 +235,13 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 		flow_new_hash_rnd(fc, fcp);
 
 	hash = flow_hash_code(fc, fcp, key);
-	head = &fcp->hash_table[hash];
-	for (fle = *head; fle; fle = fle->next) {
-		if (fle->family == family &&
-		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0)
+	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
+		if (tfle->family == family &&
+		    tfle->dir == dir &&
+		    flow_key_compare(key, &tfle->key) == 0) {
+			fle = tfle;
 			break;
+		}
 	}
 
 	if (unlikely(!fle)) {
@@ -214,12 +250,11 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 
 		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
 		if (fle) {
-			fle->next = *head;
-			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->object = NULL;
+			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
 			fcp->hash_count++;
 		}
 	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
@@ -262,23 +297,26 @@ static void flow_cache_flush_tasklet(unsigned long data)
 	struct flow_flush_info *info = (void *)data;
 	struct flow_cache *fc = info->cache;
 	struct flow_cache_percpu *fcp;
-	int i;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
 
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		struct flow_cache_entry *fle;
-
-		fle = fcp->hash_table[i];
-		for (; fle; fle = fle->next) {
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
 			if (flow_entry_valid(fle))
 				continue;
 
-			if (fle->object)
-				fle->object->ops->delete(fle->object);
-			fle->object = NULL;
+			deleted++;
+			hlist_del(&fle->u.hlist);
+			list_add_tail(&fle->u.gc_list, &gc_list);
 		}
 	}
 
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
 }
@@ -320,7 +358,7 @@ void flow_cache_flush(void)
 static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
 					  struct flow_cache_percpu *fcp)
 {
-	fcp->hash_table = (struct flow_cache_entry **)
+	fcp->hash_table = (struct hlist_head *)
 		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
 	if (!fcp->hash_table)
 		panic("NET: failed to allocate flow cache order %lu\n", fc->order);
@@ -354,7 +392,7 @@ static int flow_cache_init(struct flow_cache *fc)
 
 	for (order = 0;
 	     (PAGE_SIZE << order) <
-		     (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc));
+		     (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
 	     order++)
 		/* NOTHING */;
 	fc->order = order;
-- 
1.6.3.3


^ permalink raw reply related

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Pekka Enberg @ 2010-04-07 10:47 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: Eric Dumazet, Christoph Lameter, netdev, Tejun Heo, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton, mingo
In-Reply-To: <1270607668.2078.259.camel@ymzhang.sh.intel.com>

Zhang, Yanmin kirjoitti:
> Kernel 2.6.34-rc3:
> # Samples: 13079611308 LLC-load-misses
> #
> # Overhead          Command                                                         Shared Object  Symbol
> # ........  ...............  ....................................................................  ......
> #
>     18.55%        hackbench  [kernel.kallsyms]                                                     [k] copy_user_generic_str
> ing
>     13.19%        hackbench  [kernel.kallsyms]                                                     [k] unix_stream_recvmsg
>     11.62%        hackbench  [kernel.kallsyms]                                                     [k] kfree
>      8.54%        hackbench  [kernel.kallsyms]                                                     [k] kmem_cache_free
>      7.88%        hackbench  [kernel.kallsyms]                                                     [k] __kmalloc_node_track_
> caller
>      6.54%        hackbench  [kernel.kallsyms]                                                     [k] kmem_cache_alloc_node
>      5.94%        hackbench  [kernel.kallsyms]                                                     [k] kfree_skb
>      3.48%        hackbench  [kernel.kallsyms]                                                     [k] __slab_free
>      2.15%        hackbench  [kernel.kallsyms]                                                     [k] _raw_spin_lock
>      1.83%        hackbench  [kernel.kallsyms]                                                     [k] schedule
>      1.82%        hackbench  [kernel.kallsyms]                                                     [k] get_partial_node
>      1.59%        hackbench  hackbench                                                             [.] receiver
>      1.37%        hackbench  libpthread-2.9.so                                                     [.] __read

Btw, you might want to try out "perf record -g" and "perf report 
--callchain fractal,5" to get a better view of where we're spending 
time. Perhaps you can spot the difference with that more easily.

^ permalink raw reply

* Re: [PATCH 0/4] caching bundles, iteration 5
From: David Miller @ 2010-04-07 10:52 UTC (permalink / raw)
  To: timo.teras; +Cc: netdev, herbert
In-Reply-To: <1270636207-20933-1-git-send-email-timo.teras@iki.fi>

From: Timo Teras <timo.teras@iki.fi>
Date: Wed,  7 Apr 2010 13:30:03 +0300

> Changes:
> - ops->delete() is now called if flow_cache_genid was changed to
>   ensure that resolver does not use stale data
> - removed bumping of policy->genid when inserting new policy since
>   flow_cache_genid ensures everything is regenerated (thanks Herbert!)
> - added unlikely/likely to flow_cache_lookup to favor fast path
>   (cache hit)
> - added herbert's ack to 1/4 
> 
> Compiles, boots and VPN goes up on my test box. Earlier iterations
> tested to stay up 3+ days without noticing leaks or other problems.

Applied to net-next-2.6 and going through some build tests before
I push it out, thanks!

^ permalink raw reply

* [PATCH 0/1]qlcnic: fix set mac addr
From: Amit Kumar Salecha @ 2010-04-07 11:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, ameen.rahman

Hi
  Sending 1 important patch to fix the bonding environment.
  Mac addresses are not correctly communicated/set to fw, hence fw drops
  packets as the mac address doesn't match.

  This fix needs to be included in the net-2.6 branch.

-Amit

^ permalink raw reply

* [PATCH 1/1] qlcnic: fix set mac addr
From: Amit Kumar Salecha @ 2010-04-07 11:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, ameen.rahman
In-Reply-To: <1270638114-15323-1-git-send-email-amit.salecha@qlogic.com>

If the interface is down, mac address requests are not sent to fw
but the addresses are still added to the driver mac list.
The driver mac list should be in sync with fw, i.e. the addresses
communicated to fw.

Signed-off-by: Amit Kumar Salecha <amit.salecha@qlogic.com>
---
 drivers/net/qlcnic/qlcnic_hw.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic_hw.c b/drivers/net/qlcnic/qlcnic_hw.c
index da00e16..b175313 100644
--- a/drivers/net/qlcnic/qlcnic_hw.c
+++ b/drivers/net/qlcnic/qlcnic_hw.c
@@ -430,6 +430,9 @@ void qlcnic_set_multi(struct net_device *netdev)
 	u8 bcast_addr[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 	u32 mode = VPORT_MISS_MODE_DROP;
 
+	if (adapter->is_up != QLCNIC_ADAPTER_UP_MAGIC)
+		return;
+
 	qlcnic_nic_add_mac(adapter, adapter->mac_addr);
 	qlcnic_nic_add_mac(adapter, bcast_addr);
 
-- 
1.6.0.2


^ permalink raw reply related

* Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
From: Michael S. Tsirkin @ 2010-04-07 10:59 UTC (permalink / raw)
  To: David L Stevens; +Cc: kvm, netdev, rusty, virtualization
In-Reply-To: <1270585973.28407.3.camel@lab1.dls>

On Tue, Apr 06, 2010 at 01:32:53PM -0700, David L Stevens wrote:
> 
> This patch adds support for the Mergeable Receive Buffers feature to
> vhost_net.
> 
> 						+-DLS
> 
> Changes from previous revision:
> 1) renamed:
> 	vhost_discard_vq_desc -> vhost_discard_desc
> 	vhost_get_heads -> vhost_get_desc_n
> 	vhost_get_vq_desc -> vhost_get_desc
> 2) added heads as argument to vhost_get_desc_n
> 3) changed "vq->heads" from iovec to vring_used_elem, removed casts
> 4) changed vhost_add_used to do multiple elements in a single
> copy_to_user,
> 	or two when we wrap the ring.
> 5) removed rxmaxheadcount and available buffer checks in favor of
> running until
> 	an allocation failure, but making sure we break the loop if we get
> 	two in a row, indicating we have at least 1 buffer, but not enough
> 	for the current receive packet
> 6) restore non-vnet header handling
> 
> Signed-Off-By: David L Stevens <dlstevens@us.ibm.com>

Thanks!
There's some whitespace damage, are you sending with your new
sendmail setup? It seems to have worked for qemu patches ...

> diff -ruNp net-next-p0/drivers/vhost/net.c
> net-next-v3/drivers/vhost/net.c
> --- net-next-p0/drivers/vhost/net.c	2010-03-22 12:04:38.000000000 -0700
> +++ net-next-v3/drivers/vhost/net.c	2010-04-06 12:54:56.000000000 -0700
> @@ -130,9 +130,8 @@ static void handle_tx(struct vhost_net *
>  	hdr_size = vq->hdr_size;
>  
>  	for (;;) {
> -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -					 ARRAY_SIZE(vq->iov),
> -					 &out, &in,
> +		head = vhost_get_desc(&net->dev, vq, vq->iov,
> +					 ARRAY_SIZE(vq->iov), &out, &in,
>  					 NULL, NULL);
>  		/* Nothing new?  Wait for eventfd to tell us they refilled. */
>  		if (head == vq->num) {
> @@ -167,8 +166,15 @@ static void handle_tx(struct vhost_net *
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>  		err = sock->ops->sendmsg(NULL, sock, &msg, len);
>  		if (unlikely(err < 0)) {
> -			vhost_discard_vq_desc(vq);
> -			tx_poll_start(net, sock);
> +			if (err == -EAGAIN) {
> +				vhost_discard_desc(vq, 1);
> +				tx_poll_start(net, sock);
> +			} else {
> +				vq_err(vq, "sendmsg: errno %d\n", -err);
> +				/* drop packet; do not discard/resend */
> +				vhost_add_used_and_signal(&net->dev, vq, head,
> +							  0);

vhost does not currently have a consistent error handling strategy:
if we drop packets, need to think which other errors should cause
packet drops.  I prefer to just call vq_err for now,
and have us look at handling segfaults etc in a consistent way
separately.

> +			}
>  			break;
>  		}
>  		if (err != len)
> @@ -186,12 +192,25 @@ static void handle_tx(struct vhost_net *
>  	unuse_mm(net->dev.mm);
>  }
>  
> +static int vhost_head_len(struct sock *sk)
> +{
> +	struct sk_buff *head;
> +	int len = 0;
> +
> +	lock_sock(sk);
> +	head = skb_peek(&sk->sk_receive_queue);
> +	if (head)
> +		len = head->len;
> +	release_sock(sk);
> +	return len;
> +}
> +

I wonder whether it makes sense to check
skb_queue_empty(&sk->sk_receive_queue)
outside the lock, to reduce the cost of this call
on an empty queue (we know that it happens at least once
each time we exit the loop on rx)?

>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
>  static void handle_rx(struct vhost_net *net)
>  {
>  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> -	unsigned head, out, in, log, s;
> +	unsigned in, log, s;
>  	struct vhost_log *vq_log;
>  	struct msghdr msg = {
>  		.msg_name = NULL,
> @@ -202,13 +221,14 @@ static void handle_rx(struct vhost_net *
>  		.msg_flags = MSG_DONTWAIT,
>  	};
>  
> -	struct virtio_net_hdr hdr = {
> -		.flags = 0,
> -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
> +	struct virtio_net_hdr_mrg_rxbuf hdr = {
> +		.hdr.flags = 0,
> +		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
>  	};
>  
> +	int retries = 0;
>  	size_t len, total_len = 0;
> -	int err;
> +	int err, headcount, datalen;
>  	size_t hdr_size;
>  	struct socket *sock = rcu_dereference(vq->private_data);
>  	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> @@ -222,31 +242,25 @@ static void handle_rx(struct vhost_net *
>  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  		vq->log : NULL;
>  
> -	for (;;) {
> -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -					 ARRAY_SIZE(vq->iov),
> -					 &out, &in,
> -					 vq_log, &log);
> +	while ((datalen = vhost_head_len(sock->sk))) {
> +		headcount = vhost_get_desc_n(vq, vq->heads, datalen, &in,
> +					     vq_log, &log);

This looks like a bug, I think we need to pass
datalen + header size to vhost_get_desc_n.
Not sure how we know the header size that backend will use though.
Maybe just look at our features.

>  		/* OK, now we need to know about added descriptors. */
> -		if (head == vq->num) {
> -			if (unlikely(vhost_enable_notify(vq))) {
> +		if (!headcount) {
> +			if (retries == 0 && unlikely(vhost_enable_notify(vq))) {
>  				/* They have slipped one in as we were
>  				 * doing that: check again. */
>  				vhost_disable_notify(vq);
> +				retries++;
>  				continue;
>  			}

Hmm. The reason we have the code at all, as the comment says, is because
guest could have added more buffers between the time we read last index
and the time we enabled notification. So if we just break like this
the race still exists. We could remember the
last head value we observed, and have vhost_enable_notify check
against this value?

Need to think about it.

Another concern here is that on retries vhost_get_desc_n
is doing extra work, rescanning the same descriptor
again and again. Not sure how common this is, might be
worthwhile to add a TODO to consider this at least.

> +			retries = 0;
>  			/* Nothing new?  Wait for eventfd to tell us
>  			 * they refilled. */
>  			break;
>  		}
>  		/* We don't need to be notified again. */
> -		if (out) {
> -			vq_err(vq, "Unexpected descriptor format for RX: "
> -			       "out %d, int %d\n",
> -			       out, in);
> -			break;
> -		}
> -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
> +		/* Skip header. TODO: support TSO. */
>  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
>  		msg.msg_iovlen = in;
>  		len = iov_length(vq->iov, in);
> @@ -261,14 +275,33 @@ static void handle_rx(struct vhost_net *
>  					 len, MSG_DONTWAIT | MSG_TRUNC);
>  		/* TODO: Check specific error and bomb out unless EAGAIN? */
>  		if (err < 0) {

I think we need to compare err and datalen and drop packet on mismatch as well.
The check err > len won't be needed then.

> -			vhost_discard_vq_desc(vq);
> +			vhost_discard_desc(vq, headcount);
>  			break;
>  		}
>  		/* TODO: Should check and handle checksum. */
> +		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
> +			struct virtio_net_hdr_mrg_rxbuf *vhdr =
> +				(struct virtio_net_hdr_mrg_rxbuf *)
> +				vq->iov[0].iov_base;
> +			/* add num_buffers */
> +			if (vhost_has_feature(&net->dev,
> +					      VHOST_NET_F_VIRTIO_NET_HDR))
> +				hdr.num_buffers = headcount;

Why is the above necessary?

> +			else if (vq->iov[0].iov_len < sizeof(*vhdr)) {

I think length check is already done when we copy the header. No?

> +				vq_err(vq, "tiny buffers < %d unsupported",
> +					vq->iov[0].iov_len);
> +				vhost_discard_desc(vq, headcount);
> +				break;

Problem here is that recvmsg might modify iov.
This is why I think we need to use vq->hdr here.

> +			} else if (put_user(headcount, &vhdr->num_buffers)) {

The above put_user writes out a 32 bit value, right?
This seems wrong.

How about using
	memcpy_toiovecend(vq->hdr, &headcount,
			  offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers),
			  sizeof headcount);

this way we also do not make any assumptions about layout.

> +				vq_err(vq, "Failed num_buffers write");
> +				vhost_discard_desc(vq, headcount);
> +				break;
> +			}
> +		}
>  		if (err > len) {
>  			pr_err("Discarded truncated rx packet: "
>  			       " len %d > %zd\n", err, len);
> -			vhost_discard_vq_desc(vq);
> +			vhost_discard_desc(vq, headcount);
>  			continue;
>  		}
>  		len = err;
> @@ -279,7 +312,7 @@ static void handle_rx(struct vhost_net *
>  			break;
>  		}
>  		len += hdr_size;
> -		vhost_add_used_and_signal(&net->dev, vq, head, len);
> +		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, headcount);
>  		if (unlikely(vq_log))
>  			vhost_log_write(vq, vq_log, log, len);
>  		total_len += len;
> @@ -560,9 +593,14 @@ done:
>  
>  static int vhost_net_set_features(struct vhost_net *n, u64 features)
>  {
> -	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> -		sizeof(struct virtio_net_hdr) : 0;
> +	size_t hdr_size = 0;
>  	int i;
> +
> +	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> +		hdr_size = sizeof(struct virtio_net_hdr);
> +		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> +			hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);

My personal style for this would be:
  	if (!(features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)))
		hdr_size = 0
	else if (!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
		hdr_size = sizeof(virtio_net_hdr);
	else
		hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);

which results in more symmetry and less nesting.

>  	mutex_lock(&n->dev.mutex);
>  	if ((features & (1 << VHOST_F_LOG_ALL)) &&
>  	    !vhost_log_access_ok(&n->dev)) {
> diff -ruNp net-next-p0/drivers/vhost/vhost.c
> net-next-v3/drivers/vhost/vhost.c
> --- net-next-p0/drivers/vhost/vhost.c	2010-03-22 12:04:38.000000000
> -0700
> +++ net-next-v3/drivers/vhost/vhost.c	2010-04-06 12:57:51.000000000
> -0700
> @@ -856,6 +856,47 @@ static unsigned get_indirect(struct vhos
>  	return 0;
>  }
>  
> +/* This is a multi-buffer version of vhost_get_vq_desc
> + * @vq		- the relevant virtqueue
> + * datalen	- data length we'll be reading
> + * @iovcount	- returned count of io vectors we fill
> + * @log		- vhost log
> + * @log_num	- log offset
> + *	returns number of buffer heads allocated, 0 on error

This is unusual. Let's return a negative error code on error.

> + */
> +int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem
> *heads,
> +		     int datalen, int *iovcount, struct vhost_log *log,
> +		     unsigned int *log_num)
> +{
> +	int out, in;
> +	int seg = 0;		/* iov index */
> +	int hc = 0;		/* head count */
> +
> +	while (datalen > 0) {
> +		if (hc >= VHOST_NET_MAX_SG)
> +			goto err;
> +		heads[hc].id = vhost_get_desc(vq->dev, vq, vq->iov+seg,
> +					      ARRAY_SIZE(vq->iov)-seg, &out,
> +					      &in, log, log_num);
> +		if (heads[hc].id == vq->num)
> +			goto err;
> +		if (out || in <= 0) {
> +			vq_err(vq, "unexpected descriptor format for RX: "
> +				"out %d, in %d\n", out, in);
> +			goto err;
> +		}
> +		heads[hc].len = iov_length(vq->iov+seg, in);
> +		datalen -= heads[hc].len;

This signed/unsigned mix makes me nervous.
Let's make datalen unsigned, add unsigned total_len, and
while (datalen < total_len).

> +		hc++;
> +		seg += in;
> +	}
> +	*iovcount = seg;
> +	return hc;
> +err:
> +	vhost_discard_desc(vq, hc);
> +	return 0;
> +}
> +
>  /* This looks in the virtqueue and for the first available buffer, and
> converts
>   * it to an iovec for convenient access.  Since descriptors consist of
> some
>   * number of output then some number of input descriptors, it's
> actually two
> @@ -863,7 +904,7 @@ static unsigned get_indirect(struct vhos
>   *
>   * This function returns the descriptor number found, or vq->num (which
>   * is never a valid descriptor number) if none was found. */
> -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct
> vhost_virtqueue *vq,
> +unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue
> *vq,
>  			   struct iovec iov[], unsigned int iov_size,
>  			   unsigned int *out_num, unsigned int *in_num,
>  			   struct vhost_log *log, unsigned int *log_num)
> @@ -981,31 +1022,42 @@ unsigned vhost_get_vq_desc(struct vhost_
>  }
>  
>  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling.
> */
> -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> +void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
>  {
> -	vq->last_avail_idx--;
> +	vq->last_avail_idx -= n;
>  }
>  
>  /* After we've used one of their buffers, we tell them about it.  We'll
> then
>   * want to notify the guest, using eventfd. */
> -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int
> len)
> +int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem
> *heads,
> +		   int count)

I think we are better off with vhost_add_used and vhost_add_used_n:
the version with _n has a lot of extra complexity, and tx always
adds them 1 by one.

>  {
>  	struct vring_used_elem *used;
> +	int start, n;
> +
> +	if (count <= 0)
> +		return -EINVAL;
>  
> -	/* The virtqueue contains a ring of used buffers.  Get a pointer to
> the
> -	 * next entry in that used ring. */
> -	used = &vq->used->ring[vq->last_used_idx % vq->num];
> -	if (put_user(head, &used->id)) {
> -		vq_err(vq, "Failed to write used id");
> +	start = vq->last_used_idx % vq->num;
> +	if (vq->num - start < count)
> +		n = vq->num - start;
> +	else
> +		n = count;

use min?

> +	used = vq->used->ring + start;
> +	if (copy_to_user(used, heads, sizeof(heads[0])*n)) {
> +		vq_err(vq, "Failed to write used");
>  		return -EFAULT;
>  	}
> -	if (put_user(len, &used->len)) {
> -		vq_err(vq, "Failed to write used len");
> -		return -EFAULT;
> +	if (n < count) {	/* wrapped the ring */
> +		used = vq->used->ring;
> +		if (copy_to_user(used, heads+n, sizeof(heads[0])*(count-n))) {
> +			vq_err(vq, "Failed to write used");
> +			return -EFAULT;
> +		}
>  	}
>  	/* Make sure buffer is written before we update index. */
>  	smp_wmb();
> -	if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
> +	if (put_user(vq->last_used_idx+count, &vq->used->idx)) {

I am a bit confused ... will this write a 32 or 16 bit value?
count is 32 bit ... Maybe we are better off with
  u16 idx = vq->last_used_idx + count
  put_user(idx, &vq->used->idx)
  vq->last_used_idx = idx

>  		vq_err(vq, "Failed to increment used idx");
>  		return -EFAULT;
>  	}
> @@ -1023,7 +1075,7 @@ int vhost_add_used(struct vhost_virtqueu
>  		if (vq->log_ctx)
>  			eventfd_signal(vq->log_ctx, 1);
>  	}
> -	vq->last_used_idx++;
> +	vq->last_used_idx += count;
>  	return 0;
>  }
>  
> @@ -1049,10 +1101,23 @@ void vhost_signal(struct vhost_dev *dev,
>  
>  /* And here's the combo meal deal.  Supersize me! */
>  void vhost_add_used_and_signal(struct vhost_dev *dev,
> -			       struct vhost_virtqueue *vq,
> -			       unsigned int head, int len)
> +			       struct vhost_virtqueue *vq, unsigned int id,
> +			       int len)
> +{
> +	struct vring_used_elem head;
> +
> +	head.id = id;
> +	head.len = len;
> +	vhost_add_used(vq, &head, 1);
> +	vhost_signal(dev, vq);
> +}
> +
> +/* multi-buffer version of vhost_add_used_and_signal */
> +void vhost_add_used_and_signal_n(struct vhost_dev *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct vring_used_elem *heads, int count)
>  {
> -	vhost_add_used(vq, head, len);
> +	vhost_add_used(vq, heads, count);
>  	vhost_signal(dev, vq);
>  }
>  
> diff -ruNp net-next-p0/drivers/vhost/vhost.h
> net-next-v3/drivers/vhost/vhost.h
> --- net-next-p0/drivers/vhost/vhost.h	2010-03-22 12:04:38.000000000
> -0700
> +++ net-next-v3/drivers/vhost/vhost.h	2010-04-05 20:33:57.000000000
> -0700
> @@ -85,6 +85,7 @@ struct vhost_virtqueue {
>  	struct iovec iov[VHOST_NET_MAX_SG];
>  	struct iovec hdr[VHOST_NET_MAX_SG];
>  	size_t hdr_size;
> +	struct vring_used_elem heads[VHOST_NET_MAX_SG];
>  	/* We use a kind of RCU to access private pointer.
>  	 * All readers access it from workqueue, which makes it possible to
>  	 * flush the workqueue instead of synchronize_rcu. Therefore readers
> do
> @@ -120,16 +121,22 @@ long vhost_dev_ioctl(struct vhost_dev *,
>  int vhost_vq_access_ok(struct vhost_virtqueue *vq);
>  int vhost_log_access_ok(struct vhost_dev *);
>  
> -unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue
> *,
> +int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem
> *heads,
> +		     int datalen, int *iovcount, struct vhost_log *log,
> +		     unsigned int *log_num);
> +unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
>  			   struct iovec iov[], unsigned int iov_count,
>  			   unsigned int *out_num, unsigned int *in_num,
>  			   struct vhost_log *log, unsigned int *log_num);
> -void vhost_discard_vq_desc(struct vhost_virtqueue *);
> +void vhost_discard_desc(struct vhost_virtqueue *, int);
>  
> -int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int
> len);
> -void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> +int vhost_add_used(struct vhost_virtqueue *, struct vring_used_elem
> *heads,
> +		    int count);
>  void vhost_add_used_and_signal(struct vhost_dev *, struct
> vhost_virtqueue *,
> -			       unsigned int head, int len);
> +			       unsigned int id, int len);
> +void vhost_add_used_and_signal_n(struct vhost_dev *, struct
> vhost_virtqueue *,
> +			       struct vring_used_elem *heads, int count);
> +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
>  void vhost_disable_notify(struct vhost_virtqueue *);
>  bool vhost_enable_notify(struct vhost_virtqueue *);
>  
> @@ -149,7 +156,8 @@ enum {
>  	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
>  			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
>  			 (1 << VHOST_F_LOG_ALL) |
> -			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
> +			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
> +			 (1 << VIRTIO_NET_F_MRG_RXBUF),
>  };
>  
>  static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
> 

^ permalink raw reply

* [PATCH v2] can: Add esd board support to plx_pci CAN driver
From: Matthias Fuchs @ 2010-04-07 11:09 UTC (permalink / raw)
  To: netdev; +Cc: Socketcan-core

This patch adds support for SJA1000 based PCI CAN interface cards
from electronic system design gmbh.

Some changes have been done on the common code:
 - esd boards must not have the 2nd local interrupt enabled (PLX9030/9050)
 - a new path for PLX9056/PEX8311 chips has been added
 - new plx9056 reset function has been implemented
 - struct plx_card_info got a reset function entry

In detail the following additional boards are now supported:

        CAN-PCI/200 (PCI)
        CAN-PCI/266 (PCI)
        CAN-PMC266 (PMC module)
        CAN-PCIe/2000 (PCI Express)
        CAN-CPCI/200 (Compact PCI, 3U)
        CAN-PCI104 (PCI104)

Signed-off-by: Matthias Fuchs <matthias.fuchs@esd.eu>
---
v2:
 - update Kconfig
 - add proper plx9056 reset function
 - add reset function pointer to plx_card_info structure
 - use card's reset function in plx_pci_del_card()

 drivers/net/can/sja1000/Kconfig   |    4 +-
 drivers/net/can/sja1000/plx_pci.c |  153 ++++++++++++++++++++++++++++++++++---
 2 files changed, 145 insertions(+), 12 deletions(-)

diff --git a/drivers/net/can/sja1000/Kconfig b/drivers/net/can/sja1000/Kconfig
index c01eb4e..b49f609 100644
--- a/drivers/net/can/sja1000/Kconfig
+++ b/drivers/net/can/sja1000/Kconfig
@@ -62,7 +62,9 @@ config CAN_PLX_PCI
 	  Driver supports now:
 	   - Adlink PCI-7841/cPCI-7841 card (http://www.adlinktech.com/)
 	   - Adlink PCI-7841/cPCI-7841 SE card
+	   - esd CAN-PCI/CPCI/PCI104/200 (http://www.esd.eu/)
+	   - esd CAN-PCI/PMC/266
+	   - esd CAN-PCIe/2000
 	   - Marathon CAN-bus-PCI card (http://www.marathon.ru/)
 	   - TEWS TECHNOLOGIES TPMC810 card (http://www.tews.com/)
-
 endif
diff --git a/drivers/net/can/sja1000/plx_pci.c b/drivers/net/can/sja1000/plx_pci.c
index 6b46a63..8c8ed15 100644
--- a/drivers/net/can/sja1000/plx_pci.c
+++ b/drivers/net/can/sja1000/plx_pci.c
@@ -40,7 +40,10 @@ MODULE_DESCRIPTION("Socket-CAN driver for PLX90xx PCI-bridge cards with "
 MODULE_SUPPORTED_DEVICE("Adlink PCI-7841/cPCI-7841, "
 			"Adlink PCI-7841/cPCI-7841 SE, "
 			"Marathon CAN-bus-PCI, "
-			"TEWS TECHNOLOGIES TPMC810");
+			"TEWS TECHNOLOGIES TPMC810, "
+			"esd CAN-PCI/CPCI/PCI104/200, "
+			"esd CAN-PCI/PMC/266, "
+			"esd CAN-PCIe/2000")
 MODULE_LICENSE("GPL v2");
 
 #define PLX_PCI_MAX_CHAN 2
@@ -49,11 +52,14 @@ struct plx_pci_card {
 	int channels;			/* detected channels count */
 	struct net_device *net_dev[PLX_PCI_MAX_CHAN];
 	void __iomem *conf_addr;
+
+	/* Pointer to device-dependent reset function */
+	void (*reset_func)(struct pci_dev *pdev);
 };
 
 #define PLX_PCI_CAN_CLOCK (16000000 / 2)
 
-/* PLX90xx registers */
+/* PLX9030/9050/9052 registers */
 #define PLX_INTCSR	0x4c		/* Interrupt Control/Status */
 #define PLX_CNTRL	0x50		/* User I/O, Direct Slave Response,
 					 * Serial EEPROM, and Initialization
@@ -65,6 +71,14 @@ struct plx_pci_card {
 #define PLX_PCI_INT_EN	(1 << 6)	/* PCI Interrupt Enable */
 #define PLX_PCI_RESET	(1 << 30)	/* PCI Adapter Software Reset */
 
+/* PLX9056 registers */
+#define PLX9056_INTCSR	0x68		/* Interrupt Control/Status */
+#define PLX9056_CNTRL	0x6c		/* Control / Software Reset */
+
+#define PLX9056_LINTI	(1 << 11)
+#define PLX9056_PCI_INT_EN (1 << 8)
+#define PLX9056_PCI_RCR	(1 << 29)	/* Read Configuration Registers */
+
 /*
  * The board configuration is probably following:
  * RX1 is connected to ground.
@@ -100,6 +114,13 @@ struct plx_pci_card {
 #define ADLINK_PCI_VENDOR_ID		0x144A
 #define ADLINK_PCI_DEVICE_ID		0x7841
 
+#define ESD_PCI_SUB_SYS_ID_PCI200	0x0004
+#define ESD_PCI_SUB_SYS_ID_PCI266	0x0009
+#define ESD_PCI_SUB_SYS_ID_PMC266	0x000e
+#define ESD_PCI_SUB_SYS_ID_CPCI200	0x010b
+#define ESD_PCI_SUB_SYS_ID_PCIE2000	0x0200
+#define ESD_PCI_SUB_SYS_ID_PCI104200	0x0501
+
 #define MARATHON_PCI_DEVICE_ID		0x2715
 
 #define TEWS_PCI_VENDOR_ID		0x1498
@@ -107,6 +128,7 @@ struct plx_pci_card {
 
 static void plx_pci_reset_common(struct pci_dev *pdev);
 static void plx_pci_reset_marathon(struct pci_dev *pdev);
+static void plx9056_pci_reset_common(struct pci_dev *pdev);
 
 struct plx_pci_channel_map {
 	u32 bar;
@@ -147,6 +169,30 @@ static struct plx_pci_card_info plx_pci_card_info_adlink_se __devinitdata = {
 	/* based on PLX9052 */
 };
 
+static struct plx_pci_card_info plx_pci_card_info_esd200 __devinitdata = {
+	"esd CAN-PCI/CPCI/PCI104/200", 2,
+	PLX_PCI_CAN_CLOCK, PLX_PCI_OCR, PLX_PCI_CDR,
+	{0, 0x00, 0x00}, { {2, 0x00, 0x80}, {2, 0x100, 0x80} },
+	&plx_pci_reset_common
+	/* based on PLX9030/9050 */
+};
+
+static struct plx_pci_card_info plx_pci_card_info_esd266 __devinitdata = {
+	"esd CAN-PCI/PMC/266", 2,
+	PLX_PCI_CAN_CLOCK, PLX_PCI_OCR, PLX_PCI_CDR,
+	{0, 0x00, 0x00}, { {2, 0x00, 0x80}, {2, 0x100, 0x80} },
+	&plx9056_pci_reset_common
+	/* based on PLX9056 */
+};
+
+static struct plx_pci_card_info plx_pci_card_info_esd2000 __devinitdata = {
+	"esd CAN-PCIe/2000", 2,
+	PLX_PCI_CAN_CLOCK, PLX_PCI_OCR, PLX_PCI_CDR,
+	{0, 0x00, 0x00}, { {2, 0x00, 0x80}, {2, 0x100, 0x80} },
+	&plx9056_pci_reset_common
+	/* based on PEX8311 */
+};
+
 static struct plx_pci_card_info plx_pci_card_info_marathon __devinitdata = {
 	"Marathon CAN-bus-PCI", 2,
 	PLX_PCI_CAN_CLOCK, PLX_PCI_OCR, PLX_PCI_CDR,
@@ -179,6 +225,48 @@ static DEFINE_PCI_DEVICE_TABLE(plx_pci_tbl) = {
 		(kernel_ulong_t)&plx_pci_card_info_adlink_se
 	},
 	{
+		/* esd CAN-PCI/200 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9050,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PCI200,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd200
+	},
+	{
+		/* esd CAN-CPCI/200 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9030,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_CPCI200,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd200
+	},
+	{
+		/* esd CAN-PCI104/200 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9030,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PCI104200,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd200
+	},
+	{
+		/* esd CAN-PCI/266 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9056,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PCI266,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd266
+	},
+	{
+		/* esd CAN-PMC/266 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9056,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PMC266,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd266
+	},
+	{
+		/* esd CAN-PCIE/2000 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9056,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PCIE2000,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd2000
+	},
+	{
 		/* Marathon CAN-bus-PCI card */
 		PCI_VENDOR_ID_PLX, MARATHON_PCI_DEVICE_ID,
 		PCI_ANY_ID, PCI_ANY_ID,
@@ -241,7 +329,7 @@ static inline int plx_pci_check_sja1000(const struct sja1000_priv *priv)
 }
 
 /*
- * PLX90xx software reset
+ * PLX9030/50/52 software reset
  * Also LRESET# asserts and brings to reset device on the Local Bus (if wired).
  * For most cards it's enough for reset the SJA1000 chips.
  */
@@ -258,6 +346,38 @@ static void plx_pci_reset_common(struct pci_dev *pdev)
 	iowrite32(cntrl, card->conf_addr + PLX_CNTRL);
 };
 
+/*
+ * PLX9056 software reset
+ * Assert LRESET# and reset device(s) on the Local Bus (if wired).
+ */
+static void plx9056_pci_reset_common(struct pci_dev *pdev)
+{
+	struct plx_pci_card *card = pci_get_drvdata(pdev);
+	u32 cntrl;
+
+	/* issue a local bus reset */
+	cntrl = ioread32(card->conf_addr + PLX9056_CNTRL);
+	cntrl |= PLX_PCI_RESET;
+	iowrite32(cntrl, card->conf_addr + PLX9056_CNTRL);
+	udelay(100);
+	cntrl ^= PLX_PCI_RESET;
+	iowrite32(cntrl, card->conf_addr + PLX9056_CNTRL);
+
+	/* reload local configuration from EEPROM */
+	cntrl |= PLX9056_PCI_RCR;
+	iowrite32(cntrl, card->conf_addr + PLX9056_CNTRL);
+
+	/*
+	 * There is no safe way to poll for the end
+	 * of reconfiguration process. Waiting for 10ms
+	 * is safe.
+	 */
+	mdelay(10);
+
+	cntrl ^= PLX9056_PCI_RCR;
+	iowrite32(cntrl, card->conf_addr + PLX9056_CNTRL);
+};
+
 /* Special reset function for Marathon card */
 static void plx_pci_reset_marathon(struct pci_dev *pdev)
 {
@@ -301,13 +421,16 @@ static void plx_pci_del_card(struct pci_dev *pdev)
 		free_sja1000dev(dev);
 	}
 
-	plx_pci_reset_common(pdev);
+	card->reset_func(pdev);
 
 	/*
-	 * Disable interrupts from PCI-card (PLX90xx) and disable Local_1,
-	 * Local_2 interrupts
+	 * Disable interrupts from PCI-card and disable local
+	 * interrupts
 	 */
-	iowrite32(0x0, card->conf_addr + PLX_INTCSR);
+	if (pdev->device != PCI_DEVICE_ID_PLX_9056)
+		iowrite32(0x0, card->conf_addr + PLX_INTCSR);
+	else
+		iowrite32(0x0, card->conf_addr + PLX9056_INTCSR);
 
 	if (card->conf_addr)
 		pci_iounmap(pdev, card->conf_addr);
@@ -366,6 +489,7 @@ static int __devinit plx_pci_add_card(struct pci_dev *pdev,
 	card->conf_addr = addr + ci->conf_map.offset;
 
 	ci->reset_func(pdev);
+	card->reset_func = ci->reset_func;
 
 	/* Detect available channels */
 	for (i = 0; i < ci->channel_count; i++) {
@@ -437,10 +561,17 @@ static int __devinit plx_pci_add_card(struct pci_dev *pdev,
 	 * Enable interrupts from PCI-card (PLX90xx) and enable Local_1,
 	 * Local_2 interrupts from the SJA1000 chips
 	 */
-	val = ioread32(card->conf_addr + PLX_INTCSR);
-	val |= PLX_LINT1_EN | PLX_LINT2_EN | PLX_PCI_INT_EN;
-	iowrite32(val, card->conf_addr + PLX_INTCSR);
-
+	if (pdev->device != PCI_DEVICE_ID_PLX_9056) {
+		val = ioread32(card->conf_addr + PLX_INTCSR);
+		if (pdev->subsystem_vendor == PCI_VENDOR_ID_ESDGMBH)
+			val |= PLX_LINT1_EN | PLX_PCI_INT_EN;
+		else
+			val |= PLX_LINT1_EN | PLX_LINT2_EN | PLX_PCI_INT_EN;
+		iowrite32(val, card->conf_addr + PLX_INTCSR);
+	} else {
+		iowrite32(PLX9056_LINTI | PLX9056_PCI_INT_EN,
+			  card->conf_addr + PLX9056_INTCSR);
+	}
 	return 0;
 
 failure_cleanup:
-- 
1.6.1


^ permalink raw reply related

* Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
From: Michael S. Tsirkin @ 2010-04-07 11:17 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mingo, jdike
In-Reply-To: <1270630839-19876-1-git-send-email-xiaohui.xin@intel.com>

On Wed, Apr 07, 2010 at 05:00:39PM +0800, xiaohui.xin@intel.com wrote:
> From: Xin Xiaohui <xiaohui.xin@intel.com>
> 
> ---
> 
> Michael,
> Thanks a lot for the explanation. I have drafted a patch for the qemu write
> after I looked into tun driver. Does it do in right way?
> 
> Thanks
> Xiaohui
> 
>  drivers/vhost/mpassthru.c |   45 +++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 45 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
> index e9449ac..1cde097 100644
> --- a/drivers/vhost/mpassthru.c
> +++ b/drivers/vhost/mpassthru.c
> @@ -1065,6 +1065,49 @@ static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
>  	return mask;
>  }
>  
> +static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
> +				unsigned long count, loff_t pos)
> +{
> +	struct file *file = iocb->ki_filp;
> +	struct mp_struct *mp = mp_get(file->private_data);
> +	struct sock *sk = mp->socket.sk;
> +	struct sk_buff *skb;
> +	int len, err;
> +	ssize_t result;
> +
> +	if (!mp)
> +		return -EBADFD;
> +

Can this happen? When?

> +	/* currently, async is not supported */
> +	if (!is_sync_kiocb(iocb))
> +		return -EFAULT;

Really necessary? I think do_sync_write handles all this.

> +
> +	len = iov_length(iov, count);
> +	skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
> +				  file->f_flags & O_NONBLOCK, &err);
> +
> +	if (!skb)
> +		return -EFAULT;

Surely not EFAULT. -EAGAIN?

> +
> +	skb_reserve(skb, NET_IP_ALIGN);
> +	skb_put(skb, len);
> +
> +	if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
> +		kfree_skb(skb);
> +		return -EFAULT;
> +	}
> +	skb_set_network_header(skb, ETH_HLEN);

Is this really right or necessary? Also,
probably need to check that length is at least ETH_ALEN before
doing this.

> +	skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);

eth_type_trans?

> +	skb->dev = mp->dev;
> +
> +	dev_queue_xmit(skb);
> +	mp->dev->stats.tx_packets++;
> +	mp->dev->stats.tx_bytes += len;

Doesn't the hard start xmit function for the device
increment the counters?

> +
> +	mp_put(mp);
> +	return result;
> +}
> +
>  static int mp_chr_close(struct inode *inode, struct file *file)
>  {
>  	struct mp_file *mfile = file->private_data;
> @@ -1084,6 +1127,8 @@ static int mp_chr_close(struct inode *inode, struct file *file)
>  static const struct file_operations mp_fops = {
>  	.owner  = THIS_MODULE,
>  	.llseek = no_llseek,
> +	.write  = do_sync_write,
> +	.aio_write = mp_chr_aio_write,
>  	.poll   = mp_chr_poll,
>  	.unlocked_ioctl = mp_chr_ioctl,
>  	.open   = mp_chr_open,
> -- 
> 1.5.4.4

^ permalink raw reply

* [PATCH] r6040: fix r6040_multicast_list
From: Florian Fainelli @ 2010-04-07 11:18 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, darkadept

As reported in <https://bugzilla.kernel.org/show_bug.cgi?id=15355>,
r6040_multicast_list currently crashes. This is due to a wrong maximum number
of multicast entries. This patch fixes the following issues with multicast:

- the maximum number of entries is off by one (4 instead of 3)

- the writing of the hash table index is not necessary and leads to invalid
values being written into the MCR1 register, so the MAC is simply put in an
incoherent state

- when we exceed the maximum number of multicast addresses, writing the
broadcast address should be done in registers MID_1{L,M,H} instead of
MID_0{L,M,H}, otherwise we would lose the adapter's MAC address

Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index f5a0e96..83ebc75 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -135,7 +135,7 @@
 #define RX_DESC_SIZE	(RX_DCNT * sizeof(struct r6040_descriptor))
 #define TX_DESC_SIZE	(TX_DCNT * sizeof(struct r6040_descriptor))
 #define MBCR_DEFAULT	0x012A	/* MAC Bus Control Register */
-#define MCAST_MAX	4	/* Max number multicast addresses to filter */
+#define MCAST_MAX	3	/* Max number multicast addresses to filter */
 
 /* Descriptor status */
 #define DSC_OWNER_MAC	0x8000	/* MAC is the owner of this descriptor */
@@ -983,9 +983,6 @@ static void r6040_multicast_list(struct net_device *dev)
 			crc >>= 26;
 			hash_table[crc >> 4] |= 1 << (15 - (crc & 0xf));
 		}
-		/* Write the index of the hash table */
-		for (i = 0; i < 4; i++)
-			iowrite16(hash_table[i] << 14, ioaddr + MCR1);
 		/* Fill the MAC hash tables with their values */
 		iowrite16(hash_table[0], ioaddr + MAR0);
 		iowrite16(hash_table[1], ioaddr + MAR1);
@@ -1001,9 +998,9 @@ static void r6040_multicast_list(struct net_device *dev)
 			iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
 			iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
 		} else {
-			iowrite16(0xffff, ioaddr + MID_0L + 8 * i);
-			iowrite16(0xffff, ioaddr + MID_0M + 8 * i);
-			iowrite16(0xffff, ioaddr + MID_0H + 8 * i);
+			iowrite16(0xffff, ioaddr + MID_1L + 8 * i);
+			iowrite16(0xffff, ioaddr + MID_1M + 8 * i);
+			iowrite16(0xffff, ioaddr + MID_1H + 8 * i);
 		}
 		i++;
 	}

^ permalink raw reply related

* Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
From: Michael S. Tsirkin @ 2010-04-07 11:35 UTC (permalink / raw)
  To: David L Stevens; +Cc: kvm, netdev, rusty, virtualization
In-Reply-To: <20100407105910.GD9550@redhat.com>

Some corrections:

On Wed, Apr 07, 2010 at 01:59:10PM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 06, 2010 at 01:32:53PM -0700, David L Stevens wrote:
> > 
> > This patch adds support for the Mergeable Receive Buffers feature to
> > vhost_net.
> > 
> > 						+-DLS
> > 
> > Changes from previous revision:
> > 1) renamed:
> > 	vhost_discard_vq_desc -> vhost_discard_desc
> > 	vhost_get_heads -> vhost_get_desc_n
> > 	vhost_get_vq_desc -> vhost_get_desc
> > 2) added heads as argument to ghost_get_desc_n
> > 3) changed "vq->heads" from iovec to vring_used_elem, removed casts
> > 4) changed vhost_add_used to do multiple elements in a single
> > copy_to_user,
> > 	or two when we wrap the ring.
> > 5) removed rxmaxheadcount and available buffer checks in favor of
> > running until
> > 	an allocation failure, but making sure we break the loop if we get
> > 	two in a row, indicating we have at least 1 buffer, but not enough
> > 	for the current receive packet
> > 6) restore non-vnet header handling
> > 
> > Signed-Off-By: David L Stevens <dlstevens@us.ibm.com>
> 
> Thanks!
> There's some whitespace damage, are you sending with your new
> sendmail setup? It seems to have worked for qemu patches ...
> 
> > diff -ruNp net-next-p0/drivers/vhost/net.c
> > net-next-v3/drivers/vhost/net.c
> > --- net-next-p0/drivers/vhost/net.c	2010-03-22 12:04:38.000000000 -0700
> > +++ net-next-v3/drivers/vhost/net.c	2010-04-06 12:54:56.000000000 -0700
> > @@ -130,9 +130,8 @@ static void handle_tx(struct vhost_net *
> >  	hdr_size = vq->hdr_size;
> >  
> >  	for (;;) {
> > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > -					 ARRAY_SIZE(vq->iov),
> > -					 &out, &in,
> > +		head = vhost_get_desc(&net->dev, vq, vq->iov,
> > +					 ARRAY_SIZE(vq->iov), &out, &in,
> >  					 NULL, NULL);
> >  		/* Nothing new?  Wait for eventfd to tell us they refilled. */
> >  		if (head == vq->num) {
> > @@ -167,8 +166,15 @@ static void handle_tx(struct vhost_net *
> >  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> >  		err = sock->ops->sendmsg(NULL, sock, &msg, len);
> >  		if (unlikely(err < 0)) {
> > -			vhost_discard_vq_desc(vq);
> > -			tx_poll_start(net, sock);
> > +			if (err == -EAGAIN) {
> > +				vhost_discard_desc(vq, 1);
> > +				tx_poll_start(net, sock);
> > +			} else {
> > +				vq_err(vq, "sendmsg: errno %d\n", -err);
> > +				/* drop packet; do not discard/resend */
> > +				vhost_add_used_and_signal(&net->dev, vq, head,
> > +							  0);
> 
> vhost does not currently has a consistent error handling strategy:
> if we drop packets, need to think which other errors should cause
> packet drops.  I prefer to just call vq_err for now,
> and have us look at handling segfaults etc in a consistent way
> separately.
> 
> > +			}
> >  			break;
> >  		}
> >  		if (err != len)
> > @@ -186,12 +192,25 @@ static void handle_tx(struct vhost_net *
> >  	unuse_mm(net->dev.mm);
> >  }
> >  
> > +static int vhost_head_len(struct sock *sk)
> > +{
> > +	struct sk_buff *head;
> > +	int len = 0;
> > +
> > +	lock_sock(sk);
> > +	head = skb_peek(&sk->sk_receive_queue);
> > +	if (head)
> > +		len = head->len;
> > +	release_sock(sk);
> > +	return len;
> > +}
> > +
> 
> I wonder whether it makes sense to check
> skb_queue_empty(&sk->sk_receive_queue)
> outside the lock, to reduce the cost of this call
> on an empty queue (we know that it happens at least once
> each time we exit the loop on rx)?
> 
> >  /* Expects to be always run from workqueue - which acts as
> >   * read-size critical section for our kind of RCU. */
> >  static void handle_rx(struct vhost_net *net)
> >  {
> >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > -	unsigned head, out, in, log, s;
> > +	unsigned in, log, s;
> >  	struct vhost_log *vq_log;
> >  	struct msghdr msg = {
> >  		.msg_name = NULL,
> > @@ -202,13 +221,14 @@ static void handle_rx(struct vhost_net *
> >  		.msg_flags = MSG_DONTWAIT,
> >  	};
> >  
> > -	struct virtio_net_hdr hdr = {
> > -		.flags = 0,
> > -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
> > +	struct virtio_net_hdr_mrg_rxbuf hdr = {
> > +		.hdr.flags = 0,
> > +		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
> >  	};
> >  
> > +	int retries = 0;
> >  	size_t len, total_len = 0;
> > -	int err;
> > +	int err, headcount, datalen;
> >  	size_t hdr_size;
> >  	struct socket *sock = rcu_dereference(vq->private_data);
> >  	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > @@ -222,31 +242,25 @@ static void handle_rx(struct vhost_net *
> >  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> >  		vq->log : NULL;
> >  
> > -	for (;;) {
> > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > -					 ARRAY_SIZE(vq->iov),
> > -					 &out, &in,
> > -					 vq_log, &log);
> > +	while ((datalen = vhost_head_len(sock->sk))) {
> > +		headcount = vhost_get_desc_n(vq, vq->heads, datalen, &in,
> > +					     vq_log, &log);
> 
> This looks like a bug, I think we need to pass
> datalen + header size to vhost_get_desc_n.
> Not sure how we know the header size that backend will use though.
> Maybe just look at our features.
> 
> >  		/* OK, now we need to know about added descriptors. */
> > -		if (head == vq->num) {
> > -			if (unlikely(vhost_enable_notify(vq))) {
> > +		if (!headcount) {
> > +			if (retries == 0 && unlikely(vhost_enable_notify(vq))) {
> >  				/* They have slipped one in as we were
> >  				 * doing that: check again. */
> >  				vhost_disable_notify(vq);
> > +				retries++;
> >  				continue;
> >  			}
> 
> Hmm. The reason we have the code at all, as the comment says, is because
> guest could have added more buffers between the time we read last index
> and the time we enabled notification. So if we just break like this
> the race still exists. We could remember the
> last head value we observed, and have vhost_enable_notify check
> against this value?
> 
> Need to think about it.
> 
> Another concern here is that on retries vhost_get_desc_n
> is doing extra work, rescanning the same descriptor
> again and again. Not sure how common this is, might be
> worthwhile to add a TODO to consider this at least.
> 
> > +			retries = 0;
> >  			/* Nothing new?  Wait for eventfd to tell us
> >  			 * they refilled. */
> >  			break;
> >  		}
> >  		/* We don't need to be notified again. */
> > -		if (out) {
> > -			vq_err(vq, "Unexpected descriptor format for RX: "
> > -			       "out %d, int %d\n",
> > -			       out, in);
> > -			break;
> > -		}
> > -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
> > +		/* Skip header. TODO: support TSO. */
> >  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> >  		msg.msg_iovlen = in;
> >  		len = iov_length(vq->iov, in);
> > @@ -261,14 +275,33 @@ static void handle_rx(struct vhost_net *
> >  					 len, MSG_DONTWAIT | MSG_TRUNC);
> >  		/* TODO: Check specific error and bomb out unless EAGAIN? */
> >  		if (err < 0) {
> 
> I think we need to compare err and datalen and drop packet on mismatch as well.
> The check err > len won't be needed then.
> 
> > -			vhost_discard_vq_desc(vq);
> > +			vhost_discard_desc(vq, headcount);
> >  			break;
> >  		}
> >  		/* TODO: Should check and handle checksum. */
> > +		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
> > +			struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > +				(struct virtio_net_hdr_mrg_rxbuf *)
> > +				vq->iov[0].iov_base;
> > +			/* add num_buffers */
> > +			if (vhost_has_feature(&net->dev,
> > +					      VHOST_NET_F_VIRTIO_NET_HDR))
> > +				hdr.num_buffers = headcount;
> 
> Why is the above necessary?
> 
> > +			else if (vq->iov[0].iov_len < sizeof(*vhdr)) {
> 
> I think length check is already done when we copy the header. No?
> 
> > +				vq_err(vq, "tiny buffers < %d unsupported",
> > +					vq->iov[0].iov_len);
> > +				vhost_discard_desc(vq, headcount);
> > +				break;
> 
> Problem here is that recvmsg might modify iov.
> This is why I think we need to use vq->hdr here.
> 
> > +			} else if (put_user(headcount, &vhdr->num_buffers)) {
> 
> The above put_user writes out a 32 bit value, right?
> This seems wrong.

Sorry, put_user looks at the pointer type, so that's ok.
I still suggest memcpy_toiovecend to remove layout assumptions.

> How about using
> 	memcpy_toiovecend(vq->hdr, &headcount,
> 			  offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers),
> 			  sizeof headcount);
> 
> this way we also do not make any assumptions about layout.
> 
> > +				vq_err(vq, "Failed num_buffers write");
> > +				vhost_discard_desc(vq, headcount);
> > +				break;
> > +			}
> > +		}
> >  		if (err > len) {
> >  			pr_err("Discarded truncated rx packet: "
> >  			       " len %d > %zd\n", err, len);
> > -			vhost_discard_vq_desc(vq);
> > +			vhost_discard_desc(vq, headcount);
> >  			continue;
> >  		}
> >  		len = err;
> > @@ -279,7 +312,7 @@ static void handle_rx(struct vhost_net *
> >  			break;
> >  		}
> >  		len += hdr_size;
> > -		vhost_add_used_and_signal(&net->dev, vq, head, len);
> > +		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, headcount);
> >  		if (unlikely(vq_log))
> >  			vhost_log_write(vq, vq_log, log, len);
> >  		total_len += len;
> > @@ -560,9 +593,14 @@ done:
> >  
> >  static int vhost_net_set_features(struct vhost_net *n, u64 features)
> >  {
> > -	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> > -		sizeof(struct virtio_net_hdr) : 0;
> > +	size_t hdr_size = 0;
> >  	int i;
> > +
> > +	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> > +		hdr_size = sizeof(struct virtio_net_hdr);
> > +		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> > +			hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> 
> My personal style for this would be:
>   	if (!(features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)))
> 		hdr_size = 0
> 	else if (!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
> 		hdr_size = sizeof(virtio_net_hdr);
> 	else
> 		hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> 
> which results in more symmetry and less nesting.
> 
> >  	mutex_lock(&n->dev.mutex);
> >  	if ((features & (1 << VHOST_F_LOG_ALL)) &&
> >  	    !vhost_log_access_ok(&n->dev)) {
> > diff -ruNp net-next-p0/drivers/vhost/vhost.c
> > net-next-v3/drivers/vhost/vhost.c
> > --- net-next-p0/drivers/vhost/vhost.c	2010-03-22 12:04:38.000000000
> > -0700
> > +++ net-next-v3/drivers/vhost/vhost.c	2010-04-06 12:57:51.000000000
> > -0700
> > @@ -856,6 +856,47 @@ static unsigned get_indirect(struct vhos
> >  	return 0;
> >  }
> >  
> > +/* This is a multi-buffer version of vhost_get_vq_desc
> > + * @vq		- the relevant virtqueue
> > + * datalen	- data length we'll be reading
> > + * @iovcount	- returned count of io vectors we fill
> > + * @log		- vhost log
> > + * @log_num	- log offset
> > + *	returns number of buffer heads allocated, 0 on error
> 
> This is unusual. Let's return a negative error code on error.
> 
> > + */
> > +int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem
> > *heads,
> > +		     int datalen, int *iovcount, struct vhost_log *log,
> > +		     unsigned int *log_num)
> > +{
> > +	int out, in;
> > +	int seg = 0;		/* iov index */
> > +	int hc = 0;		/* head count */
> > +
> > +	while (datalen > 0) {
> > +		if (hc >= VHOST_NET_MAX_SG)
> > +			goto err;
> > +		heads[hc].id = vhost_get_desc(vq->dev, vq, vq->iov+seg,
> > +					      ARRAY_SIZE(vq->iov)-seg, &out,
> > +					      &in, log, log_num);
> > +		if (heads[hc].id == vq->num)
> > +			goto err;
> > +		if (out || in <= 0) {
> > +			vq_err(vq, "unexpected descriptor format for RX: "
> > +				"out %d, in %d\n", out, in);
> > +			goto err;
> > +		}
> > +		heads[hc].len = iov_length(vq->iov+seg, in);
> > +		datalen -= heads[hc].len;
> 
> This signed/unsigned mix makes me nervuous.
> Let's make datalen unsigned, add unsigned total_len, and
> while (datalen < total_len).
> 
> > +		hc++;
> > +		seg += in;
> > +	}
> > +	*iovcount = seg;
> > +	return hc;
> > +err:
> > +	vhost_discard_desc(vq, hc);
> > +	return 0;
> > +}
> > +
> >  /* This looks in the virtqueue and for the first available buffer, and
> > converts
> >   * it to an iovec for convenient access.  Since descriptors consist of
> > some
> >   * number of output then some number of input descriptors, it's
> > actually two
> > @@ -863,7 +904,7 @@ static unsigned get_indirect(struct vhos
> >   *
> >   * This function returns the descriptor number found, or vq->num (which
> >   * is never a valid descriptor number) if none was found. */
> > -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct
> > vhost_virtqueue *vq,
> > +unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue
> > *vq,
> >  			   struct iovec iov[], unsigned int iov_size,
> >  			   unsigned int *out_num, unsigned int *in_num,
> >  			   struct vhost_log *log, unsigned int *log_num)
> > @@ -981,31 +1022,42 @@ unsigned vhost_get_vq_desc(struct vhost_
> >  }
> >  
> >  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling.
> > */
> > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> > +void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
> >  {
> > -	vq->last_avail_idx--;
> > +	vq->last_avail_idx -= n;
> >  }
> >  
> >  /* After we've used one of their buffers, we tell them about it.  We'll
> > then
> >   * want to notify the guest, using eventfd. */
> > -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int
> > len)
> > +int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem
> > *heads,
> > +		   int count)
> 
> I think we are better off with vhost_add_used and vhost_add_used_n:
> the version with _n has a lot of extra complexity, and tx always
> adds them 1 by one.
> 
> >  {
> >  	struct vring_used_elem *used;
> > +	int start, n;
> > +
> > +	if (count <= 0)
> > +		return -EINVAL;
> >  
> > -	/* The virtqueue contains a ring of used buffers.  Get a pointer to
> > the
> > -	 * next entry in that used ring. */
> > -	used = &vq->used->ring[vq->last_used_idx % vq->num];
> > -	if (put_user(head, &used->id)) {
> > -		vq_err(vq, "Failed to write used id");
> > +	start = vq->last_used_idx % vq->num;
> > +	if (vq->num - start < count)
> > +		n = vq->num - start;
> > +	else
> > +		n = count;
> 
> use min?
> 
> > +	used = vq->used->ring + start;
> > +	if (copy_to_user(used, heads, sizeof(heads[0])*n)) {
> > +		vq_err(vq, "Failed to write used");
> >  		return -EFAULT;
> >  	}
> > -	if (put_user(len, &used->len)) {
> > -		vq_err(vq, "Failed to write used len");
> > -		return -EFAULT;
> > +	if (n < count) {	/* wrapped the ring */
> > +		used = vq->used->ring;
> > +		if (copy_to_user(used, heads+n, sizeof(heads[0])*(count-n))) {
> > +			vq_err(vq, "Failed to write used");
> > +			return -EFAULT;
> > +		}
> >  	}
> >  	/* Make sure buffer is written before we update index. */
> >  	smp_wmb();
> > -	if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
> > +	if (put_user(vq->last_used_idx+count, &vq->used->idx)) {
> 
> I am a bit confused ... will this write a 32 or 16 bit value?
> count is 32 bit ... Maybe we are better off with
>   u16 idx = vq->last_used_idx + count
>   put_user(idx, &vq->used->idx)
>   vq->last_used_idx = idx

The above's not necessary, put_user gets type from the pointer.

> >  		vq_err(vq, "Failed to increment used idx");
> >  		return -EFAULT;
> >  	}
> > @@ -1023,7 +1075,7 @@ int vhost_add_used(struct vhost_virtqueu
> >  		if (vq->log_ctx)
> >  			eventfd_signal(vq->log_ctx, 1);
> >  	}
> > -	vq->last_used_idx++;
> > +	vq->last_used_idx += count;
> >  	return 0;
> >  }
> >  
> > @@ -1049,10 +1101,23 @@ void vhost_signal(struct vhost_dev *dev,
> >  
> >  /* And here's the combo meal deal.  Supersize me! */
> >  void vhost_add_used_and_signal(struct vhost_dev *dev,
> > -			       struct vhost_virtqueue *vq,
> > -			       unsigned int head, int len)
> > +			       struct vhost_virtqueue *vq, unsigned int id,
> > +			       int len)
> > +{
> > +	struct vring_used_elem head;
> > +
> > +	head.id = id;
> > +	head.len = len;
> > +	vhost_add_used(vq, &head, 1);
> > +	vhost_signal(dev, vq);
> > +}
> > +
> > +/* multi-buffer version of vhost_add_used_and_signal */
> > +void vhost_add_used_and_signal_n(struct vhost_dev *dev,
> > +				 struct vhost_virtqueue *vq,
> > +				 struct vring_used_elem *heads, int count)
> >  {
> > -	vhost_add_used(vq, head, len);
> > +	vhost_add_used(vq, heads, count);
> >  	vhost_signal(dev, vq);
> >  }
> >  
> > diff -ruNp net-next-p0/drivers/vhost/vhost.h
> > net-next-v3/drivers/vhost/vhost.h
> > --- net-next-p0/drivers/vhost/vhost.h	2010-03-22 12:04:38.000000000
> > -0700
> > +++ net-next-v3/drivers/vhost/vhost.h	2010-04-05 20:33:57.000000000
> > -0700
> > @@ -85,6 +85,7 @@ struct vhost_virtqueue {
> >  	struct iovec iov[VHOST_NET_MAX_SG];
> >  	struct iovec hdr[VHOST_NET_MAX_SG];
> >  	size_t hdr_size;
> > +	struct vring_used_elem heads[VHOST_NET_MAX_SG];
> >  	/* We use a kind of RCU to access private pointer.
> >  	 * All readers access it from workqueue, which makes it possible to
> >  	 * flush the workqueue instead of synchronize_rcu. Therefore readers
> > do
> > @@ -120,16 +121,22 @@ long vhost_dev_ioctl(struct vhost_dev *,
> >  int vhost_vq_access_ok(struct vhost_virtqueue *vq);
> >  int vhost_log_access_ok(struct vhost_dev *);
> >  
> > -unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue
> > *,
> > +int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem
> > *heads,
> > +		     int datalen, int *iovcount, struct vhost_log *log,
> > +		     unsigned int *log_num);
> > +unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
> >  			   struct iovec iov[], unsigned int iov_count,
> >  			   unsigned int *out_num, unsigned int *in_num,
> >  			   struct vhost_log *log, unsigned int *log_num);
> > -void vhost_discard_vq_desc(struct vhost_virtqueue *);
> > +void vhost_discard_desc(struct vhost_virtqueue *, int);
> >  
> > -int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int
> > len);
> > -void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> > +int vhost_add_used(struct vhost_virtqueue *, struct vring_used_elem
> > *heads,
> > +		    int count);
> >  void vhost_add_used_and_signal(struct vhost_dev *, struct
> > vhost_virtqueue *,
> > -			       unsigned int head, int len);
> > +			       unsigned int id, int len);
> > +void vhost_add_used_and_signal_n(struct vhost_dev *, struct
> > vhost_virtqueue *,
> > +			       struct vring_used_elem *heads, int count);
> > +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> >  void vhost_disable_notify(struct vhost_virtqueue *);
> >  bool vhost_enable_notify(struct vhost_virtqueue *);
> >  
> > @@ -149,7 +156,8 @@ enum {
> >  	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
> >  			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
> >  			 (1 << VHOST_F_LOG_ALL) |
> > -			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
> > +			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
> > +			 (1 << VIRTIO_NET_F_MRG_RXBUF),
> >  };
> >  
> >  static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
> > 

^ permalink raw reply

* [PATCH] corrected documentation for hardware time stamping
From: Patrick Loschmidt @ 2010-04-07 11:15 UTC (permalink / raw)
  To: netdev; +Cc: Patrick Ohly

From: Patrick Loschmidt <Patrick.Loschmidt@oeaw.ac.at>

The current documentation for hardware time stamping does not correctly specify the available kernel
functions since the implementation was changed later on.

Signed-off-by: Patrick Loschmidt <Patrick.Loschmidt@oeaw.ac.at>
---
--- Documentation/networking/timestamping.txt.orig	2010-04-07 12:52:47.000000000 +0200
+++ Documentation/networking/timestamping.txt	2010-04-07 11:43:57.000000000 +0200
@@ -41,11 +41,12 @@ SOF_TIMESTAMPING_SOFTWARE:     return sy
 SOF_TIMESTAMPING_TX/RX determine how time stamps are generated.
 SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the
 following control message:
-    struct scm_timestamping {
-           struct timespec systime;
-           struct timespec hwtimetrans;
-           struct timespec hwtimeraw;
-    };
+
+struct scm_timestamping {
+	struct timespec systime;
+	struct timespec hwtimetrans;
+	struct timespec hwtimeraw;
+};

 recvmsg() can be used to get this control message for regular incoming
 packets. For send time stamps the outgoing packet is looped back to
@@ -87,12 +88,13 @@ by the network device and will be empty
 SIOCSHWTSTAMP:

 Hardware time stamping must also be initialized for each device driver
-that is expected to do hardware time stamping. The parameter is:
+that is expected to do hardware time stamping. The parameter is defined in
+/include/linux/net_tstamp.h as:

 struct hwtstamp_config {
-    int flags;           /* no flags defined right now, must be zero */
-    int tx_type;         /* HWTSTAMP_TX_* */
-    int rx_filter;       /* HWTSTAMP_FILTER_* */
+	int flags;	/* no flags defined right now, must be zero */
+	int tx_type;	/* HWTSTAMP_TX_* */
+	int rx_filter;	/* HWTSTAMP_FILTER_* */
 };

 Desired behavior is passed into the kernel and to a specific device by
@@ -139,42 +141,56 @@ enum {
 	/* time stamp any incoming packet */
 	HWTSTAMP_FILTER_ALL,

-        /* return value: time stamp all packets requested plus some others */
-        HWTSTAMP_FILTER_SOME,
+	/* return value: time stamp all packets requested plus some others */
+	HWTSTAMP_FILTER_SOME,

 	/* PTP v1, UDP, any kind of event packet */
 	HWTSTAMP_FILTER_PTP_V1_L4_EVENT,
-
-        ...
+	
+	/* for the complete list of values, please check
+	 * the include file /include/linux/net_tstamp.h
+	 */
 };


 DEVICE IMPLEMENTATION

 A driver which supports hardware time stamping must support the
-SIOCSHWTSTAMP ioctl. Time stamps for received packets must be stored
-in the skb with skb_hwtstamp_set().
+SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with
+the actual values as described in the section on SIOCSHWTSTAMP.
+
+Time stamps for received packets must be stored in the skb. To get a pointer
+to the shared time stamp structure of the skb call skb_hwtstamps(). Then
+set the time stamps in the structure:
+
+struct skb_shared_hwtstamps {
+	/* hardware time stamp transformed into duration
+	 * since arbitrary point in time
+	 */
+	ktime_t	hwtstamp;
+	ktime_t	syststamp; /* hwtstamp transformed to system time base */
+};

 Time stamps for outgoing packets are to be generated as follows:
-- In hard_start_xmit(), check if skb_hwtstamp_check_tx_hardware()
-  returns non-zero. If yes, then the driver is expected
-  to do hardware time stamping.
+- In hard_start_xmit(), check if skb_tx(skb)->hardware is set non-zero.
+  If yes, then the driver is expected to do hardware time stamping.
 - If this is possible for the skb and requested, then declare
-  that the driver is doing the time stamping by calling
-  skb_hwtstamp_tx_in_progress(). A driver not supporting
-  hardware time stamping doesn't do that. A driver must never
-  touch sk_buff::tstamp! It is used to store how time stamping
-  for an outgoing packets is to be done.
+  that the driver is doing the time stamping by setting the field
+  skb_tx(skb)->in_progress non-zero. You might want to keep a pointer
+  to the associated skb for the next step and not free the skb. A driver
+  not supporting hardware time stamping doesn't do that. A driver must
+  never touch sk_buff::tstamp! It is used to store software generated
+  time stamps by the network subsystem.
 - As soon as the driver has sent the packet and/or obtained a
   hardware time stamp for it, it passes the time stamp back by
   calling skb_hwtstamp_tx() with the original skb, the raw
-  hardware time stamp and a handle to the device (necessary
-  to convert the hardware time stamp to system time). If obtaining
-  the hardware time stamp somehow fails, then the driver should
-  not fall back to software time stamping. The rationale is that
-  this would occur at a later time in the processing pipeline
-  than other software time stamping and therefore could lead
-  to unexpected deltas between time stamps.
-- If the driver did not call skb_hwtstamp_tx_in_progress(), then
+  hardware time stamp. skb_hwtstamp_tx() clones the original skb and
+  adds the timestamps, therefore the original skb has to be freed now.
+  If obtaining the hardware time stamp somehow fails, then the driver
+  should not fall back to software time stamping. The rationale is that
+  this would occur at a later time in the processing pipeline than other
+  software time stamping and therefore could lead to unexpected deltas
+  between time stamps.
+- If the driver did not set skb_tx(skb)->in_progress, then
   dev_hard_start_xmit() checks whether software time stamping
   is wanted as fallback and potentially generates the time stamp.

^ permalink raw reply

* Re: [Bugme-new] [Bug 15682] New: XFRM is not updating RTAX_ADVMSS metric
From: jamal @ 2010-04-07 12:19 UTC (permalink / raw)
  To: Eduardo Panisset
  Cc: Andrew Morton, Herbert Xu, hideaki, netdev, bugzilla-daemon,
	bugme-daemon, David S. Miller
In-Reply-To: <y2jb7b22e81004061202uac6ef8eeodc70d6deb65f7699@mail.gmail.com>

Hi Eduardo,

As a first step: I don't know what the real cause is, but
what you showed as a solution didn't look right. I am also not familiar
with DSMIPv6. I know a lot of other xfrm subsystems work fine with
the current code, so that's why I pointed to Herbert, who is more
knowledgeable. I am adding Yoshfuji to the list.
If you tell me what kernel options to turn on and a very
basic test setup to reproduce it (I dont have much hardware, so make it
very minimal maybe requiring one or two PCs max) then I could make time
and try to reproduce it.

cheers,
jamal

On Tue, 2010-04-06 at 16:02 -0300, Eduardo Panisset wrote:
> Hi,
> 
> My intention is only to report a problem that I have faced. The
> solution proposed probably isn't (I know that) the best one to adopt,
> as I'm not a kernel specialist; it is more illustrative, to help you
> guys understand what I mean and solve it in a better way
> (hence I haven't submitted a patch).
> 
> Regards,
> Eduardo Panisset.

^ permalink raw reply

* [PATCH] Caif: Ref counting
From: Alan Cox @ 2010-04-07 13:13 UTC (permalink / raw)
  To: Sjur BRENDELAND; +Cc: netdev@vger.kernel.org
In-Reply-To: <81C3A93C17462B4BBD7E272753C105791696B5DCF0@EXDCVYMBSTM005.EQ1STM.local>

caif: tty's are kref objects so take a reference

From: Alan Cox <alan@linux.intel.com>

I don't think this can be abused in this case but do things properly.

Signed-off-by: Alan Cox <alan@linux.intel.com>
---

 drivers/net/caif/caif_serial.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)


diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index 3502f60..b271aa0 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -315,7 +315,7 @@ static int ldisc_open(struct tty_struct *tty)
 	sprintf(name, "cf%s", tty->name);
 	dev = alloc_netdev(sizeof(*ser), name, caifdev_setup);
 	ser = netdev_priv(dev);
-	ser->tty = tty;
+	ser->tty = tty_kref_get(tty);
 	ser->dev = dev;
 	debugfs_init(ser, tty);
 	tty->receive_room = N_TTY_BUF_SIZE;
@@ -348,6 +348,7 @@ static void ldisc_close(struct tty_struct *tty)
 	unregister_netdevice(ser->dev);
 	list_del(&ser->node);
 	debugfs_deinit(ser);
+	tty_kref_put(ser->tty);
 	if (!islocked)
 		rtnl_unlock();
 }

^ permalink raw reply related

* [PATCH] CAIF: write check
From: Alan Cox @ 2010-04-07 13:17 UTC (permalink / raw)
  To: Sjur BRENDELAND; +Cc: netdev@vger.kernel.org
In-Reply-To: <81C3A93C17462B4BBD7E272753C105791696B5DCF0@EXDCVYMBSTM005.EQ1STM.local>

caif: check write operations

From: Alan Cox <alan@linux.intel.com>

write is optional for a tty device. Check that we have a write op rather
than calling NULL.

Signed-off-by: Alan Cox <alan@linux.intel.com>
---

 drivers/net/caif/caif_serial.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)


diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index b271aa0..38c0186 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -312,6 +312,10 @@ static int ldisc_open(struct tty_struct *tty)
 	char name[64];
 	int result;
 
+	/* No write no play */
+	if (tty->ops->write == NULL)
+		return -EOPNOTSUPP;
+
 	sprintf(name, "cf%s", tty->name);
 	dev = alloc_netdev(sizeof(*ser), name, caifdev_setup);
 	ser = netdev_priv(dev);


^ permalink raw reply related

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Jan Engelhardt @ 2010-04-07 13:26 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, netdev
In-Reply-To: <4BBB634D.6050109@trash.net>


On Tuesday 2010-04-06 18:37, Patrick McHardy wrote:
>Jan Engelhardt wrote:
>> On Thursday 2010-04-01 15:22, Patrick McHardy wrote:
>>>> Or should we be using skb_alloc and copying the data portion over, like 
>>>> ipt_REJECT does since v2.6.24-2931-g9ba99b0?
>>> I guess pskb_copy() would be most optimal since we can modify
>>> the header, but the non-linear area could be shared
>> 
>> Trying to improve my understanding: when doing skb_pull,
>> does the skb->head that is relevant for pskb_copy move?
>
>skb_pull() only changes skb->data.

But how does it interact with, say, xt_TCPMSS, which modifies not
only the L3 header, but also the L4 header?

^ permalink raw reply

* RE: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver
From: John Linn @ 2010-04-07 13:25 UTC (permalink / raw)
  To: David Miller, eric.dumazet
  Cc: netdev, linuxppc-dev, grant.likely, jwboyer, john.williams,
	michal.simek, jtyner
In-Reply-To: <20100406.195204.165441511.davem@davemloft.net>

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Tuesday, April 06, 2010 8:52 PM
> To: eric.dumazet@gmail.com
> Cc: John Linn; netdev@vger.kernel.org; linuxppc-dev@ozlabs.org;
grant.likely@secretlab.ca;
> jwboyer@linux.vnet.ibm.com; john.williams@petalogix.com;
michal.simek@petalogix.com; jtyner@cs.ucr.edu
> Subject: Re: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver
> 
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 05 Apr 2010 23:29:53 +0200
> 
> > So, I ask, cant you use netdev_alloc_skb_ip_align() in this driver ?
> 
> Thanks to everyone for getting this patch into shape.
> 
> I applied version 4 of the patch to net-next-2.6, thanks!

Thanks David, appreciate everyone's help and patience.

This email and any attachments are intended for the sole use of the named recipient(s) and contain(s) confidential information that may be proprietary, privileged or copyrighted under applicable law. If you are not the intended recipient, do not read, copy, or forward this email message or any attachments. Delete this email message and any attachments immediately.



^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox