Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 1/9] netdev: bfin_mac: add support for IEEE 1588 PTP
From: Mike Frysinger @ 2010-05-10 15:39 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: uclinux-dist-devel, Barry Song

From: Barry Song <barry.song@analog.com>

Newer on-chip MAC peripherals support IEEE 1588 PTP in the hardware, so
extend the driver to support this functionality.

Signed-off-by: Barry Song <barry.song@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 drivers/net/Kconfig    |    7 +
 drivers/net/bfin_mac.c |  341 +++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/net/bfin_mac.h |   15 ++
 3 files changed, 362 insertions(+), 1 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 7b832c7..ac536b9 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -887,6 +887,13 @@ config BFIN_MAC_RMII
 	help
 	  Use Reduced PHY MII Interface
 
+config BFIN_MAC_USE_HWSTAMP
+	bool "Use IEEE 1588 hwstamp"
+	depends on BFIN_MAC && BF518
+	default y
+	help
+	  To support the IEEE 1588 Precision Time Protocol (PTP), select y here
+
 config SMC9194
 	tristate "SMC 9194 support"
 	depends on NET_VENDOR_SMC && (ISA || MAC && BROKEN)
diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index 587f93c..6a9519f 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -33,6 +33,7 @@
 #include <asm/dma.h>
 #include <linux/dma-mapping.h>
 
+#include <asm/div64.h>
 #include <asm/dpmc.h>
 #include <asm/blackfin.h>
 #include <asm/cacheflush.h>
@@ -551,6 +552,309 @@ static int bfin_mac_set_mac_address(struct net_device *dev, void *p)
 	return 0;
 }
 
+#ifdef CONFIG_BFIN_MAC_USE_HWSTAMP
+#define bfin_mac_hwtstamp_is_none(cfg) ((cfg) == HWTSTAMP_FILTER_NONE)
+
+static int bfin_mac_hwtstamp_ioctl(struct net_device *netdev,
+		struct ifreq *ifr, int cmd)
+{
+	struct hwtstamp_config config;
+	struct bfin_mac_local *lp = netdev_priv(netdev);
+	u16 ptpctl;
+	u32 ptpfv1, ptpfv2, ptpfv3, ptpfoff;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	pr_debug("%s config flag:0x%x, tx_type:0x%x, rx_filter:0x%x\n",
+			__func__, config.flags, config.tx_type, config.rx_filter);
+
+	/* reserved for future extensions */
+	if (config.flags)
+		return -EINVAL;
+
+	if ((config.tx_type != HWTSTAMP_TX_OFF) &&
+			(config.tx_type != HWTSTAMP_TX_ON))
+		return -ERANGE;
+
+	ptpctl = bfin_read_EMAC_PTP_CTL();
+
+	switch (config.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		/*
+		 * Dont allow any timestamping
+		 */
+		ptpfv3 = 0xFFFFFFFF;
+		bfin_write_EMAC_PTP_FV3(ptpfv3);
+		break;
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+		/*
+		 * Clear the five comparison mask bits (bits[12:8]) in EMAC_PTP_CTL)
+		 * to enable all the field matches.
+		 */
+		ptpctl &= ~0x1F00;
+		bfin_write_EMAC_PTP_CTL(ptpctl);
+		/*
+		 * Keep the default values of the EMAC_PTP_FOFF register.
+		 */
+		ptpfoff = 0x4A24170C;
+		bfin_write_EMAC_PTP_FOFF(ptpfoff);
+		/*
+		 * Keep the default values of the EMAC_PTP_FV1 and EMAC_PTP_FV2
+		 * registers.
+		 */
+		ptpfv1 = 0x11040800;
+		bfin_write_EMAC_PTP_FV1(ptpfv1);
+		ptpfv2 = 0x0140013F;
+		bfin_write_EMAC_PTP_FV2(ptpfv2);
+		/*
+		 * The default value (0xFFFC) allows the timestamping of both
+		 * received Sync messages and Delay_Req messages.
+		 */
+		ptpfv3 = 0xFFFFFFFC;
+		bfin_write_EMAC_PTP_FV3(ptpfv3);
+
+		config.rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		/* Clear all five comparison mask bits (bits[12:8]) in the
+		 * EMAC_PTP_CTL register to enable all the field matches.
+		 */
+		ptpctl &= ~0x1F00;
+		bfin_write_EMAC_PTP_CTL(ptpctl);
+		/*
+		 * Keep the default values of the EMAC_PTP_FOFF register, except set
+		 * the PTPCOF field to 0x2A.
+		 */
+		ptpfoff = 0x2A24170C;
+		bfin_write_EMAC_PTP_FOFF(ptpfoff);
+		/*
+		 * Keep the default values of the EMAC_PTP_FV1 and EMAC_PTP_FV2
+		 * registers.
+		 */
+		ptpfv1 = 0x11040800;
+		bfin_write_EMAC_PTP_FV1(ptpfv1);
+		ptpfv2 = 0x0140013F;
+		bfin_write_EMAC_PTP_FV2(ptpfv2);
+		/*
+		 * To allow the timestamping of Pdelay_Req and Pdelay_Resp, set
+		 * the value to 0xFFF0.
+		 */
+		ptpfv3 = 0xFFFFFFF0;
+		bfin_write_EMAC_PTP_FV3(ptpfv3);
+
+		config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+		/*
+		 * Clear bits 8 and 12 of the EMAC_PTP_CTL register to enable only the
+		 * EFTM and PTPCM field comparison.
+		 */
+		ptpctl &= ~0x1100;
+		bfin_write_EMAC_PTP_CTL(ptpctl);
+		/*
+		 * Keep the default values of all the fields of the EMAC_PTP_FOFF
+		 * register, except set the PTPCOF field to 0x0E.
+		 */
+		ptpfoff = 0x0E24170C;
+		bfin_write_EMAC_PTP_FOFF(ptpfoff);
+		/*
+		 * Program bits [15:0] of the EMAC_PTP_FV1 register to 0x88F7, which
+		 * corresponds to PTP messages on the MAC layer.
+		 */
+		ptpfv1 = 0x110488F7;
+		bfin_write_EMAC_PTP_FV1(ptpfv1);
+		ptpfv2 = 0x0140013F;
+		bfin_write_EMAC_PTP_FV2(ptpfv2);
+		/*
+		 * To allow the timestamping of Pdelay_Req and Pdelay_Resp
+		 * messages, set the value to 0xFFF0.
+		 */
+		ptpfv3 = 0xFFFFFFF0;
+		bfin_write_EMAC_PTP_FV3(ptpfv3);
+
+		config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	if (config.tx_type == HWTSTAMP_TX_OFF &&
+	    bfin_mac_hwtstamp_is_none(config.rx_filter)) {
+		ptpctl &= ~PTP_EN;
+		bfin_write_EMAC_PTP_CTL(ptpctl);
+
+		SSYNC();
+	} else {
+		ptpctl |= PTP_EN;
+		bfin_write_EMAC_PTP_CTL(ptpctl);
+
+		/*
+		 * clear any existing timestamp
+		 */
+		bfin_read_EMAC_PTP_RXSNAPLO();
+		bfin_read_EMAC_PTP_RXSNAPHI();
+
+		bfin_read_EMAC_PTP_TXSNAPLO();
+		bfin_read_EMAC_PTP_TXSNAPHI();
+
+		/*
+		 * Set registers so that rollover occurs soon to test this.
+		 */
+		bfin_write_EMAC_PTP_TIMELO(0x00000000);
+		bfin_write_EMAC_PTP_TIMEHI(0xFF800000);
+
+		SSYNC();
+
+		lp->compare.last_update = 0;
+		timecounter_init(&lp->clock,
+				&lp->cycles,
+				ktime_to_ns(ktime_get_real()));
+		timecompare_update(&lp->compare, 0);
+	}
+
+	lp->stamp_cfg = config;
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
+		-EFAULT : 0;
+}
+
+static void bfin_dump_hwtamp(char *s, ktime_t *hw, ktime_t *ts, struct timecompare *cmp)
+{
+	ktime_t sys = ktime_get_real();
+
+	pr_debug("%s %s hardware:%d,%d transform system:%d,%d system:%d,%d, cmp:%lld, %lld\n",
+			__func__, s, hw->tv.sec, hw->tv.nsec, ts->tv.sec, ts->tv.nsec, sys.tv.sec,
+			sys.tv.nsec, cmp->offset, cmp->skew);
+}
+
+static void bfin_tx_hwtstamp(struct net_device *netdev, struct sk_buff *skb)
+{
+	struct bfin_mac_local *lp = netdev_priv(netdev);
+	union skb_shared_tx *shtx = skb_tx(skb);
+
+	if (shtx->hardware) {
+		int timeout_cnt = MAX_TIMEOUT_CNT;
+
+		/* When doing time stamping, keep the connection to the socket
+		 * a while longer
+		 */
+		shtx->in_progress = 1;
+
+		/*
+		 * The timestamping is done at the EMAC module's MII/RMII interface
+		 * when the module sees the Start of Frame of an event message packet. This
+		 * interface is the closest possible place to the physical Ethernet transmission
+		 * medium, providing the best timing accuracy.
+		 */
+		while ((!(bfin_read_EMAC_PTP_ISTAT() & TXTL)) && (--timeout_cnt))
+			udelay(1);
+		if (timeout_cnt == 0)
+			printk(KERN_ERR DRV_NAME
+					": fails to timestamp the TX packet\n");
+		else {
+			struct skb_shared_hwtstamps shhwtstamps;
+			u64 ns;
+			u64 regval;
+
+			regval = bfin_read_EMAC_PTP_TXSNAPLO();
+			regval |= (u64)bfin_read_EMAC_PTP_TXSNAPHI() << 32;
+			memset(&shhwtstamps, 0, sizeof(shhwtstamps));
+			ns = timecounter_cyc2time(&lp->clock,
+					regval);
+			timecompare_update(&lp->compare, ns);
+			shhwtstamps.hwtstamp = ns_to_ktime(ns);
+			shhwtstamps.syststamp =
+				timecompare_transform(&lp->compare, ns);
+			skb_tstamp_tx(skb, &shhwtstamps);
+
+			bfin_dump_hwtamp("TX", &shhwtstamps.hwtstamp, &shhwtstamps.syststamp, &lp->compare);
+		}
+	}
+}
+
+static void bfin_rx_hwtstamp(struct net_device *netdev, struct sk_buff *skb)
+{
+	struct bfin_mac_local *lp = netdev_priv(netdev);
+	u32 valid;
+	u64 regval, ns;
+	struct skb_shared_hwtstamps *shhwtstamps;
+
+	if (bfin_mac_hwtstamp_is_none(lp->stamp_cfg.rx_filter))
+		return;
+
+	valid = bfin_read_EMAC_PTP_ISTAT() & RXEL;
+	if (!valid)
+		return;
+
+	shhwtstamps = skb_hwtstamps(skb);
+
+	regval = bfin_read_EMAC_PTP_RXSNAPLO();
+	regval |= (u64)bfin_read_EMAC_PTP_RXSNAPHI() << 32;
+	ns = timecounter_cyc2time(&lp->clock, regval);
+	timecompare_update(&lp->compare, ns);
+	memset(shhwtstamps, 0, sizeof(*shhwtstamps));
+	shhwtstamps->hwtstamp = ns_to_ktime(ns);
+	shhwtstamps->syststamp = timecompare_transform(&lp->compare, ns);
+
+	bfin_dump_hwtamp("RX", &shhwtstamps->hwtstamp, &shhwtstamps->syststamp, &lp->compare);
+}
+
+/*
+ * bfin_read_clock - read raw cycle counter (to be used by time counter)
+ */
+static cycle_t bfin_read_clock(const struct cyclecounter *tc)
+{
+	u64 stamp;
+
+	stamp =  bfin_read_EMAC_PTP_TIMELO();
+	stamp |= (u64)bfin_read_EMAC_PTP_TIMEHI() << 32ULL;
+
+	return stamp;
+}
+
+#define PTP_CLK 25000000
+
+static void bfin_mac_hwtstamp_init(struct net_device *netdev)
+{
+	struct bfin_mac_local *lp = netdev_priv(netdev);
+	u64 append;
+
+	/* Initialize hardware timer */
+	append = PTP_CLK * (1ULL << 32);
+	do_div(append, get_sclk());
+	bfin_write_EMAC_PTP_ADDEND((u32)append);
+
+	memset(&lp->cycles, 0, sizeof(lp->cycles));
+	lp->cycles.read = bfin_read_clock;
+	lp->cycles.mask = CLOCKSOURCE_MASK(64);
+	lp->cycles.mult = 1000000000 / PTP_CLK;
+	lp->cycles.shift = 0;
+
+	/* Synchronize our NIC clock against system wall clock */
+	memset(&lp->compare, 0, sizeof(lp->compare));
+	lp->compare.source = &lp->clock;
+	lp->compare.target = ktime_get_real;
+	lp->compare.num_samples = 10;
+
+	/* Initialize hwstamp config */
+	lp->stamp_cfg.rx_filter = HWTSTAMP_FILTER_NONE;
+	lp->stamp_cfg.tx_type = HWTSTAMP_TX_OFF;
+}
+
+#else
+# define bfin_mac_hwtstamp_is_none(cfg) 0
+# define bfin_mac_hwtstamp_init(dev)
+# define bfin_mac_hwtstamp_ioctl(dev, ifr, cmd) (-EOPNOTSUPP)
+# define bfin_rx_hwtstamp(dev, skb)
+# define bfin_tx_hwtstamp(dev, skb)
+#endif
+
 static void adjust_tx_list(void)
 {
 	int timeout_cnt = MAX_TIMEOUT_CNT;
@@ -608,18 +912,32 @@ static int bfin_mac_hard_start_xmit(struct sk_buff *skb,
 {
 	u16 *data;
 	u32 data_align = (unsigned long)(skb->data) & 0x3;
+	union skb_shared_tx *shtx = skb_tx(skb);
+
 	current_tx_ptr->skb = skb;
 
 	if (data_align == 0x2) {
 		/* move skb->data to current_tx_ptr payload */
 		data = (u16 *)(skb->data) - 1;
-				*data = (u16)(skb->len);
+		*data = (u16)(skb->len);
+		/*
+		 * When transmitting an Ethernet packet, the PTP_TSYNC module requires
+		 * a DMA_Length_Word field associated with the packet. The lower 12 bits
+		 * of this field are the length of the packet payload in bytes and the higher
+		 * 4 bits are the timestamping enable field.
+		 */
+		if (shtx->hardware)
+			*data |= 0x1000;
+
 		current_tx_ptr->desc_a.start_addr = (u32)data;
 		/* this is important! */
 		blackfin_dcache_flush_range((u32)data,
 				(u32)((u8 *)data + skb->len + 4));
 	} else {
 		*((u16 *)(current_tx_ptr->packet)) = (u16)(skb->len);
+		/* enable timestamping for the sent packet */
+		if (shtx->hardware)
+			*((u16 *)(current_tx_ptr->packet)) |= 0x1000;
 		memcpy((u8 *)(current_tx_ptr->packet + 2), skb->data,
 			skb->len);
 		current_tx_ptr->desc_a.start_addr =
@@ -653,6 +971,9 @@ static int bfin_mac_hard_start_xmit(struct sk_buff *skb,
 
 out:
 	adjust_tx_list();
+
+	bfin_tx_hwtstamp(dev, skb);
+
 	current_tx_ptr = current_tx_ptr->next;
 	dev->trans_start = jiffies;
 	dev->stats.tx_packets++;
@@ -664,9 +985,11 @@ static void bfin_mac_rx(struct net_device *dev)
 {
 	struct sk_buff *skb, *new_skb;
 	unsigned short len;
+	struct bfin_mac_local *lp __maybe_unused = netdev_priv(dev);
 
 	/* allocate a new skb for next time receive */
 	skb = current_rx_ptr->skb;
+
 	new_skb = dev_alloc_skb(PKT_BUF_SZ + NET_IP_ALIGN);
 	if (!new_skb) {
 		printk(KERN_NOTICE DRV_NAME
@@ -691,6 +1014,9 @@ static void bfin_mac_rx(struct net_device *dev)
 					 (unsigned long)skb->tail);
 
 	skb->protocol = eth_type_trans(skb, dev);
+
+	bfin_rx_hwtstamp(dev, skb);
+
 #if defined(BFIN_MAC_CSUM_OFFLOAD)
 	skb->csum = current_rx_ptr->status.ip_payload_csum;
 	skb->ip_summed = CHECKSUM_COMPLETE;
@@ -874,6 +1200,16 @@ static void bfin_mac_set_multicast_list(struct net_device *dev)
 	}
 }
 
+static int bfin_mac_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+{
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		return bfin_mac_hwtstamp_ioctl(netdev, ifr, cmd);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 /*
  * this puts the device in an inactive state
  */
@@ -958,6 +1294,7 @@ static const struct net_device_ops bfin_mac_netdev_ops = {
 	.ndo_set_mac_address	= bfin_mac_set_mac_address,
 	.ndo_tx_timeout		= bfin_mac_timeout,
 	.ndo_set_multicast_list	= bfin_mac_set_multicast_list,
+	.ndo_do_ioctl           = bfin_mac_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_change_mtu		= eth_change_mtu,
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -1049,6 +1386,8 @@ static int __devinit bfin_mac_probe(struct platform_device *pdev)
 		goto out_err_reg_ndev;
 	}
 
+	bfin_mac_hwtstamp_init(ndev);
+
 	/* now, print out the card info, in a short format.. */
 	dev_info(&pdev->dev, "%s, Version %s\n", DRV_DESC, DRV_VERSION);
 
diff --git a/drivers/net/bfin_mac.h b/drivers/net/bfin_mac.h
index 052b5dc..87c454f 100644
--- a/drivers/net/bfin_mac.h
+++ b/drivers/net/bfin_mac.h
@@ -7,6 +7,12 @@
  *
  * Licensed under the GPL-2 or later.
  */
+#ifndef _BFIN_MAC_H_
+#define _BFIN_MAC_H_
+
+#include <linux/net_tstamp.h>
+#include <linux/clocksource.h>
+#include <linux/timecompare.h>
 
 #define BFIN_MAC_CSUM_OFFLOAD
 
@@ -67,6 +73,15 @@ struct bfin_mac_local {
 
 	struct phy_device *phydev;
 	struct mii_bus *mii_bus;
+
+#if defined(CONFIG_BFIN_MAC_USE_HWSTAMP)
+	struct cyclecounter cycles;
+	struct timecounter clock;
+	struct timecompare compare;
+	struct hwtstamp_config stamp_cfg;
+#endif
 };
 
 extern void bfin_get_ether_addr(char *addr);
+
+#endif
-- 
1.7.1


^ permalink raw reply related

* [PATCH 2/9] netdev: bfin_mac: handler RX status errors
From: Mike Frysinger @ 2010-05-10 15:39 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: uclinux-dist-devel, Peter Meerwald, Graf Yang
In-Reply-To: <1273505954-32588-1-git-send-email-vapier@gentoo.org>

From: Peter Meerwald <pmeerw@pmeerw.net>

Signed-off-by: Peter Meerwald <pmeerw@pmeerw.net>
Signed-off-by: Graf Yang <graf.yang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 drivers/net/bfin_mac.c |   17 ++++++++++++++---
 1 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index 6a9519f..c888465 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -981,12 +981,25 @@ out:
 	return NETDEV_TX_OK;
 }
 
+#define RX_ERROR_MASK (RX_LONG | RX_ALIGN | RX_CRC | RX_LEN | \
+	RX_FRAG | RX_ADDR | RX_DMAO | RX_PHY | RX_LATE | RX_RANGE)
+
 static void bfin_mac_rx(struct net_device *dev)
 {
 	struct sk_buff *skb, *new_skb;
 	unsigned short len;
 	struct bfin_mac_local *lp __maybe_unused = netdev_priv(dev);
 
+	/* check if frame status word reports an error condition
+	 * we which case we simply drop the packet
+	 */
+	if (current_rx_ptr->status.status_word & RX_ERROR_MASK) {
+		printk(KERN_NOTICE DRV_NAME
+		       ": rx: receive error - packet dropped\n");
+		dev->stats.rx_dropped++;
+		goto out;
+	}
+
 	/* allocate a new skb for next time receive */
 	skb = current_rx_ptr->skb;
 
@@ -1025,11 +1038,9 @@ static void bfin_mac_rx(struct net_device *dev)
 	netif_rx(skb);
 	dev->stats.rx_packets++;
 	dev->stats.rx_bytes += len;
+out:
 	current_rx_ptr->status.status_word = 0x00000000;
 	current_rx_ptr = current_rx_ptr->next;
-
-out:
-	return;
 }
 
 /* interrupt routine to handle rx and error signal */
-- 
1.7.1


^ permalink raw reply related

* [PATCH 3/9] netdev: bfin_mac: invalid data cache only once for each new rx skb buffer
From: Mike Frysinger @ 2010-05-10 15:39 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: uclinux-dist-devel, Sonic Zhang
In-Reply-To: <1273505954-32588-1-git-send-email-vapier@gentoo.org>

From: Sonic Zhang <sonic.zhang@analog.com>

The skb buffer isn't actually used until we finish transferring and pass
it up to higher layers, so only invalidate the range once before we start
receiving actual data.  This also avoids the problem with data invalidating
on Blackfin systems -- there is no invalidate-only, just invalidate+flush.
So when running in writeback mode, there is the small (but not uncommon)
possibility of the flush overwriting valid DMA-ed data from the cache.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 drivers/net/bfin_mac.c |   13 ++++++++-----
 1 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index c888465..f9ba598 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -203,6 +203,11 @@ static int desc_list_init(void)
 			goto init_error;
 		}
 		skb_reserve(new_skb, NET_IP_ALIGN);
+		/* Invidate the data cache of skb->data range when it is write back
+		 * cache. It will prevent overwritting the new data from DMA
+		 */
+		blackfin_dcache_invalidate_range((unsigned long)new_skb->head,
+					 (unsigned long)new_skb->end);
 		r->skb = new_skb;

 		/*
@@ -1012,19 +1017,17 @@ static void bfin_mac_rx(struct net_device *dev)
 	}
 	/* reserve 2 bytes for RXDWA padding */
 	skb_reserve(new_skb, NET_IP_ALIGN);
-	current_rx_ptr->skb = new_skb;
-	current_rx_ptr->desc_a.start_addr = (unsigned long)new_skb->data - 2;
-
 	/* Invidate the data cache of skb->data range when it is write back
 	 * cache. It will prevent overwritting the new data from DMA
 	 */
 	blackfin_dcache_invalidate_range((unsigned long)new_skb->head,
 					 (unsigned long)new_skb->end);

+	current_rx_ptr->skb = new_skb;
+	current_rx_ptr->desc_a.start_addr = (unsigned long)new_skb->data - 2;
+
 	len = (unsigned short)((current_rx_ptr->status.status_word) & RX_FRLEN);
 	skb_put(skb, len);
-	blackfin_dcache_invalidate_range((unsigned long)skb->head,
-					 (unsigned long)skb->tail);

 	skb->protocol = eth_type_trans(skb, dev);

-- 
1.7.1

^ permalink raw reply related

* [PATCH 4/9 v2] netdev: bfin_mac: deduce Ethernet FCS from hardware IP payload checksum
From: Mike Frysinger @ 2010-05-10 15:39 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: uclinux-dist-devel, Sonic Zhang, Jon Kowal
In-Reply-To: <1273505954-32588-1-git-send-email-vapier@gentoo.org>

From: Sonic Zhang <sonic.zhang@analog.com>

IP checksum is based on 16-bit one's complement algorithm, so to deduce a
value from checksum is equal to add its complement.

Unfortunately, the Blackfin on-chip MAC checksum logic only works when the
IP packet has a header length of 20 bytes.  This is true for most IPv4
packets, but not for IPv6 packets or IPv4 packets which use header options.
So only use the hardware checksum when appropriate.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Jon Kowal <jon.kowal@dspecialists.de>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
v2
	- folded the ipv4 checks into this patch

 drivers/net/bfin_mac.c |   35 +++++++++++++++++++++++++++++++++--
 1 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index f9ba598..b57b5ac 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -986,6 +986,7 @@ out:
 	return NETDEV_TX_OK;
 }
 
+#define IP_HEADER_OFF  0
 #define RX_ERROR_MASK (RX_LONG | RX_ALIGN | RX_CRC | RX_LEN | \
 	RX_FRAG | RX_ADDR | RX_DMAO | RX_PHY | RX_LATE | RX_RANGE)
 
@@ -994,6 +995,10 @@ static void bfin_mac_rx(struct net_device *dev)
 	struct sk_buff *skb, *new_skb;
 	unsigned short len;
 	struct bfin_mac_local *lp __maybe_unused = netdev_priv(dev);
+#if defined(BFIN_MAC_CSUM_OFFLOAD)
+	unsigned int i;
+	unsigned char fcs[ETH_FCS_LEN + 1];
+#endif
 
 	/* check if frame status word reports an error condition
 	 * we which case we simply drop the packet
@@ -1027,6 +1032,8 @@ static void bfin_mac_rx(struct net_device *dev)
 	current_rx_ptr->desc_a.start_addr = (unsigned long)new_skb->data - 2;
 
 	len = (unsigned short)((current_rx_ptr->status.status_word) & RX_FRLEN);
+	/* Deduce Ethernet FCS length from Ethernet payload length */
+	len -= ETH_FCS_LEN;
 	skb_put(skb, len);
 
 	skb->protocol = eth_type_trans(skb, dev);
@@ -1034,8 +1041,32 @@ static void bfin_mac_rx(struct net_device *dev)
 	bfin_rx_hwtstamp(dev, skb);
 
 #if defined(BFIN_MAC_CSUM_OFFLOAD)
-	skb->csum = current_rx_ptr->status.ip_payload_csum;
-	skb->ip_summed = CHECKSUM_COMPLETE;
+	/* Checksum offloading only works for IPv4 packets with the standard IP header
+	 * length of 20 bytes, because the blackfin MAC checksum calculation is
+	 * based on that assumption. We must NOT use the calculated checksum if our
+	 * IP version or header break that assumption.
+	 */
+	if (skb->data[IP_HEADER_OFF] == 0x45) {
+		skb->csum = current_rx_ptr->status.ip_payload_csum;
+		/*
+		 * Deduce Ethernet FCS from hardware generated IP payload checksum.
+		 * IP checksum is based on 16-bit one's complement algorithm.
+		 * To deduce a value from checksum is equal to add its inversion.
+		 * If the IP payload len is odd, the inversed FCS should also
+		 * begin from odd address and leave first byte zero.
+		 */
+		if (skb->len % 2) {
+			fcs[0] = 0;
+			for (i = 0; i < ETH_FCS_LEN; i++)
+				fcs[i + 1] = ~skb->data[skb->len + i];
+			skb->csum = csum_partial(fcs, ETH_FCS_LEN + 1, skb->csum);
+		} else {
+			for (i = 0; i < ETH_FCS_LEN; i++)
+				fcs[i] = ~skb->data[skb->len + i];
+			skb->csum = csum_partial(fcs, ETH_FCS_LEN, skb->csum);
+		}
+		skb->ip_summed = CHECKSUM_COMPLETE;
+	}
 #endif
 
 	netif_rx(skb);
-- 
1.7.1


^ permalink raw reply related

* [PATCH 5/9] netdev: bfin_mac: clear RXCKS if hardware generated checksum is not enabled
From: Mike Frysinger @ 2010-05-10 15:39 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: uclinux-dist-devel, Sonic Zhang
In-Reply-To: <1273505954-32588-1-git-send-email-vapier@gentoo.org>

From: Sonic Zhang <sonic.zhang@analog.com>

Otherwise we might be get a setting mismatch from a previous module or
bootloader and what the driver currently expects.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 drivers/net/bfin_mac.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index b57b5ac..bee5d7a 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -515,10 +515,11 @@ void setup_system_regs(struct net_device *dev)
 	 * Configure checksum support and rcve frame word alignment
 	 */
 	sysctl = bfin_read_EMAC_SYSCTL();
+	sysctl |= RXDWA;
 #if defined(BFIN_MAC_CSUM_OFFLOAD)
-	sysctl |= RXDWA | RXCKS;
+	sysctl |= RXCKS;
 #else
-	sysctl |= RXDWA;
+	sysctl &= ~RXCKS;
 #endif
 	bfin_write_EMAC_SYSCTL(sysctl);

-- 
1.7.1

^ permalink raw reply related

* [PATCH 6/9] netdev: bfin_mac: add support for wake-on-lan magic packets
From: Mike Frysinger @ 2010-05-10 15:39 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: uclinux-dist-devel, Michael Hennerich
In-Reply-To: <1273505954-32588-1-git-send-email-vapier@gentoo.org>

From: Michael Hennerich <michael.hennerich@analog.com>

Note that WOL works only in PM Suspend Standby Mode (Sleep Mode).

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 drivers/net/bfin_mac.c |   76 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/net/bfin_mac.h |    3 ++
 2 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index bee5d7a..967d018 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -464,6 +464,14 @@ static int mii_probe(struct net_device *dev)
  * Ethtool support
  */
 
+/*
+ * interrupt routine for magic packet wakeup
+ */
+static irqreturn_t bfin_mac_wake_interrupt(int irq, void *dev_id)
+{
+	return IRQ_HANDLED;
+}
+
 static int
 bfin_mac_ethtool_getsettings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
@@ -498,11 +506,57 @@ static void bfin_mac_ethtool_getdrvinfo(struct net_device *dev,
 	strcpy(info->bus_info, dev_name(&dev->dev));
 }
 
+static void bfin_mac_ethtool_getwol(struct net_device *dev,
+	struct ethtool_wolinfo *wolinfo)
+{
+	struct bfin_mac_local *lp = netdev_priv(dev);
+
+	wolinfo->supported = WAKE_MAGIC;
+	wolinfo->wolopts = lp->wol;
+}
+
+static int bfin_mac_ethtool_setwol(struct net_device *dev,
+	struct ethtool_wolinfo *wolinfo)
+{
+	struct bfin_mac_local *lp = netdev_priv(dev);
+	int rc;
+
+	if (wolinfo->wolopts & (WAKE_MAGICSECURE |
+				WAKE_UCAST |
+				WAKE_MCAST |
+				WAKE_BCAST |
+				WAKE_ARP))
+		return -EOPNOTSUPP;
+
+	lp->wol = wolinfo->wolopts;
+
+	if (lp->wol && !lp->irq_wake_requested) {
+		/* register wake irq handler */
+		rc = request_irq(IRQ_MAC_WAKEDET, bfin_mac_wake_interrupt,
+				 IRQF_DISABLED, "EMAC_WAKE", dev);
+		if (rc)
+			return rc;
+		lp->irq_wake_requested = true;
+	}
+
+	if (!lp->wol && lp->irq_wake_requested) {
+		free_irq(IRQ_MAC_WAKEDET, dev);
+		lp->irq_wake_requested = false;
+	}
+
+	/* Make sure the PHY driver doesn't suspend */
+	device_init_wakeup(&dev->dev, lp->wol);
+
+	return 0;
+}
+
 static const struct ethtool_ops bfin_mac_ethtool_ops = {
 	.get_settings = bfin_mac_ethtool_getsettings,
 	.set_settings = bfin_mac_ethtool_setsettings,
 	.get_link = ethtool_op_get_link,
 	.get_drvinfo = bfin_mac_ethtool_getdrvinfo,
+	.get_wol = bfin_mac_ethtool_getwol,
+	.set_wol = bfin_mac_ethtool_setwol,
 };
 
 /**************************************************************************/
@@ -1477,9 +1531,16 @@ static int __devexit bfin_mac_remove(struct platform_device *pdev)
 static int bfin_mac_suspend(struct platform_device *pdev, pm_message_t mesg)
 {
 	struct net_device *net_dev = platform_get_drvdata(pdev);
+	struct bfin_mac_local *lp = netdev_priv(net_dev);
 
-	if (netif_running(net_dev))
-		bfin_mac_close(net_dev);
+	if (lp->wol) {
+		bfin_write_EMAC_OPMODE((bfin_read_EMAC_OPMODE() & ~TE) | RE);
+		bfin_write_EMAC_WKUP_CTL(MPKE);
+		enable_irq_wake(IRQ_MAC_WAKEDET);
+	} else {
+		if (netif_running(net_dev))
+			bfin_mac_close(net_dev);
+	}
 
 	return 0;
 }
@@ -1487,9 +1548,16 @@ static int bfin_mac_suspend(struct platform_device *pdev, pm_message_t mesg)
 static int bfin_mac_resume(struct platform_device *pdev)
 {
 	struct net_device *net_dev = platform_get_drvdata(pdev);
+	struct bfin_mac_local *lp = netdev_priv(net_dev);
 
-	if (netif_running(net_dev))
-		bfin_mac_open(net_dev);
+	if (lp->wol) {
+		bfin_write_EMAC_OPMODE(bfin_read_EMAC_OPMODE() | TE);
+		bfin_write_EMAC_WKUP_CTL(0);
+		disable_irq_wake(IRQ_MAC_WAKEDET);
+	} else {
+		if (netif_running(net_dev))
+			bfin_mac_open(net_dev);
+	}
 
 	return 0;
 }
diff --git a/drivers/net/bfin_mac.h b/drivers/net/bfin_mac.h
index 87c454f..1ae7b82 100644
--- a/drivers/net/bfin_mac.h
+++ b/drivers/net/bfin_mac.h
@@ -66,6 +66,9 @@ struct bfin_mac_local {
 	unsigned char Mac[6];	/* MAC address of the board */
 	spinlock_t lock;
 
+	int wol;		/* Wake On Lan */
+	int irq_wake_requested;
+
 	/* MII and PHY stuffs */
 	int old_link;          /* used by bf537_adjust_link */
 	int old_speed;
-- 
1.7.1


^ permalink raw reply related

* [PATCH 7/9] netdev: bfin_mac: use promiscuous flag for promiscuous mode
From: Mike Frysinger @ 2010-05-10 15:39 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: uclinux-dist-devel, Sonic Zhang
In-Reply-To: <1273505954-32588-1-git-send-email-vapier@gentoo.org>

From: Sonic Zhang <sonic.zhang@analog.com>

Rather than using the Receive All Frames (RAF) bit to enable promiscuous
mode, use the Promiscuous (PR) bit.  This lowers overhead at runtime as
we let the hardware process the packets that should actually be checked.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 drivers/net/bfin_mac.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index 967d018..6f5ee1a 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -1279,7 +1279,7 @@ static void bfin_mac_set_multicast_list(struct net_device *dev)
 	if (dev->flags & IFF_PROMISC) {
 		printk(KERN_INFO "%s: set to promisc mode\n", dev->name);
 		sysctl = bfin_read_EMAC_OPMODE();
-		sysctl |= RAF;
+		sysctl |= PR;
 		bfin_write_EMAC_OPMODE(sysctl);
 	} else if (dev->flags & IFF_ALLMULTI) {
 		/* accept all multicast */
-- 
1.7.1

^ permalink raw reply related

* [PATCH 8/9] netdev: bfin_mac: handle timeouts with the MDIO registers gracefully
From: Mike Frysinger @ 2010-05-10 15:39 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: uclinux-dist-devel
In-Reply-To: <1273505954-32588-1-git-send-email-vapier@gentoo.org>

Have the low level MDIO functions pass back up timeout information so we
don't waste time polling them multiple times when there is a problem, and
so we don't let higher layers think the device is available when it isn't.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 drivers/net/bfin_mac.c |   53 ++++++++++++++++++++++++++++++-----------------
 1 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index 6f5ee1a..0b62dbf 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -81,9 +81,6 @@ static u16 pin_req[] = P_RMII0;
 static u16 pin_req[] = P_MII0;
 #endif
 
-static void bfin_mac_disable(void);
-static void bfin_mac_enable(void);
-
 static void desc_list_free(void)
 {
 	struct net_dma_desc_rx *r;
@@ -260,7 +257,7 @@ init_error:
  * MII operations
  */
 /* Wait until the previous MDC/MDIO transaction has completed */
-static void bfin_mdio_poll(void)
+static int bfin_mdio_poll(void)
 {
 	int timeout_cnt = MAX_TIMEOUT_CNT;
 
@@ -270,22 +267,30 @@ static void bfin_mdio_poll(void)
 		if (timeout_cnt-- < 0) {
 			printk(KERN_ERR DRV_NAME
 			": wait MDC/MDIO transaction to complete timeout\n");
-			break;
+			return -ETIMEDOUT;
 		}
 	}
+
+	return 0;
 }
 
 /* Read an off-chip register in a PHY through the MDC/MDIO port */
 static int bfin_mdiobus_read(struct mii_bus *bus, int phy_addr, int regnum)
 {
-	bfin_mdio_poll();
+	int ret;
+
+	ret = bfin_mdio_poll();
+	if (ret)
+		return ret;
 
 	/* read mode */
 	bfin_write_EMAC_STAADD(SET_PHYAD((u16) phy_addr) |
 				SET_REGAD((u16) regnum) |
 				STABUSY);
 
-	bfin_mdio_poll();
+	ret = bfin_mdio_poll();
+	if (ret)
+		return ret;
 
 	return (int) bfin_read_EMAC_STADAT();
 }
@@ -294,7 +299,11 @@ static int bfin_mdiobus_read(struct mii_bus *bus, int phy_addr, int regnum)
 static int bfin_mdiobus_write(struct mii_bus *bus, int phy_addr, int regnum,
 			      u16 value)
 {
-	bfin_mdio_poll();
+	int ret;
+
+	ret = bfin_mdio_poll();
+	if (ret)
+		return ret;
 
 	bfin_write_EMAC_STADAT((u32) value);
 
@@ -304,9 +313,7 @@ static int bfin_mdiobus_write(struct mii_bus *bus, int phy_addr, int regnum,
 				STAOP |
 				STABUSY);
 
-	bfin_mdio_poll();
-
-	return 0;
+	return bfin_mdio_poll();
 }
 
 static int bfin_mdiobus_reset(struct mii_bus *bus)
@@ -1181,8 +1188,9 @@ static void bfin_mac_disable(void)
 /*
  * Enable Interrupts, Receive, and Transmit
  */
-static void bfin_mac_enable(void)
+static int bfin_mac_enable(void)
 {
+	int ret;
 	u32 opmode;
 
 	pr_debug("%s: %s\n", DRV_NAME, __func__);
@@ -1192,7 +1200,9 @@ static void bfin_mac_enable(void)
 	bfin_write_DMA1_CONFIG(rx_list_head->desc_a.config);
 
 	/* Wait MII done */
-	bfin_mdio_poll();
+	ret = bfin_mdio_poll();
+	if (ret)
+		return ret;
 
 	/* We enable only RX here */
 	/* ASTP   : Enable Automatic Pad Stripping
@@ -1216,6 +1226,8 @@ static void bfin_mac_enable(void)
 #endif
 	/* Turn on the EMAC rx */
 	bfin_write_EMAC_OPMODE(opmode);
+
+	return 0;
 }
 
 /* Our watchdog timed out. Called by the networking layer */
@@ -1330,7 +1342,7 @@ static void bfin_mac_shutdown(struct net_device *dev)
 static int bfin_mac_open(struct net_device *dev)
 {
 	struct bfin_mac_local *lp = netdev_priv(dev);
-	int retval;
+	int ret;
 	pr_debug("%s: %s\n", dev->name, __func__);
 
 	/*
@@ -1344,18 +1356,21 @@ static int bfin_mac_open(struct net_device *dev)
 	}
 
 	/* initial rx and tx list */
-	retval = desc_list_init();
-
-	if (retval)
-		return retval;
+	ret = desc_list_init();
+	if (ret)
+		return ret;
 
 	phy_start(lp->phydev);
 	phy_write(lp->phydev, MII_BMCR, BMCR_RESET);
 	setup_system_regs(dev);
 	setup_mac_addr(dev->dev_addr);
+
 	bfin_mac_disable();
-	bfin_mac_enable();
+	ret = bfin_mac_enable();
+	if (ret)
+		return ret;
 	pr_debug("hardware init finished\n");
+
 	netif_start_queue(dev);
 	netif_carrier_on(dev);
 
-- 
1.7.1


^ permalink raw reply related

* [PATCH 9/9] netdev: bfin_mac: check for mii_bus platform data
From: Mike Frysinger @ 2010-05-10 15:39 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: uclinux-dist-devel, Sonic Zhang
In-Reply-To: <1273505954-32588-1-git-send-email-vapier@gentoo.org>

From: Sonic Zhang <sonic.zhang@analog.com>

If the platform data for the mii_bus is missing, gracefully error out
rather than deference NULL pointers.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
---
 drivers/net/bfin_mac.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index 0b62dbf..0f0aa57 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -1469,6 +1469,11 @@ static int __devinit bfin_mac_probe(struct platform_device *pdev)
 	}
 	pd = pdev->dev.platform_data;
 	lp->mii_bus = platform_get_drvdata(pd);
+	if (!lp->mii_bus) {
+		dev_err(&pdev->dev, "Cannot get mii_bus!\n");
+		rc = -ENODEV;
+		goto out_err_mii_bus_probe;
+	}
 	lp->mii_bus->priv = ndev;
 
 	rc = mii_probe(ndev);
@@ -1514,6 +1519,7 @@ out_err_request_irq:
 out_err_mii_probe:
 	mdiobus_unregister(lp->mii_bus);
 	mdiobus_free(lp->mii_bus);
+out_err_mii_bus_probe:
 	peripheral_free_list(pin_req);
 out_err_probe_mac:
 	platform_set_drvdata(pdev, NULL);
-- 
1.7.1


^ permalink raw reply related

* Re: mmotm 2010-04-28 - RCU whinges
From: Patrick McHardy @ 2010-05-10 15:40 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, Valdis.Kletnieks, akpm, peterz, linux-kernel,
	netfilter-devel, netdev, paulmck
In-Reply-To: <20100502.225509.233880079.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 897 bytes --]

David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 03 May 2010 07:43:56 +0200
> 
>> Le lundi 03 mai 2010 à 07:41 +0200, Eric Dumazet a écrit :
>>
>>> Oops scratch that, I'll resend a correct version.
>>>
>>>
>> Sorry, patch _is_ fine, I had one brain collapse when re-reading it, I
>> thought a different mutex was in use in one of the functions.
> 
> Ok, Patrick please review, thanks.

Actually we don't need the rcu_dereference() calls at all since
registration and unregistration are protected by the mutexes.

I queued this patch in nf-next, the only reason why I haven't
submitted it yet is that I was unable to get git to cleanly export
only the proper set of patches meant for -next due to a few merges,
it insists on including 5 patches already merged upstream. If you
don't mind ignoring the first 5 patches in the series, I'll send a
pull request tonight.



[-- Attachment #2: x --]
[-- Type: text/plain, Size: 3840 bytes --]

commit ed86308f6179d8fa6151c2d0f652aad0091548e2
Author: Patrick McHardy <kaber@trash.net>
Date:   Fri Apr 9 16:42:15 2010 +0200

    netfilter: remove invalid rcu_dereference() calls
    
    The CONFIG_PROVE_RCU option discovered a few invalid uses of
    rcu_dereference() in netfilter. In all these cases, the code code
    intends to check whether a pointer is already assigned when
    performing registration or whether the assigned pointer matches
    when performing unregistration. The entire registration/
    unregistration is protected by a mutex, so we don't need the
    rcu_dereference() calls.
    
    Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
    Tested-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index d5a9bcd..849614a 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -81,11 +81,9 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
 int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
 {
 	int ret = 0;
-	struct nf_ct_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_conntrack_event_cb);
-	if (notify != NULL) {
+	if (nf_conntrack_event_cb != NULL) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -101,11 +99,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
 
 void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new)
 {
-	struct nf_ct_event_notifier *notify;
-
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_conntrack_event_cb);
-	BUG_ON(notify != new);
+	BUG_ON(nf_conntrack_event_cb != new);
 	rcu_assign_pointer(nf_conntrack_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
 }
@@ -114,11 +109,9 @@ EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
 int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new)
 {
 	int ret = 0;
-	struct nf_exp_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_expect_event_cb);
-	if (notify != NULL) {
+	if (nf_expect_event_cb != NULL) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -134,11 +127,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
 
 void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
 {
-	struct nf_exp_event_notifier *notify;
-
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_expect_event_cb);
-	BUG_ON(notify != new);
+	BUG_ON(nf_expect_event_cb != new);
 	rcu_assign_pointer(nf_expect_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
 }
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 015725a..908f599 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -35,7 +35,6 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
 /* return EEXIST if the same logger is registred, 0 on success. */
 int nf_log_register(u_int8_t pf, struct nf_logger *logger)
 {
-	const struct nf_logger *llog;
 	int i;
 
 	if (pf >= ARRAY_SIZE(nf_loggers))
@@ -52,8 +51,7 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
 	} else {
 		/* register at end of list to honor first register win */
 		list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
-		llog = rcu_dereference(nf_loggers[pf]);
-		if (llog == NULL)
+		if (nf_loggers[pf] == NULL)
 			rcu_assign_pointer(nf_loggers[pf], logger);
 	}
 
@@ -65,13 +63,11 @@ EXPORT_SYMBOL(nf_log_register);
 
 void nf_log_unregister(struct nf_logger *logger)
 {
-	const struct nf_logger *c_logger;
 	int i;
 
 	mutex_lock(&nf_log_mutex);
 	for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
-		c_logger = rcu_dereference(nf_loggers[i]);
-		if (c_logger == logger)
+		if (nf_loggers[i] == logger)
 			rcu_assign_pointer(nf_loggers[i], NULL);
 		list_del(&logger->list[i]);
 	}

^ permalink raw reply related

* Re: mmotm 2010-04-28 - RCU whinges
From: Eric Dumazet @ 2010-05-10 15:56 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, Valdis.Kletnieks, akpm, peterz, linux-kernel,
	netfilter-devel, netdev, paulmck
In-Reply-To: <4BE8290A.2080707@trash.net>

Le lundi 10 mai 2010 à 17:40 +0200, Patrick McHardy a écrit :
> David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Mon, 03 May 2010 07:43:56 +0200
> > 
> >> Le lundi 03 mai 2010 à 07:41 +0200, Eric Dumazet a écrit :
> >>
> >>> Oops scratch that, I'll resend a correct version.
> >>>
> >>>
> >> Sorry, patch _is_ fine, I had one brain collapse when re-reading it, I
> >> thought a different mutex was in use in one of the functions.
> > 
> > Ok, Patrick please review, thanks.
> 
> Actually we don't need the rcu_dereference() calls at all since
> registration and unregistration are protected by the mutexes.
> 
> I queued this patch in nf-next, the only reason why I haven't
> submitted it yet is that I was unable to get git to cleanly export
> only the proper set of patches meant for -next due to a few merges,
> it insists on including 5 patches already merged upstream. If you
> don't mind ignoring the first 5 patches in the series, I'll send a
> pull request tonight.
> 


This will clash with upcoming RCU patches, where rcu protected pointer
cannot be directly accessed without lockdep splats.

We will need one day or another a rcu_...(nf_conntrack_event_cb)

> 
> pièce jointe document texte brut (x)
> commit ed86308f6179d8fa6151c2d0f652aad0091548e2
> Author: Patrick McHardy <kaber@trash.net>
> Date:   Fri Apr 9 16:42:15 2010 +0200
> 
>     netfilter: remove invalid rcu_dereference() calls
>     
>     The CONFIG_PROVE_RCU option discovered a few invalid uses of
>     rcu_dereference() in netfilter. In all these cases, the code code
>     intends to check whether a pointer is already assigned when
>     performing registration or whether the assigned pointer matches
>     when performing unregistration. The entire registration/
>     unregistration is protected by a mutex, so we don't need the
>     rcu_dereference() calls.
>     
>     Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
>     Tested-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
>     Signed-off-by: Patrick McHardy <kaber@trash.net>
> 
> diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
> index d5a9bcd..849614a 100644
> --- a/net/netfilter/nf_conntrack_ecache.c
> +++ b/net/netfilter/nf_conntrack_ecache.c
> @@ -81,11 +81,9 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
>  int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
>  {
>  	int ret = 0;
> -	struct nf_ct_event_notifier *notify;
>  
>  	mutex_lock(&nf_ct_ecache_mutex);
> -	notify = rcu_dereference(nf_conntrack_event_cb);
> -	if (notify != NULL) {
> +	if (nf_conntrack_event_cb != NULL) {
>  		ret = -EBUSY;
>  		goto out_unlock;
>  	}
> @@ -101,11 +99,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
>  
>  void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new)
>  {
> -	struct nf_ct_event_notifier *notify;
> -
>  	mutex_lock(&nf_ct_ecache_mutex);
> -	notify = rcu_dereference(nf_conntrack_event_cb);
> -	BUG_ON(notify != new);
> +	BUG_ON(nf_conntrack_event_cb != new);
>  	rcu_assign_pointer(nf_conntrack_event_cb, NULL);
>  	mutex_unlock(&nf_ct_ecache_mutex);
>  }
> @@ -114,11 +109,9 @@ EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
>  int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new)
>  {
>  	int ret = 0;
> -	struct nf_exp_event_notifier *notify;
>  
>  	mutex_lock(&nf_ct_ecache_mutex);
> -	notify = rcu_dereference(nf_expect_event_cb);
> -	if (notify != NULL) {
> +	if (nf_expect_event_cb != NULL) {
>  		ret = -EBUSY;
>  		goto out_unlock;
>  	}
> @@ -134,11 +127,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
>  
>  void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
>  {
> -	struct nf_exp_event_notifier *notify;
> -
>  	mutex_lock(&nf_ct_ecache_mutex);
> -	notify = rcu_dereference(nf_expect_event_cb);
> -	BUG_ON(notify != new);
> +	BUG_ON(nf_expect_event_cb != new);
>  	rcu_assign_pointer(nf_expect_event_cb, NULL);
>  	mutex_unlock(&nf_ct_ecache_mutex);
>  }
> diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
> index 015725a..908f599 100644
> --- a/net/netfilter/nf_log.c
> +++ b/net/netfilter/nf_log.c
> @@ -35,7 +35,6 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
>  /* return EEXIST if the same logger is registred, 0 on success. */
>  int nf_log_register(u_int8_t pf, struct nf_logger *logger)
>  {
> -	const struct nf_logger *llog;
>  	int i;
>  
>  	if (pf >= ARRAY_SIZE(nf_loggers))
> @@ -52,8 +51,7 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
>  	} else {
>  		/* register at end of list to honor first register win */
>  		list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
> -		llog = rcu_dereference(nf_loggers[pf]);
> -		if (llog == NULL)
> +		if (nf_loggers[pf] == NULL)
>  			rcu_assign_pointer(nf_loggers[pf], logger);
>  	}
>  
> @@ -65,13 +63,11 @@ EXPORT_SYMBOL(nf_log_register);
>  
>  void nf_log_unregister(struct nf_logger *logger)
>  {
> -	const struct nf_logger *c_logger;
>  	int i;
>  
>  	mutex_lock(&nf_log_mutex);
>  	for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
> -		c_logger = rcu_dereference(nf_loggers[i]);
> -		if (c_logger == logger)
> +		if (nf_loggers[i] == logger)
>  			rcu_assign_pointer(nf_loggers[i], NULL);
>  		list_del(&logger->list[i]);
>  	}

^ permalink raw reply

* Re: mmotm 2010-04-28 - RCU whinges
From: Eric Dumazet @ 2010-05-10 15:57 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, Valdis.Kletnieks, akpm, peterz, linux-kernel,
	netfilter-devel, netdev, paulmck
In-Reply-To: <1273506968.2221.19.camel@edumazet-laptop>

Le lundi 10 mai 2010 à 17:56 +0200, Eric Dumazet a écrit :
> Le lundi 10 mai 2010 à 17:40 +0200, Patrick McHardy a écrit :
> > David Miller wrote:
> > > From: Eric Dumazet <eric.dumazet@gmail.com>
> > > Date: Mon, 03 May 2010 07:43:56 +0200
> > > 
> > >> Le lundi 03 mai 2010 à 07:41 +0200, Eric Dumazet a écrit :
> > >>
> > >>> Oops scratch that, I'll resend a correct version.
> > >>>
> > >>>
> > >> Sorry, patch _is_ fine, I had one brain collapse when re-reading it, I
> > >> thought a different mutex was in use in one of the functions.
> > > 
> > > Ok, Patrick please review, thanks.
> > 
> > Actually we don't need the rcu_dereference() calls at all since
> > registration and unregistration are protected by the mutexes.
> > 
> > I queued this patch in nf-next, the only reason why I haven't
> > submitted it yet is that I was unable to get git to cleanly export
> > only the proper set of patches meant for -next due to a few merges,
> > it insists on including 5 patches already merged upstream. If you
> > don't mind ignoring the first 5 patches in the series, I'll send a
> > pull request tonight.
> > 
> 
> 
> This will clash with upcoming RCU patches, where rcu protected pointer
> cannot be directly accessed without lockdep splats.
> 

Sorry, I meant sparse here, not lockdep.

> We will need one day or another a rcu_...(nf_conntrack_event_cb)

^ permalink raw reply

* Re: mmotm 2010-04-28 - RCU whinges
From: Paul E. McKenney @ 2010-05-10 15:57 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, eric.dumazet, Valdis.Kletnieks, akpm, peterz,
	linux-kernel, netfilter-devel, netdev
In-Reply-To: <4BE8290A.2080707@trash.net>

On Mon, May 10, 2010 at 05:40:58PM +0200, Patrick McHardy wrote:
> David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Mon, 03 May 2010 07:43:56 +0200
> > 
> >> Le lundi 03 mai 2010 à 07:41 +0200, Eric Dumazet a écrit :
> >>
> >>> Oops scratch that, I'll resend a correct version.
> >>>
> >>>
> >> Sorry, patch _is_ fine, I had one brain collapse when re-reading it, I
> >> thought a different mutex was in use in one of the functions.
> > 
> > Ok, Patrick please review, thanks.
> 
> Actually we don't need the rcu_dereference() calls at all since
> registration and unregistration are protected by the mutexes.

The best approach in that case is rcu_dereference_protected() listing
the lock that must be held.  Of course, your code, so your choice.

						Thanx, Paul

> I queued this patch in nf-next, the only reason why I haven't
> submitted it yet is that I was unable to get git to cleanly export
> only the proper set of patches meant for -next due to a few merges,
> it insists on including 5 patches already merged upstream. If you
> don't mind ignoring the first 5 patches in the series, I'll send a
> pull request tonight.
> 
> 

> commit ed86308f6179d8fa6151c2d0f652aad0091548e2
> Author: Patrick McHardy <kaber@trash.net>
> Date:   Fri Apr 9 16:42:15 2010 +0200
> 
>     netfilter: remove invalid rcu_dereference() calls
>     
>     The CONFIG_PROVE_RCU option discovered a few invalid uses of
>     rcu_dereference() in netfilter. In all these cases, the code code
>     intends to check whether a pointer is already assigned when
>     performing registration or whether the assigned pointer matches
>     when performing unregistration. The entire registration/
>     unregistration is protected by a mutex, so we don't need the
>     rcu_dereference() calls.
>     
>     Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
>     Tested-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
>     Signed-off-by: Patrick McHardy <kaber@trash.net>
> 
> diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
> index d5a9bcd..849614a 100644
> --- a/net/netfilter/nf_conntrack_ecache.c
> +++ b/net/netfilter/nf_conntrack_ecache.c
> @@ -81,11 +81,9 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
>  int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
>  {
>  	int ret = 0;
> -	struct nf_ct_event_notifier *notify;
> 
>  	mutex_lock(&nf_ct_ecache_mutex);
> -	notify = rcu_dereference(nf_conntrack_event_cb);
> -	if (notify != NULL) {
> +	if (nf_conntrack_event_cb != NULL) {
>  		ret = -EBUSY;
>  		goto out_unlock;
>  	}
> @@ -101,11 +99,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
> 
>  void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new)
>  {
> -	struct nf_ct_event_notifier *notify;
> -
>  	mutex_lock(&nf_ct_ecache_mutex);
> -	notify = rcu_dereference(nf_conntrack_event_cb);
> -	BUG_ON(notify != new);
> +	BUG_ON(nf_conntrack_event_cb != new);
>  	rcu_assign_pointer(nf_conntrack_event_cb, NULL);
>  	mutex_unlock(&nf_ct_ecache_mutex);
>  }
> @@ -114,11 +109,9 @@ EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
>  int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new)
>  {
>  	int ret = 0;
> -	struct nf_exp_event_notifier *notify;
> 
>  	mutex_lock(&nf_ct_ecache_mutex);
> -	notify = rcu_dereference(nf_expect_event_cb);
> -	if (notify != NULL) {
> +	if (nf_expect_event_cb != NULL) {
>  		ret = -EBUSY;
>  		goto out_unlock;
>  	}
> @@ -134,11 +127,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
> 
>  void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
>  {
> -	struct nf_exp_event_notifier *notify;
> -
>  	mutex_lock(&nf_ct_ecache_mutex);
> -	notify = rcu_dereference(nf_expect_event_cb);
> -	BUG_ON(notify != new);
> +	BUG_ON(nf_expect_event_cb != new);
>  	rcu_assign_pointer(nf_expect_event_cb, NULL);
>  	mutex_unlock(&nf_ct_ecache_mutex);
>  }
> diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
> index 015725a..908f599 100644
> --- a/net/netfilter/nf_log.c
> +++ b/net/netfilter/nf_log.c
> @@ -35,7 +35,6 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
>  /* return EEXIST if the same logger is registred, 0 on success. */
>  int nf_log_register(u_int8_t pf, struct nf_logger *logger)
>  {
> -	const struct nf_logger *llog;
>  	int i;
> 
>  	if (pf >= ARRAY_SIZE(nf_loggers))
> @@ -52,8 +51,7 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
>  	} else {
>  		/* register at end of list to honor first register win */
>  		list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
> -		llog = rcu_dereference(nf_loggers[pf]);
> -		if (llog == NULL)
> +		if (nf_loggers[pf] == NULL)
>  			rcu_assign_pointer(nf_loggers[pf], logger);
>  	}
> 
> @@ -65,13 +63,11 @@ EXPORT_SYMBOL(nf_log_register);
> 
>  void nf_log_unregister(struct nf_logger *logger)
>  {
> -	const struct nf_logger *c_logger;
>  	int i;
> 
>  	mutex_lock(&nf_log_mutex);
>  	for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
> -		c_logger = rcu_dereference(nf_loggers[i]);
> -		if (c_logger == logger)
> +		if (nf_loggers[i] == logger)
>  			rcu_assign_pointer(nf_loggers[i], NULL);
>  		list_del(&logger->list[i]);
>  	}


^ permalink raw reply

* Re: [RFC] net: change bridge/macvlan hook to be be generic
From: Patrick McHardy @ 2010-05-10 15:58 UTC (permalink / raw)
  To: Scott Feldman; +Cc: Stephen Hemminger, David Miller, netdev
In-Reply-To: <C80610E0.2E2EC%scofeldm@cisco.com>

Scott Feldman wrote:
> On 5/4/10 3:37 PM, "Stephen Hemminger" <shemminger@vyatta.com> wrote:
> 
>> The existing macvlan and bridge have special hooks in the packet input
>> path. This patch changes it to a generic hook chain, like the packet type
>> processing. I have been wanting to look into flow based switching, etc...
> 
> Can this be further simplified by saying that a netdev can only be hooked by
> one mux (macvlan, bridge, etc) at any given time, so there is never more
> than one element in the hook chain.  If so, then we just need a single hook,
> not a chain.  
> 
> It seems odd to me that a dev would have both macvlan_port != NULL and
> br_port != NULL.  Can dev be in a macvlan and a bridge at the same time?

Yes, its possible to use the ebtables broute table to have packets
selectively delivered upwards in the stack instead of briding them.
"upwards" could also mean to a macvlan device.

I don't know if anyone is actually doing this, but its a configuration
which currently should work.

^ permalink raw reply

* Re: mmotm 2010-04-28 - RCU whinges
From: Patrick McHardy @ 2010-05-10 16:03 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Valdis.Kletnieks, akpm, peterz, linux-kernel,
	netfilter-devel, netdev, paulmck
In-Reply-To: <1273506968.2221.19.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le lundi 10 mai 2010 à 17:40 +0200, Patrick McHardy a écrit :
>> David Miller wrote:
>>> From: Eric Dumazet <eric.dumazet@gmail.com>
>>> Date: Mon, 03 May 2010 07:43:56 +0200
>>>
>>>> Le lundi 03 mai 2010 à 07:41 +0200, Eric Dumazet a écrit :
>>>>
>>>>> Oops scratch that, I'll resend a correct version.
>>>>>
>>>>>
>>>> Sorry, patch _is_ fine, I had one brain collapse when re-reading it, I
>>>> thought a different mutex was in use in one of the functions.
>>> Ok, Patrick please review, thanks.
>> Actually we don't need the rcu_dereference() calls at all since
>> registration and unregistration are protected by the mutexes.
>>
>> I queued this patch in nf-next, the only reason why I haven't
>> submitted it yet is that I was unable to get git to cleanly export
>> only the proper set of patches meant for -next due to a few merges,
>> it insists on including 5 patches already merged upstream. If you
>> don't mind ignoring the first 5 patches in the series, I'll send a
>> pull request tonight.
>>
> 
> This will clash with upcoming RCU patches, where rcu protected pointer
> cannot be directly accessed without lockdep splats.
> 
> We will need one day or another a rcu_...(nf_conntrack_event_cb)

Thanks for the information, I didn't realize that when looking at
those patches. So I guess the correct fix once those patches are
merged would be to use rcu_assign_protected() and rcu_access_pointer().
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: mmotm 2010-04-28 - RCU whinges
From: Patrick McHardy @ 2010-05-10 16:04 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Valdis.Kletnieks, akpm, peterz, linux-kernel,
	netfilter-devel, netdev, paulmck
In-Reply-To: <4BE82E39.6080603@trash.net>

Patrick McHardy wrote:
> Eric Dumazet wrote:
>> Le lundi 10 mai 2010 à 17:40 +0200, Patrick McHardy a écrit :
>>> David Miller wrote:
>>>> From: Eric Dumazet <eric.dumazet@gmail.com>
>>>> Date: Mon, 03 May 2010 07:43:56 +0200
>>>>
>>>>> Le lundi 03 mai 2010 à 07:41 +0200, Eric Dumazet a écrit :
>>>>>
>>>>>> Oops scratch that, I'll resend a correct version.
>>>>>>
>>>>>>
>>>>> Sorry, patch _is_ fine, I had one brain collapse when re-reading it, I
>>>>> thought a different mutex was in use in one of the functions.
>>>> Ok, Patrick please review, thanks.
>>> Actually we don't need the rcu_dereference() calls at all since
>>> registration and unregistration are protected by the mutexes.
>>>
>>> I queued this patch in nf-next, the only reason why I haven't
>>> submitted it yet is that I was unable to get git to cleanly export
>>> only the proper set of patches meant for -next due to a few merges,
>>> it insists on including 5 patches already merged upstream. If you
>>> don't mind ignoring the first 5 patches in the series, I'll send a
>>> pull request tonight.
>>>
>> This will clash with upcoming RCU patches, where rcu protected pointer
>> cannot be directly accessed without lockdep splats.
>>
>> We will need one day or another a rcu_...(nf_conntrack_event_cb)
> 
> Thanks for the information, I didn't realize that when looking at
> those patches. So I guess the correct fix once those patches are
> merged would be to use rcu_assign_protected() and rcu_access_pointer().

Ah, and that's what you did. Sorry for the confusion :)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] drivers/net/usb/asix.c Fix unaligned access
From: Neil Jones @ 2010-05-10 16:06 UTC (permalink / raw)
  To: netdev
In-Reply-To: <s2q91f916b91005060413pe1a1799etdea5ad730841a04e@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3887 bytes --]

From b277dbc256de7b1a8c47ca374914c097ff4cdd50 Mon Sep 17 00:00:00 2001
From: Neil Jones <NeilJay@gmail.com>
Date: Thu, 6 May 2010 11:20:53 +0100
Subject: [PATCH] drivers/net/usb/asix.c:        Fix unaligned accesses

Using this driver can cause unaligned accesses in the IP layer
This has been fixed by aligning the skb data correctly using the
spare room left over by the 4 byte header inserted between packets
by the device.

Signed-off-by: Neil Jones <NeilJay@gmail.com>
---
 drivers/net/usb/asix.c |   37 ++++++++++++++++++++++++++++++++++++-
 1 files changed, 36 insertions(+), 1 deletions(-)

diff --git a/drivers/net/usb/asix.c b/drivers/net/usb/asix.c
index a516185..5b4f0df 100644
--- a/drivers/net/usb/asix.c
+++ b/drivers/net/usb/asix.c
@@ -319,16 +319,51 @@ static int asix_rx_fixup(struct usbnet *dev,
struct sk_buff *skb)
               /* get the packet length */
               size = (u16) (header & 0x0000ffff);

-               if ((skb->len) - ((size + 1) & 0xfffe) == 0)
+               if ((skb->len) - ((size + 1) & 0xfffe) == 0) {
+                       u8 alignment = (u32)skb->data & 0x3;
+                       if (alignment != 0x2) {
+                               /*
+                                * not 16bit aligned so use the room provided by
+                                * the 32 bit header to align the data
+                                *
+                                * note we want 16bit alignment as MAC header is
+                                * 14bytes thus ip header will be aligned on
+                                * 32bit boundary so accessing ipheader elements
+                                * using a cast to struct ip header wont cause
+                                * an unaligned accesses.
+                                */
+                               u8 realignment = (alignment + 2) & 0x3;
+                               memmove(skb->data - realignment,
+                                       skb->data,
+                                       size);
+                               skb->data -= realignment;
+                               skb_set_tail_pointer(skb, size);
+                       }
                       return 2;
+               }
+
+
               if (size > ETH_FRAME_LEN) {
                       deverr(dev,"asix_rx_fixup() Bad RX Length %d", size);
                       return 0;
               }
               ax_skb = skb_clone(skb, GFP_ATOMIC);
               if (ax_skb) {
+                       u8 alignment = (u32)packet & 0x3;
                       ax_skb->len = size;
+
+                       if (alignment != 0x2) {
+                               /*
+                                * not 16bit aligned use the room provided by
+                                * the 32 bit header to align the data
+                                */
+                               u8 realignment = (alignment + 2) & 0x3;
+                               memmove(packet - realignment, packet, size);
+                               packet -= realignment;
+                       }
                       ax_skb->data = packet;
+
+
                       skb_set_tail_pointer(ax_skb, size);
                       usbnet_skb_return(dev, ax_skb);
               } else {
--
1.5.5.2

[-- Attachment #2: 0001-drivers-net-usb-asix.c-Fix-unaligned-accesses.patch --]
[-- Type: application/octet-stream, Size: 2284 bytes --]

From b277dbc256de7b1a8c47ca374914c097ff4cdd50 Mon Sep 17 00:00:00 2001
From: Neil Jones <njones@lofty.le.imgtec.org>
Date: Thu, 6 May 2010 11:20:53 +0100
Subject: [PATCH] drivers/net/usb/asix.c:	Fix unaligned accesses

Using this driver can cause unaligned accesses in the IP layer
This has been fixed by aligning the skb data correctly using the
spare room left over by the 4 byte header inserted between packets
by the device.

Signed-off-by: Neil Jones <NeilJay@gmail.com>
---
 drivers/net/usb/asix.c |   37 ++++++++++++++++++++++++++++++++++++-
 1 files changed, 36 insertions(+), 1 deletions(-)

diff --git a/drivers/net/usb/asix.c b/drivers/net/usb/asix.c
index a516185..5b4f0df 100644
--- a/drivers/net/usb/asix.c
+++ b/drivers/net/usb/asix.c
@@ -319,16 +319,51 @@ static int asix_rx_fixup(struct usbnet *dev, struct sk_buff *skb)
 		/* get the packet length */
 		size = (u16) (header & 0x0000ffff);
 
-		if ((skb->len) - ((size + 1) & 0xfffe) == 0)
+		if ((skb->len) - ((size + 1) & 0xfffe) == 0) {
+			u8 alignment = (u32)skb->data & 0x3;
+			if (alignment != 0x2) {
+				/*
+				 * not 16bit aligned so use the room provided by
+				 * the 32 bit header to align the data
+				 *
+				 * note we want 16bit alignment as MAC header is
+				 * 14bytes thus ip header will be aligned on
+				 * 32bit boundary so accessing ipheader elements
+				 * using a cast to struct ip header wont cause
+				 * an unaligned accesses.
+				 */
+				u8 realignment = (alignment + 2) & 0x3;
+				memmove(skb->data - realignment,
+					skb->data,
+					size);
+				skb->data -= realignment;
+				skb_set_tail_pointer(skb, size);
+			}
 			return 2;
+		}
+
+
 		if (size > ETH_FRAME_LEN) {
 			deverr(dev,"asix_rx_fixup() Bad RX Length %d", size);
 			return 0;
 		}
 		ax_skb = skb_clone(skb, GFP_ATOMIC);
 		if (ax_skb) {
+			u8 alignment = (u32)packet & 0x3;
 			ax_skb->len = size;
+
+			if (alignment != 0x2) {
+				/*
+				 * not 16bit aligned use the room provided by
+				 * the 32 bit header to align the data
+				 */
+				u8 realignment = (alignment + 2) & 0x3;
+				memmove(packet - realignment, packet, size);
+				packet -= realignment;
+			}
 			ax_skb->data = packet;
+
+
 			skb_set_tail_pointer(ax_skb, size);
 			usbnet_skb_return(dev, ax_skb);
 		} else {
-- 
1.5.5.2


^ permalink raw reply related

* Re: 2.6.34-rc6-git6: Reported regressions from 2.6.33
From: Nick Bowler @ 2010-05-10 16:07 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: DRI, Linux SCSI List, Network Development, Linux Wireless List,
	Linux Kernel Mailing List, Linux ACPI, Andrew Morton,
	Kernel Testers List, Linus Torvalds, Linux PM List,
	Maciej Rutecki
In-Reply-To: <10aqDnTJcLO.A.mLE.48y5LB@chimera>

On 23:13 Sun 09 May     , Rafael J. Wysocki wrote:
> This message contains a list of some regressions from 2.6.33,
> for which there are no fixes in the mainline known to the tracking team.
> If any of them have been fixed already, please let us know.
> 
> If you know of any other unresolved regressions from 2.6.33, please let us
> know either and we'll add them to the list.  Also, please let us know
> if any of the entries below are invalid.

Seems that

  r600 CS checker rejects narrow FBO renderbuffers
  https://bugs.freedesktop.org/show_bug.cgi?id=27609

never got added to the list.  It is still an issue as of 2.6.34-rc7.

-- 
Nick Bowler, Elliptic Technologies (http://www.elliptictech.com/)

^ permalink raw reply

* Re: [PATCH] virtif: initial interface extensions
From: Stefan Berger @ 2010-05-10 15:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <201005090120.19958.arnd@arndb.de>

Arnd Bergmann <arnd <at> arndb.de> writes:

[...]

> +	if (tb[IFLA_VIRTIF]) {
> +		struct ifla_virtif_port_profile *ivp;
> +		struct nlattr *virtif[IFLA_VIRTIF_MAX+1];
> +		u32 vf;
> +
> +		err = nla_parse_nested(virtif, IFLA_VIRTIF_MAX,
> +				       tb[IFLA_VIRTIF], ifla_virtif_policy);
> +		if (err < 0)
> +			return err;
> +
> +		if (!virtif[IFLA_VIRTIF_VF] || !virtif[IFLA_VIRTIF_PORT_PROFILE])
> +			goto novirtif; /* IFLA_VIRTIF may be directed at user space */

In what case would the IFLA_VIRTIF_PORT_PROFILE be provided? Would libvirt for
example need to be aware of whether the Ethernet device can handle the setup
protocol via its firmware and in this case provide the port profile parameter
and in other cases provide other parameters? Certainly the user or upper layer
management software would have to know it when creating the domain XML and in
fact different types of parameters were needed. Obviously we should have one
common set of (XML) parameters that go into the netlink message and that can be
handled by the kernel driver if the firmware knows how to handle it or by
LLDPAD. Libvirt would send the parameters via netlink message to trigger the
setup protocol and the message may be received by kernel and LLDPAD. From what I
can see LLDPAD also may need a way to probe the kernel driver whether it handled
the setup protocol via firmware on a given interface, which may or may not be
true for all interfaces, but may be necessary to avoid triggering the setup
protocol twice.

   Stefan

^ permalink raw reply

* Re: mmotm 2010-04-28 - RCU whinges
From: David Miller @ 2010-05-10 16:12 UTC (permalink / raw)
  To: kaber
  Cc: eric.dumazet, Valdis.Kletnieks, akpm, peterz, linux-kernel,
	netfilter-devel, netdev, paulmck
In-Reply-To: <4BE8290A.2080707@trash.net>

From: Patrick McHardy <kaber@trash.net>
Date: Mon, 10 May 2010 17:40:58 +0200

> I queued this patch in nf-next, the only reason why I haven't
> submitted it yet is that I was unable to get git to cleanly export
> only the proper set of patches meant for -next due to a few merges,
> it insists on including 5 patches already merged upstream. If you
> don't mind ignoring the first 5 patches in the series, I'll send a
> pull request tonight.

Something like "git format-patch origin" doesn't avoid those upstream
commits?  Weird...

Another trick is to specify a commit range using triple-dot "..."
notation, such as "origin...master"

^ permalink raw reply

* Re: mmotm 2010-04-28 - RCU whinges
From: Patrick McHardy @ 2010-05-10 16:27 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, Valdis.Kletnieks, akpm, peterz, linux-kernel,
	netfilter-devel, netdev, paulmck
In-Reply-To: <20100510.091257.51273494.davem@davemloft.net>

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Mon, 10 May 2010 17:40:58 +0200
> 
>> I queued this patch in nf-next, the only reason why I haven't
>> submitted it yet is that I was unable to get git to cleanly export
>> only the proper set of patches meant for -next due to a few merges,
>> it insists on including 5 patches already merged upstream. If you
>> don't mind ignoring the first 5 patches in the series, I'll send a
>> pull request tonight.
> 
> Something like "git format-patch origin" doesn't avoid those upstream
> commits?  Weird...

Yeah, it seems to have something to do with me merging the nf-2.6.git
tree a few weeks ago since it had patches queued that were too late
for 2.6.34. Even the --ignore-if-in-upstream option doesn't help.

> Another trick is to specify a commit range using triple-dot "..."
> notation, such as "origin...master"

Thanks, I'll give it another try, the alternative is manually
renumbering the entire patchset.

^ permalink raw reply

* Re: [PATCHv7] add mergeable buffers support to vhost_net
From: Michael S. Tsirkin @ 2010-05-10 16:43 UTC (permalink / raw)
  To: David L Stevens; +Cc: netdev, kvm, virtualization
In-Reply-To: <1272488232.11307.4.camel@w-dls.beaverton.ibm.com>

On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> @@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *
>  	use_mm(net->dev.mm);
>  	mutex_lock(&vq->mutex);
>  	vhost_disable_notify(vq);
> -	hdr_size = vq->hdr_size;
> +	vhost_hlen = vq->vhost_hlen;
>  
>  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  		vq->log : NULL;
>  
> -	for (;;) {
> -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -					 ARRAY_SIZE(vq->iov),
> -					 &out, &in,
> -					 vq_log, &log);
> +	while ((datalen = vhost_head_len(vq, sock->sk))) {
> +		headcount = vhost_get_desc_n(vq, vq->heads,
> +					     datalen + vhost_hlen,
> +					     &in, vq_log, &log);
> +		if (headcount < 0)
> +			break;
>  		/* OK, now we need to know about added descriptors. */
> -		if (head == vq->num) {
> +		if (!headcount) {
>  			if (unlikely(vhost_enable_notify(vq))) {
>  				/* They have slipped one in as we were
>  				 * doing that: check again. */
> @@ -241,46 +272,53 @@ static void handle_rx(struct vhost_net *
>  			break;
>  		}
>  		/* We don't need to be notified again. */
> -		if (out) {
> -			vq_err(vq, "Unexpected descriptor format for RX: "
> -			       "out %d, int %d\n",
> -			       out, in);
> -			break;
> -		}
> -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
> -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> +		if (vhost_hlen)
> +			/* Skip header. TODO: support TSO. */
> +			s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
> +		else
> +			s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
>  		msg.msg_iovlen = in;
>  		len = iov_length(vq->iov, in);
>  		/* Sanity check */
>  		if (!len) {
>  			vq_err(vq, "Unexpected header len for RX: "
>  			       "%zd expected %zd\n",
> -			       iov_length(vq->hdr, s), hdr_size);
> +			       iov_length(vq->hdr, s), vhost_hlen);
>  			break;
>  		}
>  		err = sock->ops->recvmsg(NULL, sock, &msg,
>  					 len, MSG_DONTWAIT | MSG_TRUNC);
>  		/* TODO: Check specific error and bomb out unless EAGAIN? */
>  		if (err < 0) {
> -			vhost_discard_vq_desc(vq);
> +			vhost_discard_desc(vq, headcount);
>  			break;
>  		}
> -		/* TODO: Should check and handle checksum. */
> -		if (err > len) {
> -			pr_err("Discarded truncated rx packet: "
> -			       " len %d > %zd\n", err, len);
> -			vhost_discard_vq_desc(vq);
> +		if (err != datalen) {
> +			pr_err("Discarded rx packet: "
> +			       " len %d, expected %zd\n", err, datalen);
> +			vhost_discard_desc(vq, headcount);
>  			continue;
>  		}
>  		len = err;
> -		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
> -		if (err) {
> -			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
> -			       vq->iov->iov_base, err);
> +		if (vhost_hlen &&
> +		    memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
> +				      vhost_hlen)) {
> +			vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
> +			       vq->iov->iov_base);
>  			break;
>  		}
> -		len += hdr_size;
> -		vhost_add_used_and_signal(&net->dev, vq, head, len);
> +		/* TODO: Should check and handle checksum. */
> +		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
> +		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
> +				      offsetof(typeof(hdr), num_buffers),
> +				      sizeof(hdr.num_buffers))) {
> +			vq_err(vq, "Failed num_buffers write");
> +			vhost_discard_desc(vq, headcount);
> +			break;
> +		}
> +		len += vhost_hlen;
> +		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> +					    headcount);
>  		if (unlikely(vq_log))
>  			vhost_log_write(vq, vq_log, log, len);
>  		total_len += len;

OK I think I see the bug here: vhost_add_used_and_signal_n
does not get the actual length, it gets the iovec length from vhost.
Guest virtio uses this as packet length, with bad results.

So I have applied the follows and it seems to have fixed the problem:

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c16db02..9d7496d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -219,7 +219,7 @@ static int peek_head_len(struct vhost_virtqueue *vq, struct sock *sk)
 /* This is a multi-buffer version of vhost_get_desc, that works if
  *	vq has read descriptors only.
  * @vq		- the relevant virtqueue
- * @datalen	- data length we'll be reading
+ * @datalen	- data length we'll be reading. must be > 0
  * @iovcount	- returned count of io vectors we fill
  * @log		- vhost log
  * @log_num	- log offset
@@ -236,9 +236,10 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 	int seg = 0;
 	int headcount = 0;
 	unsigned d;
+	size_t len;
 	int r, nlogs = 0;
 
-	while (datalen > 0) {
+	for (;;) {
 		if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
 			r = -ENOBUFS;
 			goto err;
@@ -260,16 +261,20 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 			nlogs += *log_num;
 			log += *log_num;
 		}
+		len = iov_length(vq->iov + seg, in);
+		seg += in;
 		heads[headcount].id = d;
-		heads[headcount].len = iov_length(vq->iov + seg, in);
-		datalen -= heads[headcount].len;
+		if (datalen <= len)
+			break;
+		heads[headcount].len = len;
 		++headcount;
-		seg += in;
+		datalen -= len;
 	}
+	heads[headcount].len = datalen;
 	*iovcount = seg;
 	if (unlikely(log))
 		*log_num = nlogs;
-	return headcount;
+	return headcount + 1;
 err:
 	vhost_discard_desc(vq, headcount);
 	return r;

-- 
MST

^ permalink raw reply related

* Re: mmotm 2010-04-28 - RCU whinges
From: Patrick McHardy @ 2010-05-10 16:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Valdis.Kletnieks, Andrew Morton, Peter Zijlstra, David S. Miller,
	linux-kernel, netfilter-devel, netdev, Paul E. McKenney
In-Reply-To: <1272865137.2173.179.camel@edumazet-laptop>

[-- Attachment #1: Type: text/plain, Size: 2484 bytes --]

Eric Dumazet wrote:
> Le dimanche 02 mai 2010 à 13:46 -0400, Valdis.Kletnieks@vt.edu a écrit :
>> On Wed, 28 Apr 2010 16:53:32 PDT, akpm@linux-foundation.org said:
>>> The mm-of-the-moment snapshot 2010-04-28-16-53 has been uploaded to
>>>
>>>    http://userweb.kernel.org/~akpm/mmotm/
>> I thought we swatted all these, hit another one...
>>
>> [    9.131490] ctnetlink v0.93: registering with nfnetlink.
>> [    9.131535]
>> [    9.131535] ===================================================
>> [    9.131704] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [    9.131794] ---------------------------------------------------
>> [    9.131883] net/netfilter/nf_conntrack_ecache.c:88 invoked rcu_dereference_check() without protection!
>> [    9.131977]
>> [    9.131977] other info that might help us debug this:
>> [    9.131978]
>> [    9.132218]
>> [    9.132219] rcu_scheduler_active = 1, debug_locks = 0
>> [    9.132434] 1 lock held by swapper/1:
>> [    9.132519]  #0:  (nf_ct_ecache_mutex){+.+...}, at: [<ffffffff8148922d>] nf_conntrack_register_notifier+0x1a/0x75
>> [    9.132938]
>> [    9.132939] stack backtrace:
>> [    9.133129] Pid: 1, comm: swapper Tainted: G        W   2.6.34-rc5-mmotm0428 #1
>> [    9.133220] Call Trace:
>> [    9.133319]  [<ffffffff81064832>] lockdep_rcu_dereference+0xaa/0xb2
>> [    9.133410]  [<ffffffff81489250>] nf_conntrack_register_notifier+0x3d/0x75
>> [    9.133521]  [<ffffffff81b5a157>] ctnetlink_init+0x71/0xd5
>> [    9.133627]  [<ffffffff81b5a0e6>] ? ctnetlink_init+0x0/0xd5
>> [    9.133735]  [<ffffffff810001ef>] do_one_initcall+0x59/0x14e
>> [    9.133843]  [<ffffffff81b2e68a>] kernel_init+0x144/0x1ce
>> [    9.133949]  [<ffffffff81003414>] kernel_thread_helper+0x4/0x10
>> [    9.134060]  [<ffffffff81598a40>] ? restore_args+0x0/0x30
>> [    9.134196]  [<ffffffff81b2e546>] ? kernel_init+0x0/0x1ce
>> [    9.134328]  [<ffffffff81003410>] ? kernel_thread_helper+0x0/0x10
>> [    9.134530] ip_tables: (C) 2000-2006 Netfilter Core Team
>> [    9.134655] TCP bic registered
>>
> 
> Thanks for the report !
> 
> We can use rcu_dereference_protected() in those cases.
> 
> [PATCH] net: Use rcu_dereference_protected in nf_conntrack_ecache
> 
> Writers own nf_ct_ecache_mutex.

I've committed this patch to my tree, which also fixes up the nf_log
changes I already had queued.

I've also figured out how to prevent the false commits from showing
up using the '^' notation, I'll submit everything after some final
testing.



[-- Attachment #2: x --]
[-- Type: text/plain, Size: 3904 bytes --]

commit b56f2d55c6c22b0c5774b3b22e336fb6cc5f4094
Author: Patrick McHardy <kaber@trash.net>
Date:   Mon May 10 18:47:57 2010 +0200

    netfilter: use rcu_dereference_protected()
    
    Restore the rcu_dereference() calls in conntrack/expectation notifier
    and logger registration/unregistration, but use the _protected variant,
    which will be required by the upcoming __rcu annotations.
    
    Based on patch by Eric Dumazet <eric.dumazet@gmail.com>
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index a94ac3a..cdcc764 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -82,9 +82,12 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
 int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
 {
 	int ret = 0;
+	struct nf_ct_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	if (nf_conntrack_event_cb != NULL) {
+	notify = rcu_dereference_protected(nf_conntrack_event_cb,
+					   lockdep_is_held(&nf_ct_ecache_mutex));
+	if (notify != NULL) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -100,8 +103,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
 
 void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new)
 {
+	struct nf_ct_event_notifier *notify;
+
 	mutex_lock(&nf_ct_ecache_mutex);
-	BUG_ON(nf_conntrack_event_cb != new);
+	notify = rcu_dereference_protected(nf_conntrack_event_cb,
+					   lockdep_is_held(&nf_ct_ecache_mutex));
+	BUG_ON(notify != new);
 	rcu_assign_pointer(nf_conntrack_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
 }
@@ -110,9 +117,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
 int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new)
 {
 	int ret = 0;
+	struct nf_exp_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	if (nf_expect_event_cb != NULL) {
+	notify = rcu_dereference_protected(nf_expect_event_cb,
+					   lockdep_is_held(&nf_ct_ecache_mutex));
+	if (notify != NULL) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -128,8 +138,12 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
 
 void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
 {
+	struct nf_exp_event_notifier *notify;
+
 	mutex_lock(&nf_ct_ecache_mutex);
-	BUG_ON(nf_expect_event_cb != new);
+	notify = rcu_dereference_protected(nf_expect_event_cb,
+					   lockdep_is_held(&nf_ct_ecache_mutex));
+	BUG_ON(notify != new);
 	rcu_assign_pointer(nf_expect_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
 }
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 908f599..7df37fd 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -35,6 +35,7 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
 /* return EEXIST if the same logger is registred, 0 on success. */
 int nf_log_register(u_int8_t pf, struct nf_logger *logger)
 {
+	const struct nf_logger *llog;
 	int i;
 
 	if (pf >= ARRAY_SIZE(nf_loggers))
@@ -51,7 +52,9 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
 	} else {
 		/* register at end of list to honor first register win */
 		list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
-		if (nf_loggers[pf] == NULL)
+		llog = rcu_dereference_protected(nf_loggers[pf],
+						 lockdep_is_held(&nf_log_mutex));
+		if (llog == NULL)
 			rcu_assign_pointer(nf_loggers[pf], logger);
 	}
 
@@ -63,11 +66,14 @@ EXPORT_SYMBOL(nf_log_register);
 
 void nf_log_unregister(struct nf_logger *logger)
 {
+	const struct nf_logger *c_logger;
 	int i;
 
 	mutex_lock(&nf_log_mutex);
 	for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
-		if (nf_loggers[i] == logger)
+		c_logger = rcu_dereference_protected(nf_loggers[i],
+						     lockdep_is_held(&nf_log_mutex));
+		if (c_logger == logger)
 			rcu_assign_pointer(nf_loggers[i], NULL);
 		list_del(&logger->list[i]);
 	}

^ permalink raw reply related

* Re: [PATCHv7] add mergeable buffers support to vhost_net
From: David Stevens @ 2010-05-10 17:09 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, kvm-owner, netdev, virtualization
In-Reply-To: <20100510164303.GA28798@redhat.com>

Since "datalen" carries the difference and will be negative by that amount
from the original loop, what about just adding something like:

        }
        if (headcount)
                heads[headcount-1].len += datalen;
[and really, headcount >0 since datalen > 0, so just:

        heads[headcount-1].len += datalen;

                                                +-DLS


kvm-owner@vger.kernel.org wrote on 05/10/2010 09:43:03 AM:

> On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> > @@ -218,18 +248,19 @@ static void handle_rx(struct vhost_net *
> >     use_mm(net->dev.mm);
> >     mutex_lock(&vq->mutex);
> >     vhost_disable_notify(vq);
> > -   hdr_size = vq->hdr_size;
> > +   vhost_hlen = vq->vhost_hlen;
> > 
> >     vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> >        vq->log : NULL;
> > 
> > -   for (;;) {
> > -      head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > -                ARRAY_SIZE(vq->iov),
> > -                &out, &in,
> > -                vq_log, &log);
> > +   while ((datalen = vhost_head_len(vq, sock->sk))) {
> > +      headcount = vhost_get_desc_n(vq, vq->heads,
> > +                    datalen + vhost_hlen,
> > +                    &in, vq_log, &log);
> > +      if (headcount < 0)
> > +         break;
> >        /* OK, now we need to know about added descriptors. */
> > -      if (head == vq->num) {
> > +      if (!headcount) {
> >           if (unlikely(vhost_enable_notify(vq))) {
> >              /* They have slipped one in as we were
> >               * doing that: check again. */
> > @@ -241,46 +272,53 @@ static void handle_rx(struct vhost_net *
> >           break;
> >        }
> >        /* We don't need to be notified again. */
> > -      if (out) {
> > -         vq_err(vq, "Unexpected descriptor format for RX: "
> > -                "out %d, int %d\n",
> > -                out, in);
> > -         break;
> > -      }
> > -      /* Skip header. TODO: support TSO/mergeable rx buffers. */
> > -      s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> > +      if (vhost_hlen)
> > +         /* Skip header. TODO: support TSO. */
> > +         s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
> > +      else
> > +         s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
> >        msg.msg_iovlen = in;
> >        len = iov_length(vq->iov, in);
> >        /* Sanity check */
> >        if (!len) {
> >           vq_err(vq, "Unexpected header len for RX: "
> >                  "%zd expected %zd\n",
> > -                iov_length(vq->hdr, s), hdr_size);
> > +                iov_length(vq->hdr, s), vhost_hlen);
> >           break;
> >        }
> >        err = sock->ops->recvmsg(NULL, sock, &msg,
> >                  len, MSG_DONTWAIT | MSG_TRUNC);
> >        /* TODO: Check specific error and bomb out unless EAGAIN? */
> >        if (err < 0) {
> > -         vhost_discard_vq_desc(vq);
> > +         vhost_discard_desc(vq, headcount);
> >           break;
> >        }
> > -      /* TODO: Should check and handle checksum. */
> > -      if (err > len) {
> > -         pr_err("Discarded truncated rx packet: "
> > -                " len %d > %zd\n", err, len);
> > -         vhost_discard_vq_desc(vq);
> > +      if (err != datalen) {
> > +         pr_err("Discarded rx packet: "
> > +                " len %d, expected %zd\n", err, datalen);
> > +         vhost_discard_desc(vq, headcount);
> >           continue;
> >        }
> >        len = err;
> > -      err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
> > -      if (err) {
> > -         vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
> > -                vq->iov->iov_base, err);
> > +      if (vhost_hlen &&
> > +          memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
> > +                  vhost_hlen)) {
> > +         vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
> > +                vq->iov->iov_base);
> >           break;
> >        }
> > -      len += hdr_size;
> > -      vhost_add_used_and_signal(&net->dev, vq, head, len);
> > +      /* TODO: Should check and handle checksum. */
> > +      if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
> > +          memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
> > +                  offsetof(typeof(hdr), num_buffers),
> > +                  sizeof(hdr.num_buffers))) {
> > +         vq_err(vq, "Failed num_buffers write");
> > +         vhost_discard_desc(vq, headcount);
> > +         break;
> > +      }
> > +      len += vhost_hlen;
> > +      vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> > +                   headcount);
> >        if (unlikely(vq_log))
> >           vhost_log_write(vq, vq_log, log, len);
> >        total_len += len;
> 
> OK I think I see the bug here: vhost_add_used_and_signal_n
> does not get the actual length, it gets the iovec length from vhost.
> Guest virtio uses this as packet length, with bad results.
> 
> So I have applied the follows and it seems to have fixed the problem:
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c16db02..9d7496d 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -219,7 +219,7 @@ static int peek_head_len(struct vhost_virtqueue *vq, 

> struct sock *sk)
>  /* This is a multi-buffer version of vhost_get_desc, that works if
>   *   vq has read descriptors only.
>   * @vq      - the relevant virtqueue
> - * @datalen   - data length we'll be reading
> + * @datalen   - data length we'll be reading. must be > 0
>   * @iovcount   - returned count of io vectors we fill
>   * @log      - vhost log
>   * @log_num   - log offset
> @@ -236,9 +236,10 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>     int seg = 0;
>     int headcount = 0;
>     unsigned d;
> +   size_t len;
>     int r, nlogs = 0;
> 
> -   while (datalen > 0) {
> +   for (;;) {
>        if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
>           r = -ENOBUFS;
>           goto err;
> @@ -260,16 +261,20 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>           nlogs += *log_num;
>           log += *log_num;
>        }
> +      len = iov_length(vq->iov + seg, in);
> +      seg += in;
>        heads[headcount].id = d;
> -      heads[headcount].len = iov_length(vq->iov + seg, in);
> -      datalen -= heads[headcount].len;
> +      if (datalen <= len)
> +         break;
> +      heads[headcount].len = len;
>        ++headcount;
> -      seg += in;
> +      datalen -= len;
>     }
> +   heads[headcount].len = datalen;
>     *iovcount = seg;
>     if (unlikely(log))
>        *log_num = nlogs;
> -   return headcount;
> +   return headcount + 1;
>  err:
>     vhost_discard_desc(vq, headcount);
>     return r;
> 
> -- 
> MST
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: [RFC] net: change bridge/macvlan hook to be be generic
From: Ben Greear @ 2010-05-10 17:14 UTC (permalink / raw)
  To: Scott Feldman; +Cc: Stephen Hemminger, David Miller, Patrick McHardy, netdev
In-Reply-To: <C80610E0.2E2EC%scofeldm@cisco.com>

On 05/04/2010 05:58 PM, Scott Feldman wrote:
> On 5/4/10 3:37 PM, "Stephen Hemminger"<shemminger@vyatta.com>  wrote:
>
>> The existing macvlan and bridge have special hooks in the packet input
>> path. This patch changes it to a generic hook chain, like the packet type
>> processing. I have been wanting to look into flow based switching, etc...
>
> Can this be further simplified by saying that a netdev can only be hooked by
> one mux (macvlan, bridge, etc) at any given time, so there is never more
> than one element in the hook chain.  If so, then we just need a single hook,
> not a chain.
>
> It seems odd to me that a dev would have both macvlan_port != NULL and
> br_port != NULL.  Can dev be in a macvlan and a bridge at the same time?

If we did add the generic hook list, then we could support other hooks, like
a pktgen rx hook to gather latency, pkt-loss, and similar stats, for example.

Thanks,
Ben


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox