Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 7/8] tg3: Fix some checkpatch errors
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, andy, mcarlson

This patch fixes the following checkpatch errors:

ERROR: do not use assignment in if condition
+	if ((mss = skb_shinfo(skb)->gso_size) != 0) {

ERROR: do not use assignment in if condition
+	if ((mss = skb_shinfo(skb)->gso_size) != 0) {

ERROR: space prohibited after that '!' (ctx:BxW)
+			if (! netif_carrier_ok(tp->dev) &&
 			    ^

ERROR: space required after that ',' (ctx:VxV)
+#define GET_REG32_LOOP(base,len)		\
                            ^

ERROR: "(foo*)" should be "(foo *)"
+		memcpy(data, ((char*)&val) + b_offset, b_count);

ERROR: do not use assignment in if condition
+		if ((err = tg3_do_mem_test(tp, mem_tbl[i].offset,

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/tg3.c |   17 +++++++++--------
 1 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index f61a4d8..7116a47 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -5574,8 +5574,8 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb,
 
 	entry = tnapi->tx_prod;
 	base_flags = 0;
-	mss = 0;
-	if ((mss = skb_shinfo(skb)->gso_size) != 0) {
+	mss = skb_shinfo(skb)->gso_size;
+	if (mss) {
 		int tcp_opt_len, ip_tcp_len;
 		u32 hdrlen;
 
@@ -5781,7 +5781,8 @@ static netdev_tx_t tg3_start_xmit_dma_bug(struct sk_buff *skb,
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		base_flags |= TXD_FLAG_TCPUDP_CSUM;
 
-	if ((mss = skb_shinfo(skb)->gso_size) != 0) {
+	mss = skb_shinfo(skb)->gso_size;
+	if (mss) {
 		struct iphdr *iph;
 		u32 tcp_opt_len, hdr_len;
 
@@ -8514,7 +8515,7 @@ static void tg3_timer(unsigned long __opaque)
 			    (mac_stat & MAC_STATUS_LNKSTATE_CHANGED)) {
 				need_setup = 1;
 			}
-			if (! netif_carrier_ok(tp->dev) &&
+			if (!netif_carrier_ok(tp->dev) &&
 			    (mac_stat & (MAC_STATUS_PCS_SYNCED |
 					 MAC_STATUS_SIGNAL_DET))) {
 				need_setup = 1;
@@ -9372,7 +9373,7 @@ static void tg3_get_regs(struct net_device *dev,
 	tg3_full_lock(tp, 0);
 
 #define __GET_REG32(reg)	(*(p)++ = tr32(reg))
-#define GET_REG32_LOOP(base,len)		\
+#define GET_REG32_LOOP(base, len)		\
 do {	p = (u32 *)(orig_p + (base));		\
 	for (i = 0; i < len; i += 4)		\
 		__GET_REG32((base) + i);	\
@@ -9465,7 +9466,7 @@ static int tg3_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
 		ret = tg3_nvram_read_be32(tp, offset-b_offset, &val);
 		if (ret)
 			return ret;
-		memcpy(data, ((char*)&val) + b_offset, b_count);
+		memcpy(data, ((char *)&val) + b_offset, b_count);
 		len -= b_count;
 		offset += b_count;
 		eeprom->len += b_count;
@@ -10585,8 +10586,8 @@ static int tg3_test_memory(struct tg3 *tp)
 		mem_tbl = mem_tbl_570x;
 
 	for (i = 0; mem_tbl[i].offset != 0xffffffff; i++) {
-		if ((err = tg3_do_mem_test(tp, mem_tbl[i].offset,
-		    mem_tbl[i].len)) != 0)
+		err = tg3_do_mem_test(tp, mem_tbl[i].offset, mem_tbl[i].len);
+		if (err)
 			break;
 	}
 
-- 
1.6.4.4



^ permalink raw reply related

* [PATCH net-next 8/8] tg3: Update version to 3.112
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, andy, mcarlson

This patch updates the tg3 version to 3.112.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/tg3.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 7116a47..5769e15 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -69,10 +69,10 @@
 
 #define DRV_MODULE_NAME		"tg3"
 #define TG3_MAJ_NUM			3
-#define TG3_MIN_NUM			111
+#define TG3_MIN_NUM			112
 #define DRV_MODULE_VERSION	\
 	__stringify(TG3_MAJ_NUM) "." __stringify(TG3_MIN_NUM)
-#define DRV_MODULE_RELDATE	"June 5, 2010"
+#define DRV_MODULE_RELDATE	"July 11, 2010"
 
 #define TG3_DEF_MAC_MODE	0
 #define TG3_DEF_RX_MODE		0
-- 
1.6.4.4



^ permalink raw reply related

* [PATCH net-next 5/8] tg3: Report driver version to firmware
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, andy, mcarlson

This patch changes the code so that the driver version can be reported
to the firmware in addition to the current use.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/tg3.c |    8 ++++++--
 drivers/net/tg3.h |    4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index c91049b..e6da16a 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -18,6 +18,7 @@
 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/stringify.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
@@ -67,7 +68,10 @@
 #include "tg3.h"
 
 #define DRV_MODULE_NAME		"tg3"
-#define DRV_MODULE_VERSION	"3.111"
+#define TG3_MAJ_NUM			3
+#define TG3_MIN_NUM			111
+#define DRV_MODULE_VERSION	\
+	__stringify(TG3_MAJ_NUM) "." __stringify(TG3_MIN_NUM)
 #define DRV_MODULE_RELDATE	"June 5, 2010"
 
 #define TG3_DEF_MAC_MODE	0
@@ -6629,7 +6633,7 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, int kind)
 		apedata = tg3_ape_read32(tp, TG3_APE_HOST_INIT_COUNT);
 		tg3_ape_write32(tp, TG3_APE_HOST_INIT_COUNT, ++apedata);
 		tg3_ape_write32(tp, TG3_APE_HOST_DRIVER_ID,
-				APE_HOST_DRIVER_ID_MAGIC);
+			APE_HOST_DRIVER_ID_MAGIC(TG3_MAJ_NUM, TG3_MIN_NUM));
 		tg3_ape_write32(tp, TG3_APE_HOST_BEHAVIOR,
 				APE_HOST_BEHAV_NO_PHYLOCK);
 
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index b72ac52..be7ab91 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2193,7 +2193,9 @@
 #define  APE_HOST_SEG_LEN_MAGIC		 0x0000001c
 #define TG3_APE_HOST_INIT_COUNT		0x4208
 #define TG3_APE_HOST_DRIVER_ID		0x420c
-#define  APE_HOST_DRIVER_ID_MAGIC	 0xf0035100
+#define  APE_HOST_DRIVER_ID_LINUX	 0xf0000000
+#define  APE_HOST_DRIVER_ID_MAGIC(maj, min)	\
+	(APE_HOST_DRIVER_ID_LINUX | (maj & 0xff) << 16 | (min & 0xff) << 8)
 #define TG3_APE_HOST_BEHAVIOR		0x4210
 #define  APE_HOST_BEHAV_NO_PHYLOCK	 0x00000001
 #define TG3_APE_HOST_HEARTBEAT_INT_MS	0x4214
-- 
1.6.4.4



^ permalink raw reply related

* [PATCH net-next 6/8] tg3: Revert PCIe tx glitch fix
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, andy, mcarlson

This patch reverts commit 52cdf8526fe24f11d300b75458ddee017f3f4c88,
entitled "tg3: Prevent a PCIe tx glitch".  The problem does not have
any visible side-effects and happens too early for the driver to do
anything about it.  The proper place for this code is within the
device's bootcode.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/tg3.c |   24 ------------------------
 drivers/net/tg3.h |   22 ----------------------
 2 files changed, 0 insertions(+), 46 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index e6da16a..f61a4d8 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -7072,30 +7072,6 @@ static int tg3_chip_reset(struct tg3 *tp)
 
 	tg3_mdio_start(tp);
 
-	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_57780) {
-		u8 phy_addr;
-
-		phy_addr = tp->phy_addr;
-		tp->phy_addr = TG3_PHY_PCIE_ADDR;
-
-		tg3_writephy(tp, TG3_PCIEPHY_BLOCK_ADDR,
-			     TG3_PCIEPHY_TXB_BLK << TG3_PCIEPHY_BLOCK_SHIFT);
-		val = TG3_PCIEPHY_TX0CTRL1_TXOCM | TG3_PCIEPHY_TX0CTRL1_RDCTL |
-		      TG3_PCIEPHY_TX0CTRL1_TXCMV | TG3_PCIEPHY_TX0CTRL1_TKSEL |
-		      TG3_PCIEPHY_TX0CTRL1_NB_EN;
-		tg3_writephy(tp, TG3_PCIEPHY_TX0CTRL1, val);
-		udelay(10);
-
-		tg3_writephy(tp, TG3_PCIEPHY_BLOCK_ADDR,
-			     TG3_PCIEPHY_XGXS_BLK1 << TG3_PCIEPHY_BLOCK_SHIFT);
-		val = TG3_PCIEPHY_PWRMGMT4_LOWPWR_EN |
-		      TG3_PCIEPHY_PWRMGMT4_L1PLLPD_EN;
-		tg3_writephy(tp, TG3_PCIEPHY_PWRMGMT4, val);
-		udelay(10);
-
-		tp->phy_addr = phy_addr;
-	}
-
 	if ((tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS) &&
 	    tp->pci_chip_rev_id != CHIPREV_ID_5750_A0 &&
 	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5785 &&
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index be7ab91..0432399 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2032,31 +2032,9 @@
 
 
 /* Currently this is fixed. */
-#define TG3_PHY_PCIE_ADDR		0x00
 #define TG3_PHY_MII_ADDR		0x01
 
 
-/*** Tigon3 specific PHY PCIE registers. ***/
-
-#define TG3_PCIEPHY_BLOCK_ADDR		0x1f
-#define  TG3_PCIEPHY_XGXS_BLK1		0x0801
-#define  TG3_PCIEPHY_TXB_BLK		0x0861
-#define  TG3_PCIEPHY_BLOCK_SHIFT	4
-
-/* TG3_PCIEPHY_TXB_BLK */
-#define TG3_PCIEPHY_TX0CTRL1		0x15
-#define  TG3_PCIEPHY_TX0CTRL1_TXOCM	0x0003
-#define  TG3_PCIEPHY_TX0CTRL1_RDCTL	0x0008
-#define  TG3_PCIEPHY_TX0CTRL1_TXCMV	0x0030
-#define  TG3_PCIEPHY_TX0CTRL1_TKSEL	0x0040
-#define  TG3_PCIEPHY_TX0CTRL1_NB_EN	0x0400
-
-/* TG3_PCIEPHY_XGXS_BLK1 */
-#define TG3_PCIEPHY_PWRMGMT4		0x1a
-#define TG3_PCIEPHY_PWRMGMT4_L1PLLPD_EN	0x0038
-#define TG3_PCIEPHY_PWRMGMT4_LOWPWR_EN	0x4000
-
-
 /*** Tigon3 specific PHY MII registers. ***/
 #define  TG3_BMCR_SPEED1000		0x0040
 
-- 
1.6.4.4



^ permalink raw reply related

* [PATCH net-next 3/8] tg3: Fix IPv6 TSO code in tg3_start_xmit_dma_bug()
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, andy, mcarlson

The tg3_start_xmit_dma_bug() function was missing code to process IPv6
TSO packets.  This patch adds the missing support.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
---
 drivers/net/tg3.c |   20 ++++++++++++++------
 1 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 57dba79..efac448 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -5779,7 +5779,7 @@ static netdev_tx_t tg3_start_xmit_dma_bug(struct sk_buff *skb,
 
 	if ((mss = skb_shinfo(skb)->gso_size) != 0) {
 		struct iphdr *iph;
-		u32 tcp_opt_len, ip_tcp_len, hdr_len;
+		u32 tcp_opt_len, hdr_len;
 
 		if (skb_header_cloned(skb) &&
 		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
@@ -5787,10 +5787,21 @@ static netdev_tx_t tg3_start_xmit_dma_bug(struct sk_buff *skb,
 			goto out_unlock;
 		}
 
+		iph = ip_hdr(skb);
 		tcp_opt_len = tcp_optlen(skb);
-		ip_tcp_len = ip_hdrlen(skb) + sizeof(struct tcphdr);
 
-		hdr_len = ip_tcp_len + tcp_opt_len;
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
+			hdr_len = skb_headlen(skb) - ETH_HLEN;
+		} else {
+			u32 ip_tcp_len;
+
+			ip_tcp_len = ip_hdrlen(skb) + sizeof(struct tcphdr);
+			hdr_len = ip_tcp_len + tcp_opt_len;
+
+			iph->check = 0;
+			iph->tot_len = htons(mss + hdr_len);
+		}
+
 		if (unlikely((ETH_HLEN + hdr_len) > 80) &&
 			     (tp->tg3_flags2 & TG3_FLG2_TSO_BUG))
 			return tg3_tso_bug(tp, skb);
@@ -5798,9 +5809,6 @@ static netdev_tx_t tg3_start_xmit_dma_bug(struct sk_buff *skb,
 		base_flags |= (TXD_FLAG_CPU_PRE_DMA |
 			       TXD_FLAG_CPU_POST_DMA);
 
-		iph = ip_hdr(skb);
-		iph->check = 0;
-		iph->tot_len = htons(mss + hdr_len);
 		if (tp->tg3_flags2 & TG3_FLG2_HW_TSO) {
 			tcp_hdr(skb)->check = 0;
 			base_flags &= ~TXD_FLAG_TCPUDP_CSUM;
-- 
1.6.4.4



^ permalink raw reply related

* [PATCH net-next 2/8] tg3: Fix single MSI-X vector coalescing
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, andy, mcarlson

The interrupt coalescing setup code used the TG3_FLG2_USING_MSIX flag to
determine whether or not to configure the rx coalescing parameters.
This is incorrect for the single MSI-X vector case.  This patch changes
the code to look at the TG3_FLG3_ENABLE_RSS instead.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/tg3.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 89aa486..57dba79 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -7447,7 +7447,7 @@ static void __tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec)
 		tw32(HOSTCC_TXCOAL_MAXF_INT, 0);
 	}
 
-	if (!(tp->tg3_flags2 & TG3_FLG2_USING_MSIX)) {
+	if (!(tp->tg3_flags3 & TG3_FLG3_ENABLE_RSS)) {
 		tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs);
 		tw32(HOSTCC_RXMAX_FRAMES, ec->rx_max_coalesced_frames);
 		tw32(HOSTCC_RXCOAL_MAXF_INT, ec->rx_max_coalesced_frames_irq);
-- 
1.6.4.4



^ permalink raw reply related

* [PATCH net-next 0/8] tg3 bugfixes
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, andy, mcarlson

This patchset fixes a few MSI-X related problems and performs a
few driver enhancements.



^ permalink raw reply

* [PATCH net-next 1/8] tg3: Revert RSS indir tbl setup change
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, andy, mcarlson

This patch reverts commit 2601d8a0049c8b5d29cd5adb844a305a804e505f.  A
spectacular set of coincidences made it look as though the table was
setup incorrectly.  The original version was correct.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/tg3.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index aa2d64f..89aa486 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -8237,7 +8237,7 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy)
 		for (i = 0; i < TG3_RSS_INDIR_TBL_SIZE; i++) {
 			int idx = i % sizeof(val);
 
-			ent[idx] = (i % (tp->irq_cnt - 1)) + 1;
+			ent[idx] = i % (tp->irq_cnt - 1);
 			if (idx == sizeof(val) - 1) {
 				tw32(reg, val);
 				reg += 4;
-- 
1.6.4.4



^ permalink raw reply related

* [PATCH net-next 4/8] tg3: Relax 5717 serdes restriction
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
  To: davem; +Cc: netdev, andy, mcarlson

tg3 is coded to refuse to attach to 5717 serdes devices.  Now that the
hardware is better supported, we can relax this restriction.  This patch
also fixes a recently introduced bug which will cause serdes parallel
detection not to work with 5780 class devices.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/tg3.c |    5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index efac448..c91049b 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -8551,7 +8551,7 @@ static void tg3_timer(unsigned long __opaque)
 				tg3_setup_phy(tp, 0);
 			}
 		} else if ((tp->tg3_flags2 & TG3_FLG2_MII_SERDES) &&
-			   !(tp->tg3_flags2 & TG3_FLG2_5780_CLASS)) {
+			   (tp->tg3_flags2 & TG3_FLG2_5780_CLASS)) {
 			tg3_serdes_parallel_detect(tp);
 		}
 
@@ -13427,8 +13427,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 		return err;
 
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5717 &&
-	    (tp->pci_chip_rev_id != CHIPREV_ID_5717_A0 ||
-		 (tp->tg3_flags2 & TG3_FLG2_MII_SERDES)))
+	    tp->pci_chip_rev_id != CHIPREV_ID_5717_A0)
 		return -ENOTSUPP;
 
 	/* Initialize data/descriptor byte/word swapping. */
-- 
1.6.4.4



^ permalink raw reply related

* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Ilpo Järvinen @ 2010-07-11 19:25 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
	Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <alpine.DEB.2.00.1007112214250.15736@melkinpaasi.cs.helsinki.fi>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3511 bytes --]

On Sun, 11 Jul 2010, Ilpo Järvinen wrote:

> On Sun, 11 Jul 2010, Eric Dumazet wrote:
> 
> > Le dimanche 11 juillet 2010 à 19:46 +0200, Eric Dumazet a écrit :
> > > Le dimanche 11 juillet 2010 à 19:06 +0200, Eric Dumazet a écrit :
> > > > Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> > > > > On Thu, 8 Jul 2010, Tejun Heo wrote:
> > > > > 
> > > > > > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > > > > > Please see the attached photoshoot.  This is happening on a HPC
> > > > > > cluster and very interestingly caused by one particular job.  How long
> > > > > > it takes isn't clear yet (at least more than a day) but when it
> > > > > > happens it happens on a lot of machines in relatively short time.
> > > > > > 
> > > > > > With a bit of disassemblying, I've found that the oops is happening
> > > > > > during tcp_for_write_queue_from() because the skb->next points to
> > > > > > NULL.
> > > > > > 
> > > > > >  void tcp_xmit_retransmit_queue(struct sock *sk)
> > > > > >  {
> > > > > >  ...
> > > > > > 	if (tp->retransmit_skb_hint) {
> > > > > > 		skb = tp->retransmit_skb_hint;
> > > > > > 		last_lost = TCP_SKB_CB(skb)->end_seq;
> > > > > > 		if (after(last_lost, tp->retransmit_high))
> > > > > > 			last_lost = tp->retransmit_high;
> > > > > > 	} else {
> > > > > > 		skb = tcp_write_queue_head(sk);
> > > > > > 		last_lost = tp->snd_una;
> > > > > > 	}
> > > > > > 
> > > > > >  =>	tcp_for_write_queue_from(skb, sk) {
> > > > > > 		 __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > > > > > 
> > > > > > 		 if (skb == tcp_send_head(sk))
> > > > > > 			 break;
> > > > > > 		 /* we could do better than to assign each time */
> > > > > > 		 if (hole == NULL)
> > > > > > 
> > > > > > This can happen for one of the following reasons,
> > > > > > 
> > > > > > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > > > > >    too.  ie. tcp_xmit_retransmit_queue() is called on an empty write
> > > > > >    queue for some reason.
> > > > > > 
> > > > > > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > > > > >    write_queue.  ie. somebody forgot to update hint while removing the
> > > > > >    skb from the write queue.
> > > > > 
> > > > > Once again I've read the unlinkers through, and only thing that could 
> > > > > cause this is tcp_send_synack (others do deal with the hints) but I think 
> > > > > Eric already proposed a patch to that but we never got anywhere due to 
> > > > > some counterargument why it wouldn't take place (too far away for me to 
> > > > > remember, see archives about the discussions). ...But if you want be dead 
> > > > > sure some WARN_ON there might not hurt. Also the purging of the whole 
> > > > > queue was a similar suspect I then came across (but that would only 
> > > > > materialize with sk reuse happening e.g., with nfs which the other guys 
> > > > > weren't using).
> > > > > 
> > > > 
> > > > Hmm.
> > > > 
> > > > This sounds familiar to me, but I cannot remember the discussion you
> > > > mention or the patch.
> > > > 
> > > > Or maybe it was the TCP transaction thing ? (including data in SYN or
> > > > SYN-ACK packet)
> 
> No. That's another thing. ...I've already found it with google today but 
> cannot seem to find it again. I thought I used tcp_make_synack eric but 
> for some reason I only get these transaction fix hits. I'll keep looking.

Right, this one:

http://kerneltrap.org/mailarchive/linux-netdev/2009/10/29/6259073

-- 
 i.

^ permalink raw reply

* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Ilpo Järvinen @ 2010-07-11 19:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
	Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <1278872990.2538.189.camel@edumazet-laptop>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 5020 bytes --]

On Sun, 11 Jul 2010, Eric Dumazet wrote:

> Le dimanche 11 juillet 2010 à 19:46 +0200, Eric Dumazet a écrit :
> > Le dimanche 11 juillet 2010 à 19:06 +0200, Eric Dumazet a écrit :
> > > Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> > > > On Thu, 8 Jul 2010, Tejun Heo wrote:
> > > > 
> > > > > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > > > > Please see the attached photoshoot.  This is happening on a HPC
> > > > > cluster and very interestingly caused by one particular job.  How long
> > > > > it takes isn't clear yet (at least more than a day) but when it
> > > > > happens it happens on a lot of machines in relatively short time.
> > > > > 
> > > > > With a bit of disassemblying, I've found that the oops is happening
> > > > > during tcp_for_write_queue_from() because the skb->next points to
> > > > > NULL.
> > > > > 
> > > > >  void tcp_xmit_retransmit_queue(struct sock *sk)
> > > > >  {
> > > > >  ...
> > > > > 	if (tp->retransmit_skb_hint) {
> > > > > 		skb = tp->retransmit_skb_hint;
> > > > > 		last_lost = TCP_SKB_CB(skb)->end_seq;
> > > > > 		if (after(last_lost, tp->retransmit_high))
> > > > > 			last_lost = tp->retransmit_high;
> > > > > 	} else {
> > > > > 		skb = tcp_write_queue_head(sk);
> > > > > 		last_lost = tp->snd_una;
> > > > > 	}
> > > > > 
> > > > >  =>	tcp_for_write_queue_from(skb, sk) {
> > > > > 		 __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > > > > 
> > > > > 		 if (skb == tcp_send_head(sk))
> > > > > 			 break;
> > > > > 		 /* we could do better than to assign each time */
> > > > > 		 if (hole == NULL)
> > > > > 
> > > > > This can happen for one of the following reasons,
> > > > > 
> > > > > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > > > >    too.  ie. tcp_xmit_retransmit_queue() is called on an empty write
> > > > >    queue for some reason.
> > > > > 
> > > > > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > > > >    write_queue.  ie. somebody forgot to update hint while removing the
> > > > >    skb from the write queue.
> > > > 
> > > > Once again I've read the unlinkers through, and only thing that could 
> > > > cause this is tcp_send_synack (others do deal with the hints) but I think 
> > > > Eric already proposed a patch to that but we never got anywhere due to 
> > > > some counterargument why it wouldn't take place (too far away for me to 
> > > > remember, see archives about the discussions). ...But if you want be dead 
> > > > sure some WARN_ON there might not hurt. Also the purging of the whole 
> > > > queue was a similar suspect I then came across (but that would only 
> > > > materialize with sk reuse happening e.g., with nfs which the other guys 
> > > > weren't using).
> > > > 
> > > 
> > > Hmm.
> > > 
> > > This sounds familiar to me, but I cannot remember the discussion you
> > > mention or the patch.
> > > 
> > > Or maybe it was the TCP transaction thing ? (including data in SYN or
> > > SYN-ACK packet)

No. That's another thing. ...I've already found it with google today but 
cannot seem to find it again. I thought I used tcp_make_synack eric but 
for some reason I only get these transaction fix hits. I'll keep looking.

> > Hmm, I cannot find where we reset restransmit_skb_hint in
> > tcp_mtu_probe(), if we call tcp_unlink_write_queue().
> > 
> > if (skb->len <= copy) {
> > 	/* We've eaten all the data from this skb.
> > 	 * Throw it away. */
> > 	TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
> > <<>>	tcp_unlink_write_queue(skb, sk);
> > 	sk_wmem_free_skb(sk, skb);
> > } else {
> > 
> > 
> > Sorry if this was already discussed. We might add a comment here in anycase ;)
> > 
> 
> Just in case, here is a patch for this issue, if Tejun wants to try it.
> 
> Thanks
> 
> [PATCH] tcp: tcp_mtu_probe() and retransmit hints
> 
> When removing an skb from tcp write queue, we must take care of various
> hints that could be kept on this skb. tcp_mtu_probe() misses this
> cleanup.
>
> lkml reference : http://lkml.org/lkml/2010/7/8/63
> 
> Reported-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index b4ed957..187453f 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -1666,6 +1666,9 @@ static int tcp_mtu_probe(struct sock *sk)
>  			 * Throw it away. */
>  			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
>  			tcp_unlink_write_queue(skb, sk);
> +			tcp_clear_retrans_hints_partial(tp);
> +			if (skb == tp->retransmit_skb_hint)
> +				tp->retransmit_skb_hint = nskb;

...I think we've not sent that skb just yet, so this'll never be true (and 
the rexmit loop terminates at send_head without setting it so we should 
be safe, I'll still need to check that no other code can accidently move 
it to the send_head but I doubt it happens).

>  			sk_wmem_free_skb(sk, skb);
>  		} else {
>  			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
> 
> 
> 

-- 
 i.

^ permalink raw reply

* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Eric Dumazet @ 2010-07-11 18:29 UTC (permalink / raw)
  To: Ilpo Järvinen
  Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
	Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <1278870382.2538.180.camel@edumazet-laptop>

Le dimanche 11 juillet 2010 à 19:46 +0200, Eric Dumazet a écrit :
> Le dimanche 11 juillet 2010 à 19:06 +0200, Eric Dumazet a écrit :
> > Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> > > On Thu, 8 Jul 2010, Tejun Heo wrote:
> > > 
> > > > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > > > Please see the attached photoshoot.  This is happening on a HPC
> > > > cluster and very interestingly caused by one particular job.  How long
> > > > it takes isn't clear yet (at least more than a day) but when it
> > > > happens it happens on a lot of machines in relatively short time.
> > > > 
> > > > With a bit of disassemblying, I've found that the oops is happening
> > > > during tcp_for_write_queue_from() because the skb->next points to
> > > > NULL.
> > > > 
> > > >  void tcp_xmit_retransmit_queue(struct sock *sk)
> > > >  {
> > > >  ...
> > > > 	if (tp->retransmit_skb_hint) {
> > > > 		skb = tp->retransmit_skb_hint;
> > > > 		last_lost = TCP_SKB_CB(skb)->end_seq;
> > > > 		if (after(last_lost, tp->retransmit_high))
> > > > 			last_lost = tp->retransmit_high;
> > > > 	} else {
> > > > 		skb = tcp_write_queue_head(sk);
> > > > 		last_lost = tp->snd_una;
> > > > 	}
> > > > 
> > > >  =>	tcp_for_write_queue_from(skb, sk) {
> > > > 		 __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > > > 
> > > > 		 if (skb == tcp_send_head(sk))
> > > > 			 break;
> > > > 		 /* we could do better than to assign each time */
> > > > 		 if (hole == NULL)
> > > > 
> > > > This can happen for one of the following reasons,
> > > > 
> > > > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > > >    too.  ie. tcp_xmit_retransmit_queue() is called on an empty write
> > > >    queue for some reason.
> > > > 
> > > > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > > >    write_queue.  ie. somebody forgot to update hint while removing the
> > > >    skb from the write queue.
> > > 
> > > Once again I've read the unlinkers through, and only thing that could 
> > > cause this is tcp_send_synack (others do deal with the hints) but I think 
> > > Eric already proposed a patch to that but we never got anywhere due to 
> > > some counterargument why it wouldn't take place (too far away for me to 
> > > remember, see archives about the discussions). ...But if you want be dead 
> > > sure some WARN_ON there might not hurt. Also the purging of the whole 
> > > queue was a similar suspect I then came across (but that would only 
> > > materialize with sk reuse happening e.g., with nfs which the other guys 
> > > weren't using).
> > > 
> > 
> > Hmm.
> > 
> > This sounds familiar to me, but I cannot remember the discussion you
> > mention or the patch.
> > 
> > Or maybe it was the TCP transaction thing ? (including data in SYN or
> > SYN-ACK packet)
> 
> Hmm, I cannot find where we reset restransmit_skb_hint in
> tcp_mtu_probe(), if we call tcp_unlink_write_queue().
> 
> if (skb->len <= copy) {
> 	/* We've eaten all the data from this skb.
> 	 * Throw it away. */
> 	TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
> <<>>	tcp_unlink_write_queue(skb, sk);
> 	sk_wmem_free_skb(sk, skb);
> } else {
> 
> 
> Sorry if this was already discussed. We might add a comment here in anycase ;)
> 

Just in case, here is a patch for this issue, if Tejun wants to try it.

Thanks

[PATCH] tcp: tcp_mtu_probe() and retransmit hints

When removing an skb from tcp write queue, we must take care of various
hints that could be kept on this skb. tcp_mtu_probe() misses this
cleanup.

lkml reference : http://lkml.org/lkml/2010/7/8/63

Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b4ed957..187453f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1666,6 +1666,9 @@ static int tcp_mtu_probe(struct sock *sk)
 			 * Throw it away. */
 			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
 			tcp_unlink_write_queue(skb, sk);
+			tcp_clear_retrans_hints_partial(tp);
+			if (skb == tp->retransmit_skb_hint)
+				tp->retransmit_skb_hint = nskb;
 			sk_wmem_free_skb(sk, skb);
 		} else {
 			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &

^ permalink raw reply related

* RE: [PATCH] bnx2x: add support for receive hashing
From: Vladislav Zolotarov @ 2010-07-11 18:18 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, therbert@google.com, netdev@vger.kernel.org
In-Reply-To: <1278870093.2538.175.camel@edumazet-laptop>



> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Sunday, July 11, 2010 8:42 PM
> To: Vladislav Zolotarov
> Cc: David Miller; therbert@google.com; netdev@vger.kernel.org
> Subject: RE: [PATCH] bnx2x: add support for receive hashing
> 
> Le dimanche 11 juillet 2010 à 10:22 -0700, Vladislav Zolotarov a écrit :
> >
> > > -----Original Message-----
> > > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On
> > > Behalf Of Eric Dumazet
> > > Sent: Sunday, July 11, 2010 7:45 PM
> > > To: Vladislav Zolotarov
> > > Cc: David Miller; therbert@google.com; netdev@vger.kernel.org
> > > Subject: RE: [PATCH] bnx2x: add support for receive hashing
> > >
> > > Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> > > > Dave, it's obvious that there a demand for a new HW/FW configuration
> > > > from our side - "rx hash enable" which is currently tightly coupled
> > > > with the RSS capability. As long as RSS flow in our FW includes a few
> > > > more things apart from just creating an RSS hash and as long as there
> > > > are flows (even hypothetical) that demand the RSS hash regardless the
> > > > RSS itself we started to work on separation of these two features from
> > > > FW perspective. This of course will demand a new FW version but once
> > > > we have it we'll be able to be more specific in HW configuration and
> > > > have a cleaner code.
> > > >
> > > > Technically, our FW may provide the Rx HASH always and in a current
> > > > driver configuration this is what it does.
> > > > I wonder if the driver always provides the HW RX HASH in the
> > > > skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> > > > netdev->features will it cause any harm? If not we can get rid of two
> > > > extra conditionals in the bnx2x_rx_int().
> > >
> > > Hi
> > >
> > > Please dont top-post on netdev, thanks.
> >
> > This discussion is directly related to Tom's patch that's why I'm posting
> on this thread.
> >
> 
> Thats not the question. I am saying "dont top-post", not "dont change
> the subject"
> 
> > >
> > > NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
> > > particular NIC if we know the hardware provided rxhash is not fulfilling
> > > our needs (We prefer spend some cpu cycles to recompute a software
> > > rxhash).
> > >
> > > Software one for example deliver same rxhash values for both ways of a
> > > TCP flow, it can help conntracking for example.
> >
> > I understand that, in that case the proper implementation would be to check
> the netdev->features when u decide to calculate the SW rxhash, isn't it?
> >
> 
> Nope, please check get_rps_cpus()
> 
> We only do :
> 
> if (skb->rxhash)
> 	goto got_hash; /* Skip hash computation on packet header */
> 
> That means if skb->rxhash is set, we dont force a recompute.

Ok. No, prob. In that case we just need to check the netdev->feature in
the bnx2x_rx_int(). Checking on CQE flags is a not needed.
I'll post a patch shortly. 

Thanks,
vlad

> 
> Your suggestion would move a test from device driver into
> get_rps_cpus() ?
> 
> Given RPS is more targeted to old devices (not able to provide rxhash),
> I am not sure it brings anything.
> 
> > >
> > > The conditional in driver rx is cheap, since the condition is the same
> > > for all packets and CPU can predicts the branch.
> >
> > Not exactly. Our FW/HW won't provide the rxhash for none-IP packets
> > setting the hash CQE field to zero and clearing the
> > ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG in the CQE statsu_flags (for ARPs
> > for instance). Branch prediction is nice but considering my previous
> > remark why do we need this branch at all?
> 
> Please limit your lines to 70 chars
> 
> We want drivers to set skb->rxhash only if allowed to.
> 
> If you feel this is bad for your firmware/hardware/driver, dont set
 skb->rxhash.
> 
> 
> 


^ permalink raw reply

* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Eric Dumazet @ 2010-07-11 17:46 UTC (permalink / raw)
  To: Ilpo Järvinen
  Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
	Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <1278867977.2538.167.camel@edumazet-laptop>

Le dimanche 11 juillet 2010 à 19:06 +0200, Eric Dumazet a écrit :
> Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> > On Thu, 8 Jul 2010, Tejun Heo wrote:
> > 
> > > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > > Please see the attached photoshoot.  This is happening on a HPC
> > > cluster and very interestingly caused by one particular job.  How long
> > > it takes isn't clear yet (at least more than a day) but when it
> > > happens it happens on a lot of machines in relatively short time.
> > > 
> > > With a bit of disassemblying, I've found that the oops is happening
> > > during tcp_for_write_queue_from() because the skb->next points to
> > > NULL.
> > > 
> > >  void tcp_xmit_retransmit_queue(struct sock *sk)
> > >  {
> > >  ...
> > > 	if (tp->retransmit_skb_hint) {
> > > 		skb = tp->retransmit_skb_hint;
> > > 		last_lost = TCP_SKB_CB(skb)->end_seq;
> > > 		if (after(last_lost, tp->retransmit_high))
> > > 			last_lost = tp->retransmit_high;
> > > 	} else {
> > > 		skb = tcp_write_queue_head(sk);
> > > 		last_lost = tp->snd_una;
> > > 	}
> > > 
> > >  =>	tcp_for_write_queue_from(skb, sk) {
> > > 		 __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > > 
> > > 		 if (skb == tcp_send_head(sk))
> > > 			 break;
> > > 		 /* we could do better than to assign each time */
> > > 		 if (hole == NULL)
> > > 
> > > This can happen for one of the following reasons,
> > > 
> > > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > >    too.  ie. tcp_xmit_retransmit_queue() is called on an empty write
> > >    queue for some reason.
> > > 
> > > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > >    write_queue.  ie. somebody forgot to update hint while removing the
> > >    skb from the write queue.
> > 
> > Once again I've read the unlinkers through, and only thing that could 
> > cause this is tcp_send_synack (others do deal with the hints) but I think 
> > Eric already proposed a patch to that but we never got anywhere due to 
> > some counterargument why it wouldn't take place (too far away for me to 
> > remember, see archives about the discussions). ...But if you want be dead 
> > sure some WARN_ON there might not hurt. Also the purging of the whole 
> > queue was a similar suspect I then came across (but that would only 
> > materialize with sk reuse happening e.g., with nfs which the other guys 
> > weren't using).
> > 
> 
> Hmm.
> 
> This sounds familiar to me, but I cannot remember the discussion you
> mention or the patch.
> 
> Or maybe it was the TCP transaction thing ? (including data in SYN or
> SYN-ACK packet)

Hmm, I cannot find where we reset restransmit_skb_hint in
tcp_mtu_probe(), if we call tcp_unlink_write_queue().

if (skb->len <= copy) {
	/* We've eaten all the data from this skb.
	 * Throw it away. */
	TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
<<>>	tcp_unlink_write_queue(skb, sk);
	sk_wmem_free_skb(sk, skb);
} else {


Sorry if this was already discussed. We might add a comment here in anycase ;)

^ permalink raw reply

* RE: [PATCH] bnx2x: add support for receive hashing
From: Eric Dumazet @ 2010-07-11 17:41 UTC (permalink / raw)
  To: Vladislav Zolotarov
  Cc: David Miller, therbert@google.com, netdev@vger.kernel.org
In-Reply-To: <8628FE4E7912BF47A96AE7DD7BAC0AADDDE6470521@SJEXCHCCR02.corp.ad.broadcom.com>

Le dimanche 11 juillet 2010 à 10:22 -0700, Vladislav Zolotarov a écrit :
> 
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> > Behalf Of Eric Dumazet
> > Sent: Sunday, July 11, 2010 7:45 PM
> > To: Vladislav Zolotarov
> > Cc: David Miller; therbert@google.com; netdev@vger.kernel.org
> > Subject: RE: [PATCH] bnx2x: add support for receive hashing
> > 
> > Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> > > Dave, it's obvious that there a demand for a new HW/FW configuration
> > > from our side - "rx hash enable" which is currently tightly coupled
> > > with the RSS capability. As long as RSS flow in our FW includes a few
> > > more things apart from just creating an RSS hash and as long as there
> > > are flows (even hypothetical) that demand the RSS hash regardless the
> > > RSS itself we started to work on separation of these two features from
> > > FW perspective. This of course will demand a new FW version but once
> > > we have it we'll be able to be more specific in HW configuration and
> > > have a cleaner code.
> > >
> > > Technically, our FW may provide the Rx HASH always and in a current
> > > driver configuration this is what it does.
> > > I wonder if the driver always provides the HW RX HASH in the
> > > skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> > > netdev->features will it cause any harm? If not we can get rid of two
> > > extra conditionals in the bnx2x_rx_int().
> > 
> > Hi
> > 
> > Please dont top-post on netdev, thanks.
> 
> This discussion is directly related to Tom's patch that's why I'm posting on this thread. 
> 

Thats not the question. I am saying "dont top-post", not "dont change
the subject"

> > 
> > NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
> > particular NIC if we know the hardware provided rxhash is not fulfilling
> > our needs (We prefer spend some cpu cycles to recompute a software
> > rxhash).
> > 
> > Software one for example deliver same rxhash values for both ways of a
> > TCP flow, it can help conntracking for example.
> 
> I understand that, in that case the proper implementation would be to check the netdev->features when u decide to calculate the SW rxhash, isn't it?
> 

Nope, please check get_rps_cpus()

We only do :

if (skb->rxhash)
	goto got_hash; /* Skip hash computation on packet header */

That means if skb->rxhash is set, we dont force a recompute.

Your suggestion would move a test from device driver into
get_rps_cpus() ?

Given RPS is more targeted to old devices (not able to provide rxhash),
I am not sure it brings anything.

> > 
> > The conditional in driver rx is cheap, since the condition is the same
> > for all packets and CPU can predicts the branch.
> 
> Not exactly. Our FW/HW won't provide the rxhash for none-IP packets
> setting the hash CQE field to zero and clearing the
> ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG in the CQE statsu_flags (for ARPs
> for instance). Branch prediction is nice but considering my previous
> remark why do we need this branch at all?

Please limit your lines to 70 chars

We want drivers to set skb->rxhash only if allowed to.

If you feel this is bad for your firmware/hardware/driver, dont set
skb->rxhash.




^ permalink raw reply

* FW: NET_NS: unregister_netdevice: waiting for lo to become free (after using openvpn) (was Re: sysfs bug when using tun with network namespaces)
From: Michael Leun @ 2010-07-11 17:29 UTC (permalink / raw)
  To: netdev; +Cc: davem, linux-kernel
In-Reply-To: <20100710235323.5336f627@xenia.leun.net>

Hi,

On Sat, 10 Jul 2010 16:52:08 +0200
Michael Leun <lkml20100708@newton.leun.net> wrote:

> # tunctl -u ml -t tap1  

> works as expected, but  

> # unshare -n /bin/bash
> # tunctl -u ml -t tap1  
[bug  (no sysfs support for net namespaces at all) solved in 2.6.35-rcX
- I used 2.6.34.1]

Now that we have solved that last one I've another glitch (this time using 2.6.35-rc4):

In an network namespace I can use an tun/tap tunnel through ssh and
when closing that namespace then eveything is fine.

But when using openvpn (also tunnel trough tun/tap) in an network
namespace and then closing that namespace I get:

unregister_netdevice: waiting for lo to become free
[repeated]

Please see the following two examples showing that difference:

# > unshare -n /bin/bash
# > # how to setup veth device pair to get connectivity into namespace not shown here
# > tunctl -u ml -t tap1
# > ssh -o Tunnel=Ethernet -w 1:1 somewhere
[ running some traffic over tap1 not shown here ]
^d # logging out from somewhere
# > tunctl -d tap1
# > exit # logging out from shell in network namespace

Now the veth device pair used automagically vanishes and nothing
from that different network namespace remains - very well.

but

# > unshare -n /bin/bash
# > # how to setup veth device pair to get connectivity into namespace not shown here
# > openvpn --config some.config
[ running some traffic over vpn device not shown here ]
^c # stopping openvpn
# > lsof -i
# > netstat -an
Active Internet connections (servers and established)
Proto Recv-Q Send-Q Local Address           Foreign Address         State
Active UNIX domain sockets (servers and established)
Proto RefCnt Flags       Type       State         I-Node Path
# > ps ax|grep openvpn|grep -v grep
# > # cannot find anything that suggests there is anything left from that openvpn session
# > exit # logging out from shell in network namespace

Now I get

Jul 10 20:02:36 doris kernel: unregister_netdevice: waiting for lo to become free. Usage count = 3
[repeated]

Now one might say it is fault of openvpn (used OpenVPN 2.1_rc20
i586-suse-linux - the one in openSuSE 11.2 package), openvpn didn't
close some ressource and ssh does fine.

But: should'nt kernel clean up after process when it exits?
And/or: Should'nt kernel clean up if last process in network namespace
exits - there is nothing left which might use that interface?!

Greg KH <greg@kroah.com> wrote:

> Yes, you are correct.  Care to resend all of this to the
> network-namespace developer(s) and the netdev mailing list so that the
> correct people are notified so they can fix it all?

[X] done - hopefully, cannot find a particular network namespace
developer in MAINTAINERS or source files. If such a one exists, please
forward.

Thanks.

-- 
MfG,

Michael Leun

^ permalink raw reply

* RE: [PATCH] bnx2x: add support for receive hashing
From: Vladislav Zolotarov @ 2010-07-11 17:22 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, therbert@google.com, netdev@vger.kernel.org
In-Reply-To: <1278866713.2538.148.camel@edumazet-laptop>



> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> Behalf Of Eric Dumazet
> Sent: Sunday, July 11, 2010 7:45 PM
> To: Vladislav Zolotarov
> Cc: David Miller; therbert@google.com; netdev@vger.kernel.org
> Subject: RE: [PATCH] bnx2x: add support for receive hashing
> 
> Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> > Dave, it's obvious that there a demand for a new HW/FW configuration
> > from our side - "rx hash enable" which is currently tightly coupled
> > with the RSS capability. As long as RSS flow in our FW includes a few
> > more things apart from just creating an RSS hash and as long as there
> > are flows (even hypothetical) that demand the RSS hash regardless the
> > RSS itself we started to work on separation of these two features from
> > FW perspective. This of course will demand a new FW version but once
> > we have it we'll be able to be more specific in HW configuration and
> > have a cleaner code.
> >
> > Technically, our FW may provide the Rx HASH always and in a current
> > driver configuration this is what it does.
> > I wonder if the driver always provides the HW RX HASH in the
> > skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> > netdev->features will it cause any harm? If not we can get rid of two
> > extra conditionals in the bnx2x_rx_int().
> 
> Hi
> 
> Please dont top-post on netdev, thanks.

This discussion is directly related to Tom's patch that's why I'm posting on this thread. 

> 
> NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
> particular NIC if we know the hardware provided rxhash is not fulfilling
> our needs (We prefer spend some cpu cycles to recompute a software
> rxhash).
> 
> Software one for example deliver same rxhash values for both ways of a
> TCP flow, it can help conntracking for example.

I understand that, in that case the proper implementation would be to check the netdev->features when u decide to calculate the SW rxhash, isn't it?

> 
> The conditional in driver rx is cheap, since the condition is the same
> for all packets and CPU can predicts the branch.

Not exactly. Our FW/HW won't provide the rxhash for none-IP packets setting the hash CQE field to zero and clearing the ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG in the CQE statsu_flags (for ARPs for instance). Branch prediction is nice but considering my previous remark why do we need this branch at all?

> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: [REGRESSION,BISECTED] Panic on ifup
From: George Spelvin @ 2010-07-11 17:09 UTC (permalink / raw)
  To: linux, timo.teras; +Cc: davem, netdev
In-Reply-To: <4C39D834.8080206@iki.fi>

> On 07/11/2010 03:38 PM, George Spelvin wrote:
>> No, although /etc/ipec-tools.conf runs setkey.  As I said,
>> I was mostly running in single-user mode; a ps axf
>> output is appended.
>> 
>> Ah!  A discovery!  There are rules for the gateway!
>> 
>> add <my_ip> <gw_ip> esp 0x200 -E aes-cbc
>> 	<key>;
>> add <gw_ip> <my_ip> esp 0x300 -E aes-cbc
>> 	<key>;
>> spdadd <gw_ip> <my_ip> any -P in ipsec
>> 	esp/transport//use;
>> spdadd <my_ip> <gw_ip> any -P out ipsec
>> 	esp/transport//use;

> This means optional encryption. Probably something goes wrong
> constructing the bundle which results in encryption not being applied.
> And it would look like xfrm_resolve_and_create_bundle() does not take
> this into account properly. I need to fix it with optional policies.
> 
> In the mean while, could confirm if everything works if you change the
> last line to:
> 	esp/transport//require;

Will do.

This might lead to no traffic if there's something else broken, however
it should not crash. This change is needed only for the 'out' policy, as
the bundles are used only on xmit code paths.

> yup, xfrm_resolve_and_create_bundle looks to be the culprit. I'll try to
> figure out a patch for testing.

> Ok, this is basically what setkey did for you. Looks like it was ran
> twice and you are missing flush and spdflush from setkey, so you get
> duplicates here. Otherwise it's ok.

Um, I am *not* missing flush and spdflush.  The entire file, with comments
and blank lines stripped, and some details censored, is:

#!/usr/sbin/setkey -f
flush;
spdflush;
add $LOCALNET.1 $LOCALNET.62 esp 0x200 -E aes-cbc <key redacted>;
add $LOCALNET.62 $LOCALNET.1 esp 0x300 -E aes-cbc <key redacted>;
add $LOCALNET.3 $LOCALNET.62 esp 0x400 -E aes-cbc <key redacted>;
add $LOCALNET.62 $LOCALNET.3 esp 0x500 -E aes-cbc <key redacted>;
spdadd $LOCALNET.1 $LOCALNET.62 any -P in ipsec esp/transport//use;
spdadd $LOCALNET.62 $LOCALNET.1 any -P out ipsec esp/transport//use;
spdadd $LOCALNET.3 $LOCALNET.62 any -P in ipsec esp/transport//use;
spdadd $LOCALNET.62 $LOCALNET.3 any -P out ipsec esp/transport//use;

Anyway, thank you very much!

^ permalink raw reply

* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Eric Dumazet @ 2010-07-11 17:06 UTC (permalink / raw)
  To: Ilpo Järvinen
  Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
	Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <alpine.DEB.2.00.1007111825510.15736@melkinpaasi.cs.helsinki.fi>

Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> On Thu, 8 Jul 2010, Tejun Heo wrote:
> 
> > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > Please see the attached photoshoot.  This is happening on a HPC
> > cluster and very interestingly caused by one particular job.  How long
> > it takes isn't clear yet (at least more than a day) but when it
> > happens it happens on a lot of machines in relatively short time.
> > 
> > With a bit of disassemblying, I've found that the oops is happening
> > during tcp_for_write_queue_from() because the skb->next points to
> > NULL.
> > 
> >  void tcp_xmit_retransmit_queue(struct sock *sk)
> >  {
> >  ...
> > 	if (tp->retransmit_skb_hint) {
> > 		skb = tp->retransmit_skb_hint;
> > 		last_lost = TCP_SKB_CB(skb)->end_seq;
> > 		if (after(last_lost, tp->retransmit_high))
> > 			last_lost = tp->retransmit_high;
> > 	} else {
> > 		skb = tcp_write_queue_head(sk);
> > 		last_lost = tp->snd_una;
> > 	}
> > 
> >  =>	tcp_for_write_queue_from(skb, sk) {
> > 		 __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > 
> > 		 if (skb == tcp_send_head(sk))
> > 			 break;
> > 		 /* we could do better than to assign each time */
> > 		 if (hole == NULL)
> > 
> > This can happen for one of the following reasons,
> > 
> > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> >    too.  ie. tcp_xmit_retransmit_queue() is called on an empty write
> >    queue for some reason.
> > 
> > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> >    write_queue.  ie. somebody forgot to update hint while removing the
> >    skb from the write queue.
> 
> Once again I've read the unlinkers through, and only thing that could 
> cause this is tcp_send_synack (others do deal with the hints) but I think 
> Eric already proposed a patch to that but we never got anywhere due to 
> some counterargument why it wouldn't take place (too far away for me to 
> remember, see archives about the discussions). ...But if you want be dead 
> sure some WARN_ON there might not hurt. Also the purging of the whole 
> queue was a similar suspect I then came across (but that would only 
> materialize with sk reuse happening e.g., with nfs which the other guys 
> weren't using).
> 

Hmm.

This sounds familiar to me, but I cannot remember the discussion you
mention or the patch.

Or maybe it was the TCP transaction thing ? (including data in SYN or
SYN-ACK packet)

> > 3. The hint is pointing to a skb on the list but the list itself is
> >    corrupt.
> > 
> > I added some debug code and the crash is happening when
> > tp->retransmit_skb_hint is not NULL but tp->retransmit_skb_hint->next
> > is NULL.  So, #1 is out; unfortunately, I didn't have debug code in
> > place to discern between #2 and #3.
> > 
> > Does anything ring a bell?  This is a production system and debugging
> > affects quite a number of people.  I can put debug code in to discern
> > between #2 and #3 but I'm basically shooting in the dark and it would
> > be great if someone has a better idea.
> 
> Thanks for taking this up. I've been kind of waiting somebody to show up 
> who actually has some way of reproducing it. Once I had one guy in the 
> hook but his ability to reproduce was for some reason lost when he tried 
> with a debug patch [1]. 
> 
> I now realize that the debug patch should probably also print the write 
> queue too when the problem is caught in order to discern the cases you 
> mention.
> 
> Something along these lines:
> 
> tcp_for_write_queue(skb, sk) {
> 	printk("skb %p (%u-%u) next %p prev %p sacked %u\n", ...);
> }
> 
> Anyway, my debugging patch should be such that in a lucky case it avoids 
> crashing the system too, though price to pay might then be a stuck 
> connection. In case #3 I'd expect the box to die elsewhere in TCP code 
> pretty soon anyway so it depends whether avoiding oops is really so 
> useful, but if you're lucky other mechanism in TCP will recover 
> the lost one for you (basically RTO driven retransmission).
> 

^ permalink raw reply

* RE: [PATCH] bnx2x: add support for receive hashing
From: Eric Dumazet @ 2010-07-11 16:45 UTC (permalink / raw)
  To: Vladislav Zolotarov
  Cc: David Miller, therbert@google.com, netdev@vger.kernel.org
In-Reply-To: <8628FE4E7912BF47A96AE7DD7BAC0AADDDE64704FA@SJEXCHCCR02.corp.ad.broadcom.com>

Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> Dave, it's obvious that there a demand for a new HW/FW configuration
> from our side - "rx hash enable" which is currently tightly coupled
> with the RSS capability. As long as RSS flow in our FW includes a few
> more things apart from just creating an RSS hash and as long as there
> are flows (even hypothetical) that demand the RSS hash regardless the
> RSS itself we started to work on separation of these two features from
> FW perspective. This of course will demand a new FW version but once
> we have it we'll be able to be more specific in HW configuration and
> have a cleaner code.
> 
> Technically, our FW may provide the Rx HASH always and in a current
> driver configuration this is what it does.
> I wonder if the driver always provides the HW RX HASH in the
> skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> netdev->features will it cause any harm? If not we can get rid of two
> extra conditionals in the bnx2x_rx_int().

Hi

Please dont top-post on netdev, thanks.

NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
particular NIC if we know the hardware provided rxhash is not fulfilling
our needs (We prefer spend some cpu cycles to recompute a software
rxhash).

Software one for example deliver same rxhash values for both ways of a
TCP flow, it can help conntracking for example.

The conditional in driver rx is cheap, since the condition is the same
for all packets and CPU can predicts the branch.




^ permalink raw reply

* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Ilpo Järvinen @ 2010-07-11 16:09 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David S. Miller, lkml, netdev@vger.kernel.org, Fehrmann, Henning,
	Carsten Aulbert, Eric Dumazet
In-Reply-To: <4C358AAA.9080400@kernel.org>

On Thu, 8 Jul 2010, Tejun Heo wrote:

> We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> Please see the attached photoshoot.  This is happening on a HPC
> cluster and very interestingly caused by one particular job.  How long
> it takes isn't clear yet (at least more than a day) but when it
> happens it happens on a lot of machines in relatively short time.
> 
> With a bit of disassemblying, I've found that the oops is happening
> during tcp_for_write_queue_from() because the skb->next points to
> NULL.
> 
>  void tcp_xmit_retransmit_queue(struct sock *sk)
>  {
>  ...
> 	if (tp->retransmit_skb_hint) {
> 		skb = tp->retransmit_skb_hint;
> 		last_lost = TCP_SKB_CB(skb)->end_seq;
> 		if (after(last_lost, tp->retransmit_high))
> 			last_lost = tp->retransmit_high;
> 	} else {
> 		skb = tcp_write_queue_head(sk);
> 		last_lost = tp->snd_una;
> 	}
> 
>  =>	tcp_for_write_queue_from(skb, sk) {
> 		 __u8 sacked = TCP_SKB_CB(skb)->sacked;
> 
> 		 if (skb == tcp_send_head(sk))
> 			 break;
> 		 /* we could do better than to assign each time */
> 		 if (hole == NULL)
> 
> This can happen for one of the following reasons,
> 
> 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
>    too.  ie. tcp_xmit_retransmit_queue() is called on an empty write
>    queue for some reason.
> 
> 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
>    write_queue.  ie. somebody forgot to update hint while removing the
>    skb from the write queue.

Once again I've read the unlinkers through, and only thing that could 
cause this is tcp_send_synack (others do deal with the hints) but I think 
Eric already proposed a patch to that but we never got anywhere due to 
some counterargument why it wouldn't take place (too far away for me to 
remember, see archives about the discussions). ...But if you want be dead 
sure some WARN_ON there might not hurt. Also the purging of the whole 
queue was a similar suspect I then came across (but that would only 
materialize with sk reuse happening e.g., with nfs which the other guys 
weren't using).

> 3. The hint is pointing to a skb on the list but the list itself is
>    corrupt.
> 
> I added some debug code and the crash is happening when
> tp->retransmit_skb_hint is not NULL but tp->retransmit_skb_hint->next
> is NULL.  So, #1 is out; unfortunately, I didn't have debug code in
> place to discern between #2 and #3.
> 
> Does anything ring a bell?  This is a production system and debugging
> affects quite a number of people.  I can put debug code in to discern
> between #2 and #3 but I'm basically shooting in the dark and it would
> be great if someone has a better idea.

Thanks for taking this up. I've been kind of waiting somebody to show up 
who actually has some way of reproducing it. Once I had one guy in the 
hook but his ability to reproduce was for some reason lost when he tried 
with a debug patch [1]. 

I now realize that the debug patch should probably also print the write 
queue too when the problem is caught in order to discern the cases you 
mention.

Something along these lines:

tcp_for_write_queue(skb, sk) {
	printk("skb %p (%u-%u) next %p prev %p sacked %u\n", ...);
}

Anyway, my debugging patch should be such that in a lucky case it avoids 
crashing the system too, though price to pay might then be a stuck 
connection. In case #3 I'd expect the box to die elsewhere in TCP code 
pretty soon anyway so it depends whether avoiding oops is really so 
useful, but if you're lucky other mechanism in TCP will recover 
the lost one for you (basically RTO driven retransmission).

-- 
 i.

[1] http://marc.info/?l=linux-kernel&m=126624014117610&w=2

^ permalink raw reply

* [PATCHv3] extensions: libxt_CHECKSUM extension
From: Michael S. Tsirkin @ 2010-07-11 15:14 UTC (permalink / raw)
  To: Patrick McHardy, David S. Miller, Jan Engelhardt, Randy Dunlap,
	netfilter-devel
In-Reply-To: <20100711150623.GA7420@redhat.com>

This adds a `CHECKSUM' target, which can be used in the iptables mangle
table.

You can use this target to compute and fill in the checksum in
a packet that lacks a checksum.  This is particularly useful,
if you need to work around old applications such as dhcp clients,
that do not work well with checksum offloads, but don't want to disable
checksum offload in your device.

The problem happens in the field with virtualized applications.
For reference, see Red Hat bz 605555, as well as
http://www.spinics.net/lists/kvm/msg37660.html

Typical expected use (helps old dhclient binary running in a VM):
iptables -A POSTROUTING -t mangle -p udp --dport bootpc \
        -j CHECKSUM --checksum-fill

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Correction in the documentation. Sorry about the noise.

Changes from v2:
	updated man file
Changes from v1:
	switched from ipt to xt

 extensions/libxt_CHECKSUM.c   |   99 +++++++++++++++++++++++++++++++++++++++++
 extensions/libxt_CHECKSUM.man |    8 +++
 2 files changed, 107 insertions(+), 0 deletions(-)
 create mode 100644 extensions/libxt_CHECKSUM.c
 create mode 100644 extensions/libxt_CHECKSUM.man

diff --git a/extensions/libxt_CHECKSUM.c b/extensions/libxt_CHECKSUM.c
new file mode 100644
index 0000000..00fbd8f
--- /dev/null
+++ b/extensions/libxt_CHECKSUM.c
@@ -0,0 +1,99 @@
+/* Shared library add-on to xtables for CHECKSUM
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2010 by Red Hat, Inc
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is distributed under the terms of GNU GPL v2, 1991
+ *
+ * libxt_CHECKSUM.c borrowed some bits from libipt_ECN.c
+ *
+ * $Id$
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <getopt.h>
+
+#include <xtables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+static void CHECKSUM_help(void)
+{
+	printf(
+"CHECKSUM target options\n"
+"  --checksum-fill			Fill in packet checksum.\n");
+}
+
+static const struct option CHECKSUM_opts[] = {
+	{ "checksum-fill", 0, NULL, 'F' },
+	{ .name = NULL }
+};
+
+static int CHECKSUM_parse(int c, char **argv, int invert, unsigned int *flags,
+                     const void *entry, struct xt_entry_target **target)
+{
+	struct xt_CHECKSUM_info *einfo
+		= (struct xt_CHECKSUM_info *)(*target)->data;
+
+	switch (c) {
+	case 'F':
+		if (*flags)
+			xtables_error(PARAMETER_PROBLEM,
+			        "CHECKSUM target: Only use --checksum-fill ONCE!");
+		einfo->operation = XT_CHECKSUM_OP_FILL;
+		*flags |= XT_CHECKSUM_OP_FILL;
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static void CHECKSUM_check(unsigned int flags)
+{
+	if (!flags)
+		xtables_error(PARAMETER_PROBLEM,
+		           "CHECKSUM target: Parameter --checksum-fill is required");
+}
+
+static void CHECKSUM_print(const void *ip, const struct xt_entry_target *target,
+                      int numeric)
+{
+	const struct xt_CHECKSUM_info *einfo =
+		(const struct xt_CHECKSUM_info *)target->data;
+
+	printf("CHECKSUM ");
+
+	if (einfo->operation & XT_CHECKSUM_OP_FILL)
+		printf("fill ");
+}
+
+static void CHECKSUM_save(const void *ip, const struct xt_entry_target *target)
+{
+	const struct xt_CHECKSUM_info *einfo =
+		(const struct xt_CHECKSUM_info *)target->data;
+
+	if (einfo->operation & XT_CHECKSUM_OP_FILL)
+		printf("--checksum-fill ");
+}
+
+static struct xtables_target checksum_tg_reg = {
+	.name		= "CHECKSUM",
+	.version	= XTABLES_VERSION,
+	.family		= NFPROTO_UNSPEC,
+	.size		= XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+	.userspacesize	= XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+	.help		= CHECKSUM_help,
+	.parse		= CHECKSUM_parse,
+	.final_check	= CHECKSUM_check,
+	.print		= CHECKSUM_print,
+	.save		= CHECKSUM_save,
+	.extra_opts	= CHECKSUM_opts,
+};
+
+void _init(void)
+{
+	xtables_register_target(&checksum_tg_reg);
+}
diff --git a/extensions/libxt_CHECKSUM.man b/extensions/libxt_CHECKSUM.man
new file mode 100644
index 0000000..92ae700
--- /dev/null
+++ b/extensions/libxt_CHECKSUM.man
@@ -0,0 +1,8 @@
+This target allows to selectively work around broken/old applications.
+It can only be used in the mangle table.
+.TP
+\fB\-\-checksum\-fill\fP
+Compute and fill in the checksum in a packet that lacks a checksum.
+This is particularly useful, if you need to work around old applications
+such as dhcp clients, that do not work well with checksum offloads,
+but don't want to disable checksum offload in your device.
-- 
1.7.2.rc0.14.g41c1c

^ permalink raw reply related

* [PATCHv2] extensions: libxt_CHECKSUM extension
From: Michael S. Tsirkin @ 2010-07-11 15:08 UTC (permalink / raw)
  To: Patrick McHardy, David S. Miller, Jan Engelhardt, Randy Dunlap,
	netfilter-devel
In-Reply-To: <20100711150623.GA7420@redhat.com>

This adds a `CHECKSUM' target, which can be used in the iptables mangle
table.

You can use this target to compute and fill in the checksum in
a packet that lacks a checksum.  This is particularly useful,
if you need to work around old applications such as dhcp clients,
that do not work well with checksum offloads, but don't want to disable
checksum offload in your device.

The problem happens in the field with virtualized applications.
For reference, see Red Hat bz 605555, as well as
http://www.spinics.net/lists/kvm/msg37660.html

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Changes from v1:
	switched from ipt to xt target

 extensions/libipt_CHECKSUM.man |    8 +++
 extensions/libxt_CHECKSUM.c    |   99 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 0 deletions(-)
 create mode 100644 extensions/libipt_CHECKSUM.man
 create mode 100644 extensions/libxt_CHECKSUM.c

diff --git a/extensions/libipt_CHECKSUM.man b/extensions/libipt_CHECKSUM.man
new file mode 100644
index 0000000..4f83335
--- /dev/null
+++ b/extensions/libipt_CHECKSUM.man
@@ -0,0 +1,8 @@
+This target allows to selectively work around broken/old applications.
+It can only be used in the mangle table.
+.TP
+\fB\-\-checksum\-fill\fP
+Compute and fill in the checksum in a packet that lacks a checksum.
+This is particularly useful, if you need to work around old applications
+such as dhcp clients, that do not work well with checksum offloads,
+but don't want to disable checksum offload in your device.
diff --git a/extensions/libxt_CHECKSUM.c b/extensions/libxt_CHECKSUM.c
new file mode 100644
index 0000000..00fbd8f
--- /dev/null
+++ b/extensions/libxt_CHECKSUM.c
@@ -0,0 +1,99 @@
+/* Shared library add-on to xtables for CHECKSUM
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2010 by Red Hat, Inc
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is distributed under the terms of GNU GPL v2, 1991
+ *
+ * libxt_CHECKSUM.c borrowed some bits from libipt_ECN.c
+ *
+ * $Id$
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <getopt.h>
+
+#include <xtables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+static void CHECKSUM_help(void)
+{
+	printf(
+"CHECKSUM target options\n"
+"  --checksum-fill			Fill in packet checksum.\n");
+}
+
+static const struct option CHECKSUM_opts[] = {
+	{ "checksum-fill", 0, NULL, 'F' },
+	{ .name = NULL }
+};
+
+static int CHECKSUM_parse(int c, char **argv, int invert, unsigned int *flags,
+                     const void *entry, struct xt_entry_target **target)
+{
+	struct xt_CHECKSUM_info *einfo
+		= (struct xt_CHECKSUM_info *)(*target)->data;
+
+	switch (c) {
+	case 'F':
+		if (*flags)
+			xtables_error(PARAMETER_PROBLEM,
+			        "CHECKSUM target: Only use --checksum-fill ONCE!");
+		einfo->operation = XT_CHECKSUM_OP_FILL;
+		*flags |= XT_CHECKSUM_OP_FILL;
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static void CHECKSUM_check(unsigned int flags)
+{
+	if (!flags)
+		xtables_error(PARAMETER_PROBLEM,
+		           "CHECKSUM target: Parameter --checksum-fill is required");
+}
+
+static void CHECKSUM_print(const void *ip, const struct xt_entry_target *target,
+                      int numeric)
+{
+	const struct xt_CHECKSUM_info *einfo =
+		(const struct xt_CHECKSUM_info *)target->data;
+
+	printf("CHECKSUM ");
+
+	if (einfo->operation & XT_CHECKSUM_OP_FILL)
+		printf("fill ");
+}
+
+static void CHECKSUM_save(const void *ip, const struct xt_entry_target *target)
+{
+	const struct xt_CHECKSUM_info *einfo =
+		(const struct xt_CHECKSUM_info *)target->data;
+
+	if (einfo->operation & XT_CHECKSUM_OP_FILL)
+		printf("--checksum-fill ");
+}
+
+static struct xtables_target checksum_tg_reg = {
+	.name		= "CHECKSUM",
+	.version	= XTABLES_VERSION,
+	.family		= NFPROTO_UNSPEC,
+	.size		= XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+	.userspacesize	= XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+	.help		= CHECKSUM_help,
+	.parse		= CHECKSUM_parse,
+	.final_check	= CHECKSUM_check,
+	.print		= CHECKSUM_print,
+	.save		= CHECKSUM_save,
+	.extra_opts	= CHECKSUM_opts,
+};
+
+void _init(void)
+{
+	xtables_register_target(&checksum_tg_reg);
+}

^ permalink raw reply related

* [PATCHv2] netfilter: add CHECKSUM target
From: Michael S. Tsirkin @ 2010-07-11 15:06 UTC (permalink / raw)
  To: Patrick McHardy, Michael S. Tsirkin, David S. Miller,
	Jan Engelhardt, Randy Dunlap

This adds a `CHECKSUM' target, which can be used in the iptables mangle
table.

You can use this target to compute and fill in the checksum in
a packet that lacks a checksum.  This is particularly useful,
if you need to work around old applications such as dhcp clients,
that do not work well with checksum offloads, but don't want to
disable checksum offload in your device.

The problem happens in the field with virtualized applications.
For reference, see Red Hat bz 605555, as well as
http://www.spinics.net/lists/kvm/msg37660.html

Typical expected use (helps old dhclient binary running in a VM):
iptables -A POSTROUTING -t mangle -p udp --dport bootpc \
	-j CHECKSUM --checksum-fill

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Changes from v1:
	moved from ipt to xt
	get rid of any ipv4 dependencies
	coding style tweaks

 include/linux/netfilter/xt_CHECKSUM.h |   18 ++++++++
 net/netfilter/Kconfig                 |   17 +++++++-
 net/netfilter/Makefile                |    1 +
 net/netfilter/xt_CHECKSUM.c           |   72 +++++++++++++++++++++++++++++++++
 4 files changed, 107 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/netfilter/xt_CHECKSUM.h
 create mode 100644 net/netfilter/xt_CHECKSUM.c

diff --git a/include/linux/netfilter/xt_CHECKSUM.h b/include/linux/netfilter/xt_CHECKSUM.h
new file mode 100644
index 0000000..56afe57
--- /dev/null
+++ b/include/linux/netfilter/xt_CHECKSUM.h
@@ -0,0 +1,18 @@
+/* Header file for iptables ipt_CHECKSUM target
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2010 Red Hat Inc
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This software is distributed under GNU GPL v2, 1991
+*/
+#ifndef _IPT_CHECKSUM_TARGET_H
+#define _IPT_CHECKSUM_TARGET_H
+
+#define XT_CHECKSUM_OP_FILL	0x01	/* fill in checksum in IP header */
+
+struct xt_CHECKSUM_info {
+	u_int8_t operation;	/* bitset of operations */
+};
+
+#endif /* _IPT_CHECKSUM_TARGET_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8593a77..1cf4852 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -294,7 +294,7 @@ endif # NF_CONNTRACK
 config NETFILTER_TPROXY
 	tristate "Transparent proxying support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
-	depends on IP_NF_MANGLE
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
 	depends on NETFILTER_ADVANCED
 	help
 	  This option enables transparent proxying support, that is,
@@ -347,6 +347,21 @@ config NETFILTER_XT_CONNMARK
 
 comment "Xtables targets"
 
+config NETFILTER_XT_TARGET_CHECKSUM
+	tristate "CHECKSUM target support"
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a `CHECKSUM' target, which can be used in the iptables mangle
+	  table.  
+
+	  You can use this target to compute and fill in the checksum in
+	  a packet that lacks a checksum.  This is particularly useful,
+	  if you need to work around old applications such as dhcp clients,
+	  that do not work well with checksum offloads, but don't want to disable
+	  checksum offload in your device.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_CLASSIFY
 	tristate '"CLASSIFY" target support'
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 14e3a8f..8eb541d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
 
 # targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
new file mode 100644
index 0000000..0fee1a7
--- /dev/null
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -0,0 +1,72 @@
+/* iptables module for the packet checksum mangling
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * (C) 2010 Red Hat, Inc.
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>");
+MODULE_DESCRIPTION("Xtables: checksum modification");
+MODULE_ALIAS("ipt_CHECKSUM");
+MODULE_ALIAS("ip6t_CHECKSUM");
+
+static unsigned int
+checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		skb_checksum_help(skb);
+
+	return XT_CONTINUE;
+}
+
+static int checksum_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_CHECKSUM_info *einfo = par->targinfo;
+
+	if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
+		pr_info("unsupported CHECKSUM operation %x\n", einfo->operation);
+		return -EINVAL;
+	}
+	if (!einfo->operation) {
+		pr_info("no CHECKSUM operation enabled\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_target checksum_tg_reg __read_mostly = {
+	.name		= "CHECKSUM",
+	.family		= NFPROTO_UNSPEC,
+	.target		= checksum_tg,
+	.targetsize	= sizeof(struct xt_CHECKSUM_info),
+	.table		= "mangle",
+	.checkentry	= checksum_tg_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init checksum_tg_init(void)
+{
+	return xt_register_target(&checksum_tg_reg);
+}
+
+static void __exit checksum_tg_exit(void)
+{
+	xt_unregister_target(&checksum_tg_reg);
+}
+
+module_init(checksum_tg_init);
+module_exit(checksum_tg_exit);
-- 
1.7.2.rc0.14.g41c1c

^ permalink raw reply related

* Re: [PATCH] netfilter: add CHECKSUM target
From: Michael S. Tsirkin @ 2010-07-11 13:19 UTC (permalink / raw)
  To: Changli Gao
  Cc: Jan Engelhardt, Patrick McHardy, David S. Miller,
	Alexey Kuznetsov, Pekka Savola (ipv6), James Morris,
	Hideaki YOSHIFUJI, linux-kernel, netfilter-devel, netfilter,
	coreteam, netdev, herbert.xu, kvm
In-Reply-To: <AANLkTikamhUUuoQ8EjwPn9gB4zHZkeDuzpiqd4XcN3lx@mail.gmail.com>

On Sun, Jul 11, 2010 at 08:45:01PM +0800, Changli Gao wrote:
> On Sun, Jul 11, 2010 at 8:27 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Fri, Jul 09, 2010 at 11:56:26AM +0200, Jan Engelhardt wrote:
> >>
> >> On Friday 2010-07-09 00:29, Michael S. Tsirkin wrote:
> >> >
> >> > include/linux/netfilter_ipv4/ipt_CHECKSUM.h |   18 +++++++
> >> > net/ipv4/netfilter/Kconfig                  |   16 ++++++
> >> > net/ipv4/netfilter/Makefile                 |    1 +
> >> > net/ipv4/netfilter/ipt_CHECKSUM.c           |   72 +++++++++++++++++++++++++++
> >>
> >> New modules should use xt.
> >
> > I tried moving the module to xt_CHECKSUM but now
> >        iptables -A POSTROUTING -t mangle -p udp --dport 68 -j CHECKSUM --checksum-fill
> > does not to load the module.
> > It seems that xt_request_find_target in x_tables uses the prefix 'ip' for a module.
> > What am I missing?
> >
> 
> You should add MODULE_ALIAS clause, i.e.
> 
> MODULE_ALIAS("ipt_CHECKSUM");

This worked, thanks!

> -- 
> Regards,
> Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox