Netdev List
 help / color / mirror / Atom feed
* [net-next-2.6 PATCH] ixgbe: use netif_<level> instead of netdev_<level>
From: Jeff Kirsher @ 2010-07-02  6:05 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, bphilips, Joe Perches, Emil Tantilov, Jeff Kirsher

From: Emil Tantilov <emil.s.tantilov@intel.com>

This patch restores the ability to set msglvl through ethtool.
The issue was introduced by:
commit 849c45423c0c108e08d67644728cc9b0ed225fa1

CC: Joe Perches <joe@perches.com>

Reported-by: Joe Perches <joe@perches.com>
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_82599.c   |    2 -
 drivers/net/ixgbe/ixgbe_common.h  |   19 ++++-----
 drivers/net/ixgbe/ixgbe_dcb_nl.c  |    2 -
 drivers/net/ixgbe/ixgbe_ethtool.c |   40 ++++++++++---------
 drivers/net/ixgbe/ixgbe_fcoe.c    |   26 ++++++------
 drivers/net/ixgbe/ixgbe_main.c    |   79 +++++++++++++++++++------------------
 drivers/net/ixgbe/ixgbe_sriov.c   |    7 ++-
 7 files changed, 89 insertions(+), 86 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c
index 0ee175a..3e06a61 100644
--- a/drivers/net/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ixgbe/ixgbe_82599.c
@@ -715,7 +715,7 @@ static s32 ixgbe_setup_mac_link_smartspeed(struct ixgbe_hw *hw,
 
 out:
 	if (link_up && (link_speed == IXGBE_LINK_SPEED_1GB_FULL))
-		e_info("Smartspeed has downgraded the link speed from "
+		e_info(hw, "Smartspeed has downgraded the link speed from "
 		       "the maximum advertised\n");
 	return status;
 }
diff --git a/drivers/net/ixgbe/ixgbe_common.h b/drivers/net/ixgbe/ixgbe_common.h
index d5d3aae..5cf15aa 100644
--- a/drivers/net/ixgbe/ixgbe_common.h
+++ b/drivers/net/ixgbe/ixgbe_common.h
@@ -108,16 +108,6 @@ s32 ixgbe_blink_led_stop_generic(struct ixgbe_hw *hw, u32 index);
 extern struct net_device *ixgbe_get_hw_dev(struct ixgbe_hw *hw);
 #define hw_dbg(hw, format, arg...) \
 	netdev_dbg(ixgbe_get_hw_dev(hw), format, ##arg)
-#define e_err(format, arg...) \
-	netdev_err(adapter->netdev, format, ## arg)
-#define e_info(format, arg...) \
-	netdev_info(adapter->netdev, format, ## arg)
-#define e_warn(format, arg...) \
-	netdev_warn(adapter->netdev, format, ## arg)
-#define e_notice(format, arg...) \
-	netdev_notice(adapter->netdev, format, ## arg)
-#define e_crit(format, arg...) \
-	netdev_crit(adapter->netdev, format, ## arg)
 #define e_dev_info(format, arg...) \
 	dev_info(&adapter->pdev->dev, format, ## arg)
 #define e_dev_warn(format, arg...) \
@@ -126,5 +116,12 @@ extern struct net_device *ixgbe_get_hw_dev(struct ixgbe_hw *hw);
 	dev_err(&adapter->pdev->dev, format, ## arg)
 #define e_dev_notice(format, arg...) \
 	dev_notice(&adapter->pdev->dev, format, ## arg)
-
+#define e_info(msglvl, format, arg...) \
+	netif_info(adapter, msglvl, adapter->netdev, format, ## arg)
+#define e_err(msglvl, format, arg...) \
+	netif_err(adapter, msglvl, adapter->netdev, format, ## arg)
+#define e_warn(msglvl, format, arg...) \
+	netif_warn(adapter, msglvl, adapter->netdev, format, ## arg)
+#define e_crit(msglvl, format, arg...) \
+	netif_crit(adapter, msglvl, adapter->netdev, format, ## arg)
 #endif /* IXGBE_COMMON */
diff --git a/drivers/net/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ixgbe/ixgbe_dcb_nl.c
index 6576235..b53b465 100644
--- a/drivers/net/ixgbe/ixgbe_dcb_nl.c
+++ b/drivers/net/ixgbe/ixgbe_dcb_nl.c
@@ -121,7 +121,7 @@ static u8 ixgbe_dcbnl_set_state(struct net_device *netdev, u8 state)
 			goto out;
 
 		if (!(adapter->flags & IXGBE_FLAG_MSIX_ENABLED)) {
-			e_err("Enable failed, needs MSI-X\n");
+			e_err(drv, "Enable failed, needs MSI-X\n");
 			err = 1;
 			goto out;
 		}
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index 5275e9c..b35ef36 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -301,7 +301,7 @@ static int ixgbe_set_settings(struct net_device *netdev,
 		hw->mac.autotry_restart = true;
 		err = hw->mac.ops.setup_link(hw, advertised, true, true);
 		if (err) {
-			e_info("setup link failed with code %d\n", err);
+			e_info(probe, "setup link failed with code %d\n", err);
 			hw->mac.ops.setup_link(hw, old, true, true);
 		}
 	} else {
@@ -1194,8 +1194,8 @@ static struct ixgbe_reg_test reg_test_82598[] = {
 		writel((_test[pat] & W), (adapter->hw.hw_addr + R));          \
 		val = readl(adapter->hw.hw_addr + R);                         \
 		if (val != (_test[pat] & W & M)) {                            \
-			e_err("pattern test reg %04X failed: got "	\
-			      "0x%08X expected 0x%08X\n",		\
+			e_err(drv, "pattern test reg %04X failed: got "   \
+			      "0x%08X expected 0x%08X\n",		      \
 			      R, val, (_test[pat] & W & M));                \
 			*data = R;                                            \
 			writel(before, adapter->hw.hw_addr + R);              \
@@ -1212,8 +1212,8 @@ static struct ixgbe_reg_test reg_test_82598[] = {
 	writel((W & M), (adapter->hw.hw_addr + R));                           \
 	val = readl(adapter->hw.hw_addr + R);                                 \
 	if ((W & M) != (val & M)) {                                           \
-		e_err("set/check reg %04X test failed: got 0x%08X "	\
-		      "expected 0x%08X\n", R, (val & M), (W & M));	\
+		e_err(drv, "set/check reg %04X test failed: got 0x%08X "  \
+		      "expected 0x%08X\n", R, (val & M), (W & M));        \
 		*data = R;                                                    \
 		writel(before, (adapter->hw.hw_addr + R));                    \
 		return 1;                                                     \
@@ -1246,8 +1246,8 @@ static int ixgbe_reg_test(struct ixgbe_adapter *adapter, u64 *data)
 	IXGBE_WRITE_REG(&adapter->hw, IXGBE_STATUS, toggle);
 	after = IXGBE_READ_REG(&adapter->hw, IXGBE_STATUS) & toggle;
 	if (value != after) {
-		e_err("failed STATUS register test got: 0x%08X expected: "
-		      "0x%08X\n", after, value);
+		e_err(drv, "failed STATUS register test got: 0x%08X "
+		      "expected: 0x%08X\n", after, value);
 		*data = 1;
 		return 1;
 	}
@@ -1347,8 +1347,8 @@ static int ixgbe_intr_test(struct ixgbe_adapter *adapter, u64 *data)
 		*data = 1;
 		return -1;
 	}
-	e_info("testing %s interrupt\n", shared_int ?
-		   "shared" : "unshared");
+	e_info(hw, "testing %s interrupt\n", shared_int ?
+	       "shared" : "unshared");
 
 	/* Disable all the interrupts */
 	IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, 0xFFFFFFFF);
@@ -1853,7 +1853,7 @@ static void ixgbe_diag_test(struct net_device *netdev,
 	if (eth_test->flags == ETH_TEST_FL_OFFLINE) {
 		/* Offline tests */
 
-		e_info("offline testing starting\n");
+		e_info(hw, "offline testing starting\n");
 
 		/* Link test performed before hardware reset so autoneg doesn't
 		 * interfere with test result */
@@ -1886,17 +1886,17 @@ static void ixgbe_diag_test(struct net_device *netdev,
 		else
 			ixgbe_reset(adapter);
 
-		e_info("register testing starting\n");
+		e_info(hw, "register testing starting\n");
 		if (ixgbe_reg_test(adapter, &data[0]))
 			eth_test->flags |= ETH_TEST_FL_FAILED;
 
 		ixgbe_reset(adapter);
-		e_info("eeprom testing starting\n");
+		e_info(hw, "eeprom testing starting\n");
 		if (ixgbe_eeprom_test(adapter, &data[1]))
 			eth_test->flags |= ETH_TEST_FL_FAILED;
 
 		ixgbe_reset(adapter);
-		e_info("interrupt testing starting\n");
+		e_info(hw, "interrupt testing starting\n");
 		if (ixgbe_intr_test(adapter, &data[2]))
 			eth_test->flags |= ETH_TEST_FL_FAILED;
 
@@ -1904,13 +1904,14 @@ static void ixgbe_diag_test(struct net_device *netdev,
 		 * loopback diagnostic. */
 		if (adapter->flags & (IXGBE_FLAG_SRIOV_ENABLED |
 				      IXGBE_FLAG_VMDQ_ENABLED)) {
-			e_info("Skip MAC loopback diagnostic in VT mode\n");
+			e_info(hw, "Skip MAC loopback diagnostic in VT "
+			       "mode\n");
 			data[3] = 0;
 			goto skip_loopback;
 		}
 
 		ixgbe_reset(adapter);
-		e_info("loopback testing starting\n");
+		e_info(hw, "loopback testing starting\n");
 		if (ixgbe_loopback_test(adapter, &data[3]))
 			eth_test->flags |= ETH_TEST_FL_FAILED;
 
@@ -1921,7 +1922,7 @@ skip_loopback:
 		if (if_running)
 			dev_open(netdev);
 	} else {
-		e_info("online testing starting\n");
+		e_info(hw, "online testing starting\n");
 		/* Online tests */
 		if (ixgbe_link_test(adapter, &data[4]))
 			eth_test->flags |= ETH_TEST_FL_FAILED;
@@ -2139,7 +2140,8 @@ static int ixgbe_set_coalesce(struct net_device *netdev,
 			adapter->flags2 &= ~IXGBE_FLAG2_RSC_ENABLED;
 			if (netdev->features & NETIF_F_LRO) {
 				netdev->features &= ~NETIF_F_LRO;
-				e_info("rx-usecs set to 0, disabling RSC\n");
+				e_info(probe, "rx-usecs set to 0, "
+				       "disabling RSC\n");
 			}
 			need_reset = true;
 		}
@@ -2239,8 +2241,8 @@ static int ixgbe_set_flags(struct net_device *netdev, u32 data)
 		} else if (!adapter->rx_itr_setting) {
 			netdev->features &= ~NETIF_F_LRO;
 			if (data & ETH_FLAG_LRO)
-				e_info("rx-usecs set to 0, "
-					"LRO/RSC cannot be enabled.\n");
+				e_info(probe, "rx-usecs set to 0, "
+				       "LRO/RSC cannot be enabled.\n");
 		}
 	}
 
diff --git a/drivers/net/ixgbe/ixgbe_fcoe.c b/drivers/net/ixgbe/ixgbe_fcoe.c
index 84e1194..f6ef4cd 100644
--- a/drivers/net/ixgbe/ixgbe_fcoe.c
+++ b/drivers/net/ixgbe/ixgbe_fcoe.c
@@ -164,20 +164,20 @@ int ixgbe_fcoe_ddp_get(struct net_device *netdev, u16 xid,
 
 	adapter = netdev_priv(netdev);
 	if (xid >= IXGBE_FCOE_DDP_MAX) {
-		e_warn("xid=0x%x out-of-range\n", xid);
+		e_warn(drv, "xid=0x%x out-of-range\n", xid);
 		return 0;
 	}
 
 	fcoe = &adapter->fcoe;
 	if (!fcoe->pool) {
-		e_warn("xid=0x%x no ddp pool for fcoe\n", xid);
+		e_warn(drv, "xid=0x%x no ddp pool for fcoe\n", xid);
 		return 0;
 	}
 
 	ddp = &fcoe->ddp[xid];
 	if (ddp->sgl) {
-		e_err("xid 0x%x w/ non-null sgl=%p nents=%d\n",
-			  xid, ddp->sgl, ddp->sgc);
+		e_err(drv, "xid 0x%x w/ non-null sgl=%p nents=%d\n",
+		      xid, ddp->sgl, ddp->sgc);
 		return 0;
 	}
 	ixgbe_fcoe_clear_ddp(ddp);
@@ -185,14 +185,14 @@ int ixgbe_fcoe_ddp_get(struct net_device *netdev, u16 xid,
 	/* setup dma from scsi command sgl */
 	dmacount = pci_map_sg(adapter->pdev, sgl, sgc, DMA_FROM_DEVICE);
 	if (dmacount == 0) {
-		e_err("xid 0x%x DMA map error\n", xid);
+		e_err(drv, "xid 0x%x DMA map error\n", xid);
 		return 0;
 	}
 
 	/* alloc the udl from our ddp pool */
 	ddp->udl = pci_pool_alloc(fcoe->pool, GFP_KERNEL, &ddp->udp);
 	if (!ddp->udl) {
-		e_err("failed allocated ddp context\n");
+		e_err(drv, "failed allocated ddp context\n");
 		goto out_noddp_unmap;
 	}
 	ddp->sgl = sgl;
@@ -205,7 +205,7 @@ int ixgbe_fcoe_ddp_get(struct net_device *netdev, u16 xid,
 		while (len) {
 			/* max number of buffers allowed in one DDP context */
 			if (j >= IXGBE_BUFFCNT_MAX) {
-				e_err("xid=%x:%d,%d,%d:addr=%llx "
+				e_err(drv, "xid=%x:%d,%d,%d:addr=%llx "
 				      "not enough descriptors\n",
 				      xid, i, j, dmacount, (u64)addr);
 				goto out_noddp_free;
@@ -385,7 +385,7 @@ int ixgbe_fso(struct ixgbe_adapter *adapter,
 	struct fc_frame_header *fh;
 
 	if (skb_is_gso(skb) && (skb_shinfo(skb)->gso_type != SKB_GSO_FCOE)) {
-		e_err("Wrong gso type %d:expecting SKB_GSO_FCOE\n",
+		e_err(drv, "Wrong gso type %d:expecting SKB_GSO_FCOE\n",
 		      skb_shinfo(skb)->gso_type);
 		return -EINVAL;
 	}
@@ -412,7 +412,7 @@ int ixgbe_fso(struct ixgbe_adapter *adapter,
 		fcoe_sof_eof |= IXGBE_ADVTXD_FCOEF_SOF;
 		break;
 	default:
-		e_warn("unknown sof = 0x%x\n", sof);
+		e_warn(drv, "unknown sof = 0x%x\n", sof);
 		return -EINVAL;
 	}
 
@@ -439,7 +439,7 @@ int ixgbe_fso(struct ixgbe_adapter *adapter,
 		fcoe_sof_eof |= IXGBE_ADVTXD_FCOEF_EOF_A;
 		break;
 	default:
-		e_warn("unknown eof = 0x%x\n", eof);
+		e_warn(drv, "unknown eof = 0x%x\n", eof);
 		return -EINVAL;
 	}
 
@@ -515,7 +515,7 @@ void ixgbe_configure_fcoe(struct ixgbe_adapter *adapter)
 					     adapter->pdev, IXGBE_FCPTR_MAX,
 					     IXGBE_FCPTR_ALIGN, PAGE_SIZE);
 		if (!fcoe->pool)
-			e_err("failed to allocated FCoE DDP pool\n");
+			e_err(drv, "failed to allocated FCoE DDP pool\n");
 
 		spin_lock_init(&fcoe->lock);
 	}
@@ -611,7 +611,7 @@ int ixgbe_fcoe_enable(struct net_device *netdev)
 	if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED)
 		goto out_enable;
 
-	e_info("Enabling FCoE offload features.\n");
+	e_info(drv, "Enabling FCoE offload features.\n");
 	if (netif_running(netdev))
 		netdev->netdev_ops->ndo_stop(netdev);
 
@@ -657,7 +657,7 @@ int ixgbe_fcoe_disable(struct net_device *netdev)
 	if (!(adapter->flags & IXGBE_FLAG_FCOE_ENABLED))
 		goto out_disable;
 
-	e_info("Disabling FCoE offload features.\n");
+	e_info(drv, "Disabling FCoE offload features.\n");
 	if (netif_running(netdev))
 		netdev->netdev_ops->ndo_stop(netdev);
 
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index dd46345..8c7617b 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -696,7 +696,7 @@ static inline bool ixgbe_check_tx_hang(struct ixgbe_adapter *adapter,
 		/* detected Tx unit hang */
 		union ixgbe_adv_tx_desc *tx_desc;
 		tx_desc = IXGBE_TX_DESC_ADV(*tx_ring, eop);
-		e_err("Detected Tx Unit Hang\n"
+		e_err(drv, "Detected Tx Unit Hang\n"
 		      "  Tx Queue             <%d>\n"
 		      "  TDH, TDT             <%x>, <%x>\n"
 		      "  next_to_use          <%x>\n"
@@ -812,8 +812,8 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 	if (adapter->detect_tx_hung) {
 		if (ixgbe_check_tx_hang(adapter, tx_ring, i)) {
 			/* schedule immediate reset if we believe we hung */
-			e_info("tx hang %d detected, resetting adapter\n",
-			       adapter->tx_timeout_count + 1);
+			e_info(probe, "tx hang %d detected, resetting "
+			       "adapter\n", adapter->tx_timeout_count + 1);
 			ixgbe_tx_timeout(adapter->netdev);
 		}
 	}
@@ -1652,8 +1652,8 @@ static void ixgbe_check_overtemp_task(struct work_struct *work)
 				return;
 			break;
 		}
-		e_crit("Network adapter has been stopped because it "
-		       "has over heated. Restart the computer. If the problem "
+		e_crit(drv, "Network adapter has been stopped because it has "
+		       "over heated. Restart the computer. If the problem "
 		       "persists, power off the system and replace the "
 		       "adapter\n");
 		/* write to clear the interrupt */
@@ -1667,7 +1667,7 @@ static void ixgbe_check_fan_failure(struct ixgbe_adapter *adapter, u32 eicr)
 
 	if ((adapter->flags & IXGBE_FLAG_FAN_FAIL_CAPABLE) &&
 	    (eicr & IXGBE_EICR_GPI_SDP1)) {
-		e_crit("Fan has stopped, replace the adapter\n");
+		e_crit(probe, "Fan has stopped, replace the adapter\n");
 		/* write to clear the interrupt */
 		IXGBE_WRITE_REG(hw, IXGBE_EICR, IXGBE_EICR_GPI_SDP1);
 	}
@@ -2153,7 +2153,7 @@ static int ixgbe_request_msix_irqs(struct ixgbe_adapter *adapter)
 		                  handler, 0, adapter->name[vector],
 		                  adapter->q_vector[vector]);
 		if (err) {
-			e_err("request_irq failed for MSIX interrupt: "
+			e_err(probe, "request_irq failed for MSIX interrupt "
 			      "Error: %d\n", err);
 			goto free_queue_irqs;
 		}
@@ -2163,7 +2163,7 @@ static int ixgbe_request_msix_irqs(struct ixgbe_adapter *adapter)
 	err = request_irq(adapter->msix_entries[vector].vector,
 	                  ixgbe_msix_lsc, 0, adapter->name[vector], netdev);
 	if (err) {
-		e_err("request_irq for msix_lsc failed: %d\n", err);
+		e_err(probe, "request_irq for msix_lsc failed: %d\n", err);
 		goto free_queue_irqs;
 	}
 
@@ -2349,7 +2349,7 @@ static int ixgbe_request_irq(struct ixgbe_adapter *adapter)
 	}
 
 	if (err)
-		e_err("request_irq failed, Error %d\n", err);
+		e_err(probe, "request_irq failed, Error %d\n", err);
 
 	return err;
 }
@@ -2420,7 +2420,7 @@ static void ixgbe_configure_msi_and_legacy(struct ixgbe_adapter *adapter)
 	map_vector_to_rxq(adapter, 0, 0);
 	map_vector_to_txq(adapter, 0, 0);
 
-	e_info("Legacy interrupt IVAR setup done\n");
+	e_info(hw, "Legacy interrupt IVAR setup done\n");
 }
 
 /**
@@ -3316,7 +3316,7 @@ static inline void ixgbe_rx_desc_queue_enable(struct ixgbe_adapter *adapter,
 			msleep(1);
 	}
 	if (k >= IXGBE_MAX_RX_DESC_POLL) {
-		e_err("RXDCTL.ENABLE on Rx queue %d not set within "
+		e_err(drv, "RXDCTL.ENABLE on Rx queue %d not set within "
 		      "the polling period\n", rxr);
 	}
 	ixgbe_release_rx_desc(&adapter->hw, adapter->rx_ring[rxr],
@@ -3446,7 +3446,7 @@ static int ixgbe_up_complete(struct ixgbe_adapter *adapter)
 			} while (--wait_loop &&
 			         !(txdctl & IXGBE_TXDCTL_ENABLE));
 			if (!wait_loop)
-				e_err("Could not enable Tx Queue %d\n", j);
+				e_err(drv, "Could not enable Tx Queue %d\n", j);
 		}
 	}
 
@@ -3494,7 +3494,7 @@ static int ixgbe_up_complete(struct ixgbe_adapter *adapter)
 	if (adapter->flags & IXGBE_FLAG_FAN_FAIL_CAPABLE) {
 		u32 esdp = IXGBE_READ_REG(hw, IXGBE_ESDP);
 		if (esdp & IXGBE_ESDP_SDP1)
-			e_crit("Fan has stopped, replace the adapter\n");
+			e_crit(drv, "Fan has stopped, replace the adapter\n");
 	}
 
 	/*
@@ -3523,7 +3523,7 @@ static int ixgbe_up_complete(struct ixgbe_adapter *adapter)
 	} else {
 		err = ixgbe_non_sfp_link_config(hw);
 		if (err)
-			e_err("link_config FAILED %d\n", err);
+			e_err(probe, "link_config FAILED %d\n", err);
 	}
 
 	for (i = 0; i < adapter->num_tx_queues; i++)
@@ -3977,12 +3977,12 @@ static inline bool ixgbe_set_fcoe_queues(struct ixgbe_adapter *adapter)
 		adapter->num_tx_queues = 1;
 #ifdef CONFIG_IXGBE_DCB
 		if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
-			e_info("FCoE enabled with DCB\n");
+			e_info(probe, "FCoE enabled with DCB\n");
 			ixgbe_set_dcb_queues(adapter);
 		}
 #endif
 		if (adapter->flags & IXGBE_FLAG_RSS_ENABLED) {
-			e_info("FCoE enabled with RSS\n");
+			e_info(probe, "FCoE enabled with RSS\n");
 			if ((adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE) ||
 			    (adapter->flags & IXGBE_FLAG_FDIR_PERFECT_CAPABLE))
 				ixgbe_set_fdir_queues(adapter);
@@ -4633,8 +4633,8 @@ int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter)
 	}
 
 	e_dev_info("Multiqueue %s: Rx Queue count = %u, Tx Queue count = %u\n",
-	       (adapter->num_rx_queues > 1) ? "Enabled" : "Disabled",
-	       adapter->num_rx_queues, adapter->num_tx_queues);
+		   (adapter->num_rx_queues > 1) ? "Enabled" : "Disabled",
+		   adapter->num_rx_queues, adapter->num_tx_queues);
 
 	set_bit(__IXGBE_DOWN, &adapter->state);
 
@@ -4711,7 +4711,7 @@ static void ixgbe_sfp_task(struct work_struct *work)
 				  "supported module.\n");
 			unregister_netdev(adapter->netdev);
 		} else {
-			e_info("detected SFP+: %d\n", hw->phy.sfp_type);
+			e_info(probe, "detected SFP+: %d\n", hw->phy.sfp_type);
 		}
 		/* don't need this routine any more */
 		clear_bit(__IXGBE_SFP_MODULE_NOT_FOUND, &adapter->state);
@@ -4891,7 +4891,7 @@ int ixgbe_setup_tx_resources(struct ixgbe_adapter *adapter,
 err:
 	vfree(tx_ring->tx_buffer_info);
 	tx_ring->tx_buffer_info = NULL;
-	e_err("Unable to allocate memory for the Tx descriptor ring\n");
+	e_err(probe, "Unable to allocate memory for the Tx descriptor ring\n");
 	return -ENOMEM;
 }
 
@@ -4913,7 +4913,7 @@ static int ixgbe_setup_all_tx_resources(struct ixgbe_adapter *adapter)
 		err = ixgbe_setup_tx_resources(adapter, adapter->tx_ring[i]);
 		if (!err)
 			continue;
-		e_err("Allocation for Tx Queue %u failed\n", i);
+		e_err(probe, "Allocation for Tx Queue %u failed\n", i);
 		break;
 	}
 
@@ -4938,7 +4938,8 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 	if (!rx_ring->rx_buffer_info)
 		rx_ring->rx_buffer_info = vmalloc(size);
 	if (!rx_ring->rx_buffer_info) {
-		e_err("vmalloc allocation failed for the Rx desc ring\n");
+		e_err(probe, "vmalloc allocation failed for the Rx "
+		      "descriptor ring\n");
 		goto alloc_failed;
 	}
 	memset(rx_ring->rx_buffer_info, 0, size);
@@ -4951,7 +4952,8 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 					   &rx_ring->dma, GFP_KERNEL);
 
 	if (!rx_ring->desc) {
-		e_err("Memory allocation failed for the Rx desc ring\n");
+		e_err(probe, "Memory allocation failed for the Rx "
+		      "descriptor ring\n");
 		vfree(rx_ring->rx_buffer_info);
 		goto alloc_failed;
 	}
@@ -4984,7 +4986,7 @@ static int ixgbe_setup_all_rx_resources(struct ixgbe_adapter *adapter)
 		err = ixgbe_setup_rx_resources(adapter, adapter->rx_ring[i]);
 		if (!err)
 			continue;
-		e_err("Allocation for Rx Queue %u failed\n", i);
+		e_err(probe, "Allocation for Rx Queue %u failed\n", i);
 		break;
 	}
 
@@ -5083,7 +5085,7 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu)
 	if ((new_mtu < 68) || (max_frame > IXGBE_MAX_JUMBO_FRAME_SIZE))
 		return -EINVAL;
 
-	e_info("changing MTU from %d to %d\n", netdev->mtu, new_mtu);
+	e_info(probe, "changing MTU from %d to %d\n", netdev->mtu, new_mtu);
 	/* must set new MTU before calling down or up */
 	netdev->mtu = new_mtu;
 
@@ -5598,7 +5600,7 @@ static void ixgbe_fdir_reinit_task(struct work_struct *work)
 			set_bit(__IXGBE_FDIR_INIT_DONE,
 			        &(adapter->tx_ring[i]->reinit_state));
 	} else {
-		e_err("failed to finish FDIR re-initialization, "
+		e_err(probe, "failed to finish FDIR re-initialization, "
 		      "ignored adding FDIR ATR filters\n");
 	}
 	/* Done FDIR Re-initialization, enable transmits */
@@ -5670,7 +5672,7 @@ static void ixgbe_watchdog_task(struct work_struct *work)
 				flow_tx = !!(rmcs & IXGBE_RMCS_TFCE_802_3X);
 			}
 
-			e_info("NIC Link is Up %s, Flow Control: %s\n",
+			e_info(drv, "NIC Link is Up %s, Flow Control: %s\n",
 			       (link_speed == IXGBE_LINK_SPEED_10GB_FULL ?
 			       "10 Gbps" :
 			       (link_speed == IXGBE_LINK_SPEED_1GB_FULL ?
@@ -5688,7 +5690,7 @@ static void ixgbe_watchdog_task(struct work_struct *work)
 		adapter->link_up = false;
 		adapter->link_speed = 0;
 		if (netif_carrier_ok(netdev)) {
-			e_info("NIC Link is Down\n");
+			e_info(drv, "NIC Link is Down\n");
 			netif_carrier_off(netdev);
 		}
 	}
@@ -5864,8 +5866,9 @@ static bool ixgbe_tx_csum(struct ixgbe_adapter *adapter,
 				break;
 			default:
 				if (unlikely(net_ratelimit())) {
-					e_warn("partial checksum but "
-					       "proto=%x!\n", skb->protocol);
+					e_warn(probe, "partial checksum "
+					       "but proto=%x!\n",
+					       skb->protocol);
 				}
 				break;
 			}
@@ -6475,7 +6478,7 @@ static void __devinit ixgbe_probe_vf(struct ixgbe_adapter *adapter,
 	adapter->flags |= IXGBE_FLAG_SRIOV_ENABLED;
 	err = pci_enable_sriov(adapter->pdev, adapter->num_vfs);
 	if (err) {
-		e_err("Failed to enable PCI sriov: %d\n", err);
+		e_err(probe, "Failed to enable PCI sriov: %d\n", err);
 		goto err_novfs;
 	}
 	/* If call to enable VFs succeeded then allocate memory
@@ -6499,8 +6502,8 @@ static void __devinit ixgbe_probe_vf(struct ixgbe_adapter *adapter,
 	}
 
 	/* Oh oh */
-	e_err("Unable to allocate memory for VF Data Storage - SRIOV "
-	      "disabled\n");
+	e_err(probe, "Unable to allocate memory for VF Data Storage - "
+	      "SRIOV disabled\n");
 	pci_disable_sriov(adapter->pdev);
 
 err_novfs:
@@ -6670,7 +6673,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	if (adapter->flags & IXGBE_FLAG_FAN_FAIL_CAPABLE) {
 		u32 esdp = IXGBE_READ_REG(hw, IXGBE_ESDP);
 		if (esdp & IXGBE_ESDP_SDP1)
-			e_crit("Fan has stopped, replace the adapter\n");
+			e_crit(probe, "Fan has stopped, replace the adapter\n");
 	}
 
 	/* reset_hw fills in the perm_addr as well */
@@ -6701,7 +6704,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 
 	ixgbe_probe_vf(adapter, ii);
 
-	netdev->features =    NETIF_F_SG |
+	netdev->features = NETIF_F_SG |
 	                   NETIF_F_IP_CSUM |
 	                   NETIF_F_HW_VLAN_TX |
 	                   NETIF_F_HW_VLAN_RX |
@@ -6854,7 +6857,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	}
 #endif
 	if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) {
-		e_info("IOV is enabled with %d VFs\n", adapter->num_vfs);
+		e_info(probe, "IOV is enabled with %d VFs\n", adapter->num_vfs);
 		for (i = 0; i < adapter->num_vfs; i++)
 			ixgbe_vf_configuration(pdev, (i | 0x10000000));
 	}
@@ -7002,7 +7005,7 @@ static pci_ers_result_t ixgbe_io_slot_reset(struct pci_dev *pdev)
 	int err;
 
 	if (pci_enable_device_mem(pdev)) {
-		e_err("Cannot re-enable PCI device after reset.\n");
+		e_err(probe, "Cannot re-enable PCI device after reset.\n");
 		result = PCI_ERS_RESULT_DISCONNECT;
 	} else {
 		pci_set_master(pdev);
@@ -7040,7 +7043,7 @@ static void ixgbe_io_resume(struct pci_dev *pdev)
 
 	if (netif_running(netdev)) {
 		if (ixgbe_up(adapter)) {
-			e_info("ixgbe_up failed after reset\n");
+			e_info(probe, "ixgbe_up failed after reset\n");
 			return;
 		}
 	}
diff --git a/drivers/net/ixgbe/ixgbe_sriov.c b/drivers/net/ixgbe/ixgbe_sriov.c
index 6e6dee0..49661a1 100644
--- a/drivers/net/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ixgbe/ixgbe_sriov.c
@@ -185,7 +185,8 @@ int ixgbe_vf_configuration(struct pci_dev *pdev, unsigned int event_mask)
 
 	if (enable) {
 		random_ether_addr(vf_mac_addr);
-		e_info("IOV: VF %d is enabled MAC %pM\n", vfn, vf_mac_addr);
+		e_info(probe, "IOV: VF %d is enabled MAC %pM\n",
+		       vfn, vf_mac_addr);
 		/*
 		 * Store away the VF "permananet" MAC address, it will ask
 		 * for it later.
@@ -244,7 +245,7 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 	if (msgbuf[0] == IXGBE_VF_RESET) {
 		unsigned char *vf_mac = adapter->vfinfo[vf].vf_mac_addresses;
 		u8 *addr = (u8 *)(&msgbuf[1]);
-		e_info("VF Reset msg received from vf %d\n", vf);
+		e_info(probe, "VF Reset msg received from vf %d\n", vf);
 		adapter->vfinfo[vf].clear_to_send = false;
 		ixgbe_vf_reset_msg(adapter, vf);
 		adapter->vfinfo[vf].clear_to_send = true;
@@ -297,7 +298,7 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 		retval = ixgbe_set_vf_vlan(adapter, add, vid, vf);
 		break;
 	default:
-		e_err("Unhandled Msg %8.8x\n", msgbuf[0]);
+		e_err(drv, "Unhandled Msg %8.8x\n", msgbuf[0]);
 		retval = IXGBE_ERR_MBX;
 		break;
 	}


^ permalink raw reply related

* [PATCH] netfilter: postpone the checksum calculation
From: Changli Gao @ 2010-07-02  6:02 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, netfilter-devel, netdev, Changli Gao

netfilter: postpone the checksum calculation.

postpone the checksum calculation, then if the output NIC supports checksum
offloading, we can utlize it. And though the output NIC doesn't support
checksum offloading, but we'll mangle this packet, this can free us from
updating the checksum, as the checksum calculation occurs later.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 net/ipv4/netfilter/ipt_REJECT.c |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index f5f4a88..3d0e064 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -95,10 +95,11 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	}
 
 	tcph->rst	= 1;
-	tcph->check	= tcp_v4_check(sizeof(struct tcphdr),
-				       niph->saddr, niph->daddr,
-				       csum_partial(tcph,
-						    sizeof(struct tcphdr), 0));
+	tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+				    niph->daddr, 0);
+	nskb->ip_summed = CHECKSUM_PARTIAL;
+	nskb->csum_start = (unsigned char *)tcph - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
 
 	addr_type = RTN_UNSPEC;
 	if (hook != NF_INET_FORWARD
@@ -115,7 +116,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 		goto free_nskb;
 
 	niph->ttl	= dst_metric(skb_dst(nskb), RTAX_HOPLIMIT);
-	nskb->ip_summed = CHECKSUM_NONE;
 
 	/* "Never happens" */
 	if (nskb->len > dst_mtu(skb_dst(nskb)))

^ permalink raw reply related

* [net-next-2.6 PATCH] igb: drop support for UDP hashing w/ RSS
From: Jeff Kirsher @ 2010-07-02  6:01 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, bphilips, Alexander Duyck, Jeff Kirsher

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change removes UDP from the supported protocols for RSS hashing.  The
reason for removing this protocol is because IP fragmentation was causing a
network flow to be broken into two streams, one for fragmented, and one for
non-fragmented and this in turn was causing out-of-order issues.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb_main.c |   18 ++++++++++--------
 1 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 9cb04e2..9465617 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -2717,14 +2717,16 @@ static void igb_setup_mrqc(struct igb_adapter *adapter)
 	}
 	igb_vmm_control(adapter);
 
-	mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 |
-		 E1000_MRQC_RSS_FIELD_IPV4_TCP);
-	mrqc |= (E1000_MRQC_RSS_FIELD_IPV6 |
-		 E1000_MRQC_RSS_FIELD_IPV6_TCP);
-	mrqc |= (E1000_MRQC_RSS_FIELD_IPV4_UDP |
-		 E1000_MRQC_RSS_FIELD_IPV6_UDP);
-	mrqc |= (E1000_MRQC_RSS_FIELD_IPV6_UDP_EX |
-		 E1000_MRQC_RSS_FIELD_IPV6_TCP_EX);
+	/*
+	 * Generate RSS hash based on TCP port numbers and/or
+	 * IPv4/v6 src and dst addresses since UDP cannot be
+	 * hashed reliably due to IP fragmentation
+	 */
+	mrqc |= E1000_MRQC_RSS_FIELD_IPV4 |
+		E1000_MRQC_RSS_FIELD_IPV4_TCP |
+		E1000_MRQC_RSS_FIELD_IPV6 |
+		E1000_MRQC_RSS_FIELD_IPV6_TCP |
+		E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
 
 	wr32(E1000_MRQC, mrqc);
 }


^ permalink raw reply related

* Re: [net-next-2.6 PATCH] ixgbe: use NETIF_F_LRO
From: David Miller @ 2010-07-02  5:50 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, gospo, bphilips, sgruszka, donald.c.skidmore
In-Reply-To: <20100701235811.16456.66115.stgit@localhost.localdomain>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 01 Jul 2010 16:58:25 -0700

> From: Stanislaw Gruszka <sgruszka@redhat.com>
> 
> Both ETH_FLAG_LRO and NETIF_F_LRO have the same value, but NETIF_F_LRO
> is intended to use with netdev->features.
> 
> Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
> Acked-by: Don Skidmore <donald.c.skidmore@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied.

^ permalink raw reply

* Re: [net-next-2.6 PATCH 5/5] igb: Add comment
From: David Miller @ 2010-07-02  5:50 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, gospo, bphilips, gregory.v.rose
In-Reply-To: <20100701233921.16171.58140.stgit@localhost.localdomain>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 01 Jul 2010 16:39:23 -0700

> From: Greg Rose <gregory.v.rose@intel.com>
> 
> Add explanatory comment to avoid confusion when a pointer is set
> to the second word of an array instead of the customary cast of a
> pointer to the beginning of the array.
> 
> Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied.

^ permalink raw reply

* Re: [net-next-2.6 PATCH 4/5] igb: correct link test not being run when link is down
From: David Miller @ 2010-07-02  5:50 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, gospo, bphilips, alexander.h.duyck
In-Reply-To: <20100701233859.16171.45966.stgit@localhost.localdomain>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 01 Jul 2010 16:39:01 -0700

> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> The igb online link test was always reporting pass because instead of
> checking for if_running it was checking for netif_carrier_ok.
> 
> This change corrects the test so that it is run if the interface is running
> instead of checking for netif carrier ok.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Tested-by: Emil Tantilov <emil.s.tantilov@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied.

^ permalink raw reply

* Re: [net-next-2.6 PATCH 3/5] igb: Fix Tx hangs seen when loading igb with max_vfs > 7.
From: David Miller @ 2010-07-02  5:50 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, gospo, bphilips, emil.s.tantilov
In-Reply-To: <20100701233838.16171.90269.stgit@localhost.localdomain>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 01 Jul 2010 16:38:40 -0700

> From: Emil Tantilov <emil.s.tantilov@intel.com>
> 
> Check the value of max_vfs at the time of assignment of vfs_allocated_count.
> 
> The previous check in igb_probe_vfs was too late as by that time the rx/tx
> rings were initialized with the wrong offset.
> 
> Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
> Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied.

^ permalink raw reply

* Re: [net-next-2.6 PATCH 2/5] igb: Use only a single Tx queue in SR-IOV mode
From: David Miller @ 2010-07-02  5:50 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, gospo, bphilips, stable, gregory.v.rose
In-Reply-To: <20100701233814.16171.78454.stgit@localhost.localdomain>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 01 Jul 2010 16:38:16 -0700

> From: Greg Rose <gregory.v.rose@intel.com>
> 
> The 82576 expects the second rx queue in any pool to receive L2 switch
> loop back packets sent from the second tx queue in another pool.  The
> 82576 VF driver does not enable the second rx queue so if the PF driver
> sends packets destined to a VF from its second tx queue then the VF
> driver will never see them.  In SR-IOV mode limit the number of tx queues
> used by the PF driver to one. This patch fixes a bug reported in which
> the PF cannot communciate with the VF and should be considered for 2.6.34
> stable.
> 
> CC: stable@kernel.org
> Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
> Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied.

^ permalink raw reply

* Re: [net-next-2.6 PATCH 1/5] igb: fix PHY config access on 82580
From: David Miller @ 2010-07-02  5:49 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, gospo, bphilips, nicholasx.d.nunley
In-Reply-To: <20100701233733.16171.4629.stgit@localhost.localdomain>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 01 Jul 2010 16:37:54 -0700

> From: Nick Nunley <nicholasx.d.nunley@intel.com>
> 
> 82580 NICs can have up to 4 functions. This fixes phy accesses
> to use the correct locks for functions 2 and 3.
> 
> Signed-off-by: Nicholas Nunley <nicholasx.d.nunley@intel.com>
> Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied.

^ permalink raw reply

* Re: [net-next-2.6 PATCH] x86: Drop CONFIG_MCORE2 check around setting of NET_IP_ALIGN
From: David Miller @ 2010-07-02  5:49 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: netdev, gospo, bphilips, ak, tglx, mingo, hpa, x86,
	alexander.h.duyck
In-Reply-To: <20100701232742.15934.49030.stgit@localhost.localdomain>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 01 Jul 2010 16:28:27 -0700

> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch removes the CONFIG_MCORE2 check from around NET_IP_ALIGN.  It is
> based on a suggestion from Andi Kleen.  The assumption is that there are
> not any x86 cores where unaligned access is really slow, and this change
> would allow for a performance improvement to still exist on configurations
> that are not necessarily optimized for Core 2.
> 
> Cc: Andi Kleen <ak@linux.intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: x86@kernel.org
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied, with HPA's ack.

^ permalink raw reply

* Re: [PATCH net-next-2.6] be2net: changes to properly provide phy details
From: David Miller @ 2010-07-02  5:48 UTC (permalink / raw)
  To: ajitk; +Cc: netdev
In-Reply-To: <20100701135049.GA4518@serverengines.com>

From: Ajit Khaparde <ajitk@serverengines.com>
Date: Thu, 1 Jul 2010 19:21:00 +0530

> be2net driver is currently not showing correct phy details in certain cases.
> This patch fixes it.
> 
> Signed-off-by: Ajit Khaparde <ajitk@serverengines.com>

Applied to net-next-2.6

^ permalink raw reply

* Re: [PATCH] ll_temac: add error checking to DMA init path
From: David Miller @ 2010-07-02  5:48 UTC (permalink / raw)
  To: dkirjanov; +Cc: john.linn, brian.hill, netdev
In-Reply-To: <20100701093905.GA29132@hera.kernel.org>

From: Denis Kirjanov <dkirjanov@hera.kernel.org>
Date: Thu, 1 Jul 2010 09:39:05 +0000

> Add error checking to DMA descriptor rings initialization code.
> 
> Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>

Applied to net-next-2.6

^ permalink raw reply

* Re: [PATCH 1/1] ehea: Allocate stats buffer with GFP_KERNEL
From: David Miller @ 2010-07-02  5:48 UTC (permalink / raw)
  To: brking; +Cc: ossthema, osstklei, raisch, netdev
In-Reply-To: <201006302159.o5ULx8ow025348@d01av03.pok.ibm.com>

From: Brian King <brking@linux.vnet.ibm.com>
Date: Wed, 30 Jun 2010 16:59:12 -0500

> 
> Since ehea_get_stats calls ehea_h_query_ehea_port, which
> can sleep, we can also sleep when allocating a page in
> this function. This fixes some memory allocation failure
> warnings seen under low memory conditions.
> 
> Signed-off-by: Brian King <brking@linux.vnet.ibm.com>

Applied to net-next-2.6

^ permalink raw reply

* Re: Fwd: Possible bug in net/ipv4/route.c?
From: Eric Dumazet @ 2010-07-02  5:38 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki; +Cc: netdev@vger.kernel.org, linux-kernel
In-Reply-To: <4C2D53D3.6050106@linux-ipv6.org>

Le vendredi 02 juillet 2010 à 11:49 +0900, YOSHIFUJI Hideaki a écrit :
> Switch to netdev.
> 

thanks ;)

> --yoshfuji
> 
> -------- Original Message --------
> Subject: Possible bug in net/ipv4/route.c?
> Date: Thu, 1 Jul 2010 16:00:29 -0700
> From: Sol Kavy <skavy@ubicom.com>
> To: <linux-kernel@vger.kernel.org>
> CC: Greg Ren <gren@ubicom.com>, Guojun Jin <gjin@ubicom.com>,        Murat Sezgin <msezgin@ubicom.com>, Sener Ilgen <silgen@ubicom.com>
> 
> Found Linux: 2.6.28
> Arch: Ubicom32 <not yet pushed>
> Project: uCLinux based Router
> Test: Bit torrent Stress Test
> 
> Note: The top of Linus git net/ipv4/route.c appears to have the same issue.
> 

Please use < 72 char lines

> The following is a patch for clearing out IP options area in an input

>  skb during link failure processing.  Without this patch, the

>  icmp_send() can result in a call to ip_options_echo() where the 

> common buffer area of the skb is incorrectly interpreted.  Depending on the previous use of the skb->cb[], the interpreted option length values can cause stack corruption by copying more than 40 bytes to the output options.
> 
> In our case, a driver is using the skb->cb[] area to hold driver
> specific data.   The driver is not zeroing out the area after use.  I
> can see three basic solutions:
> 
> 1) Drivers are not allowed to use the skb->cb[] area at all.  Ubicom
> should modify the driver to use a different approach.
> 
> 2) The layer using skb->cb[] should clear this area after use and
> before handing the skb to another layer.  Ubicom should modify the
> driver to clear the skb->cb[] area before sending it up the line.
> 

This is the right option. If you use one word in cb[], only your driver
knows how to clear it efficiently.

> 3) Any layer that "uses" the skb->cb[] area must clear the area before
> use.  In which case, the proposed patch would fix the problem for the
> ipv4_link_failure().  I believe that this is the correct fix because I
> see ip_rcv() clears the skb->cb[] before using it.
> 

No : ip_rcv clears() skb->cb when leaving ip_rcv, not entering.

skb allocation clears whole cb[], and each layer is responsible to clear
the part it eventually dirtied.

> Can someone confirm that this is the appropriate fix?  If this is
> documented somewhere, please direct me to the documentation.
> 
> Please send email to sol@ubicom.com in addition to posting your
> response.
> 
> Thanks,
> 
> Sol Kavy/Murat Sezgin
> Ubicom, Inc.
> 
> Patch:  
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 125ee64..d13805f 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1606,6 +1606,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
> {
>         struct rtable *rt;
> 
> +       /*
> +         * Since link failure can be called with skbs from many layers (see arp)
> +         * the cb area of the skb must be cleared before use. Because the cb area 
> +         * can be formatted according to the caller layer's cb area format and it may cause
> +         * corruptions when it is handled in a different network layer.
> +         */
> +       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
>         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
>         rt = skb->rtable;
> 
> The packet is enqueud by:
> do_IRQ()->do_softirq()->__do_softirq()->net_rx_action()->ubi32_eth_napi_poll()->ubi32_eth_receive()->__vlan_hwaccel_rx()->netif_receive_skb()->br_handle_frame()->nf_hook_slow()->br_nf_pre_routing_finish()->br_nfr_pre_routing_finish_bridge()->neight_resolve_output()->__neigh_event_send().
> 
> The packet is then dequeued by: 
> do_IRQ() -> irq_exit() -> do_softirq() -> run_timer_softirq() -> neigh_timer_handler() -> arp_error_report() -> ipv4_link_failure() -> icmp_send() -> ip_options_echo().
> 
> Because the Ubicom Ethernet driver overwrites the common buffer area, the enqueued packet contains garbage when casted as an IP options data structure.  This results in ip_options_echo() miss reading the option length information and overwriting memory.  By clearing the skb->cb[] before processing the icmp_send() against the packet, we ensure that ip_options_echo() does not corrupt memory.   
> 
> 
> 



^ permalink raw reply

* [PATCH v3] netfilter: xtables target SYNPROXY
From: Changli Gao @ 2010-07-02  4:19 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, Alexey Kuznetsov, Jan Engelhardt,
	Jozsef Kadlecsik, Pekka Savola (ipv6), James Morris,
	Hideaki YOSHIFUJI, netfilter-devel, netdev, Changli Gao

v3:
fix the bug it can't work with bridge.

netfilter: xtables target SYNPROXY.

This patch implements an xtables target SYNPROXY. As the connection to the
TCP server won't be established until the ACK from the client is received, it
can protect the TCP server from the SYN-flood attacks.

It works in the raw table of the PREROUTING chain, before conntracking system.
Syncookies is used, so no new state is introduced into the conntracking system.
In fact, until the first connection is established, conntracking system doesn't
see any packets. So when there is a SYN-flood attack, conntracking system won't
be busy on finding and deleting the un-assured ct.

As the SYN-packet of the second connection request is sent locally, the DNAT
rules which are in the PREROUTING chain should be moved to the OUTPUT chain.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 include/net/netfilter/nf_conntrack.h        |   10 
 include/net/netfilter/nf_conntrack_core.h   |   21 
 include/net/netfilter/nf_conntrack_extend.h |    2 
 include/net/tcp.h                           |    7 
 net/ipv4/syncookies.c                       |   22 
 net/ipv4/tcp_ipv4.c                         |    9 
 net/netfilter/Kconfig                       |   17 
 net/netfilter/Makefile                      |    1 
 net/netfilter/nf_conntrack_core.c           |   45 +
 net/netfilter/xt_SYNPROXY.c                 |  679 ++++++++++++++++++++++++++++
 10 files changed, 794 insertions(+), 19 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index e624dae..5e6d8e4 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -311,5 +311,15 @@ do {							\
 #define MODULE_ALIAS_NFCT_HELPER(helper) \
         MODULE_ALIAS("nfct-helper-" helper)
 
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+extern unsigned int (*syn_proxy_pre_hook)(struct sk_buff *skb,
+					  struct nf_conn *ct,
+					  enum ip_conntrack_info ctinfo);
+
+extern unsigned int (*syn_proxy_post_hook)(struct sk_buff *skb,
+					   struct nf_conn *ct,
+					   enum ip_conntrack_info ctinfo);
+#endif
 #endif /* __KERNEL__ */
 #endif /* _NF_CONNTRACK_H */
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index aced085..637b404 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -54,6 +54,23 @@ nf_conntrack_find_get(struct net *net, u16 zone,
 
 extern int __nf_conntrack_confirm(struct sk_buff *skb);
 
+static inline unsigned int syn_proxy_post_call(struct sk_buff *skb,
+					       struct nf_conn *ct,
+					       enum ip_conntrack_info ctinfo)
+{
+	unsigned int ret = NF_ACCEPT;
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+	unsigned int (*syn_proxy)(struct sk_buff *, struct nf_conn *,
+				  enum ip_conntrack_info);
+	syn_proxy = rcu_dereference(syn_proxy_post_hook);
+	if (syn_proxy)
+		ret = syn_proxy(skb, ct, ctinfo);
+#endif
+
+	return ret;
+}
+
 /* Confirm a connection: returns NF_DROP if packet must be dropped. */
 static inline int nf_conntrack_confirm(struct sk_buff *skb)
 {
@@ -63,8 +80,10 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
 	if (ct && !nf_ct_is_untracked(ct)) {
 		if (!nf_ct_is_confirmed(ct))
 			ret = __nf_conntrack_confirm(skb);
-		if (likely(ret == NF_ACCEPT))
+		if (likely(ret == NF_ACCEPT)) {
 			nf_ct_deliver_cached_events(ct);
+			ret = syn_proxy_post_call(skb, ct, skb->nfctinfo);
+		}
 	}
 	return ret;
 }
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 32d15bd..b2ae7e9 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -11,6 +11,7 @@ enum nf_ct_ext_id {
 	NF_CT_EXT_ACCT,
 	NF_CT_EXT_ECACHE,
 	NF_CT_EXT_ZONE,
+	NF_CT_EXT_SYNPROXY,
 	NF_CT_EXT_NUM,
 };
 
@@ -19,6 +20,7 @@ enum nf_ct_ext_id {
 #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter
 #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
 #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone
+#define NF_CT_EXT_SYNPROXY_TYPE struct syn_proxy_state
 
 /* Extensions: optional stuff which isn't permanently in struct. */
 struct nf_ct_ext {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c2f96c2..06f28d3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -460,8 +460,11 @@ extern int			tcp_disconnect(struct sock *sk, int flags);
 extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
 extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, 
 				    struct ip_options *opt);
-extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, 
-				     __u16 *mss);
+extern __u32 __cookie_v4_init_sequence(__be32 saddr, __be32 daddr,
+				       __be16 sport, __be16 dport, __u32 seq,
+				       __u16 *mssp);
+extern int cookie_v4_check_sequence(const struct iphdr *iph,
+				    const struct tcphdr *th, __u32 cookie);
 
 extern __u32 cookie_init_timestamp(struct request_sock *req);
 extern bool cookie_check_timestamp(struct tcp_options_received *opt, bool *);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650cace..3adcba3 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -159,26 +159,21 @@ static __u16 const msstab[] = {
  * Generate a syncookie.  mssp points to the mss, which is returned
  * rounded down to the value encoded in the cookie.
  */
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+__u32 __cookie_v4_init_sequence(__be32 saddr, __be32 daddr, __be16 sport,
+				__be16 dport, __u32 seq, __u16 *mssp)
 {
-	const struct iphdr *iph = ip_hdr(skb);
-	const struct tcphdr *th = tcp_hdr(skb);
 	int mssind;
 	const __u16 mss = *mssp;
 
-	tcp_synq_overflow(sk);
-
 	for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
 		if (mss >= msstab[mssind])
 			break;
 	*mssp = msstab[mssind];
 
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
-	return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
-				     th->source, th->dest, ntohl(th->seq),
+	return secure_tcp_syn_cookie(saddr, daddr, sport, dport, seq,
 				     jiffies / (HZ * 60), mssind);
 }
+EXPORT_SYMBOL(__cookie_v4_init_sequence);
 
 /*
  * This (misnamed) value is the age of syncookie which is permitted.
@@ -191,10 +186,9 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
  * Check if a ack sequence number is a valid syncookie.
  * Return the decoded mss if it is, or 0 if not.
  */
-static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+int cookie_v4_check_sequence(const struct iphdr *iph, const struct tcphdr *th,
+			     __u32 cookie)
 {
-	const struct iphdr *iph = ip_hdr(skb);
-	const struct tcphdr *th = tcp_hdr(skb);
 	__u32 seq = ntohl(th->seq) - 1;
 	__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
 					    th->source, th->dest, seq,
@@ -203,6 +197,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
 
 	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
 }
+EXPORT_SYMBOL(cookie_v4_check_sequence);
 
 static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 					   struct request_sock *req,
@@ -282,7 +277,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 		goto out;
 
 	if (tcp_synq_no_recent_overflow(sk) ||
-	    (mss = cookie_check(skb, cookie)) == 0) {
+	    (mss = cookie_v4_check_sequence(ip_hdr(skb), tcp_hdr(skb),
+					    cookie)) == 0) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
 		goto out;
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8fa32f5..3b094c7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1332,7 +1332,14 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		TCP_ECN_create_request(req, tcp_hdr(skb));
 
 	if (want_cookie) {
-		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+		struct tcphdr *th;
+
+		tcp_synq_overflow(sk);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+		th = tcp_hdr(skb);
+		isn = __cookie_v4_init_sequence(saddr, daddr, th->source,
+						th->dest, ntohl(th->seq),
+						&req->mss);
 		req->cookie_ts = tmp_opt.tstamp_ok;
 	} else if (!isn) {
 		struct inet_peer *peer = NULL;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 413ed24..fd8ad8c 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -560,6 +560,23 @@ config NETFILTER_XT_TARGET_SECMARK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_TARGET_SYNPROXY
+	tristate '"SYNPROXY" target support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL
+	depends on SYN_COOKIES
+	depends on IP_NF_RAW
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	help
+	  The SYNPROXY target allows a raw rule to specify that some TCP
+	  connections are relayed to protect the TCP servers from the SYN-flood
+	  DoS attacks. Syn cookies is used to save the initial state, so no
+	  conntrack is needed until the client side connection is established.
+	  It frees the connection tracking system from creating/deleting
+	  conntracks when SYN-flood DoS attack acts.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_TCPMSS
 	tristate '"TCPMSS" target support'
 	depends on (IPV6 || IPV6=n)
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index e28420a..4e32834 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) += xt_SYNPROXY.o
 
 # matches
 obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 16b41b4..dd85d6f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -800,6 +800,26 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	return ct;
 }
 
+static inline unsigned int syn_proxy_pre_call(int protonum, struct sk_buff *skb,
+					      struct nf_conn *ct,
+					      enum ip_conntrack_info ctinfo)
+{
+	unsigned int ret = NF_ACCEPT;
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+	unsigned int (*syn_proxy)(struct sk_buff *, struct nf_conn *,
+				  enum ip_conntrack_info);
+
+	if (protonum == IPPROTO_TCP) {
+		syn_proxy = rcu_dereference(syn_proxy_pre_hook);
+		if (syn_proxy)
+			ret = syn_proxy(skb, ct, ctinfo);
+	}
+#endif
+
+	return ret;
+}
+
 unsigned int
 nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 		struct sk_buff *skb)
@@ -855,8 +875,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 			       l3proto, l4proto, &set_reply, &ctinfo);
 	if (!ct) {
 		/* Not valid part of a connection */
-		NF_CT_STAT_INC_ATOMIC(net, invalid);
-		ret = NF_ACCEPT;
+		ret = syn_proxy_pre_call(protonum, skb, NULL, ctinfo);
+		if (ret == NF_ACCEPT)
+			NF_CT_STAT_INC_ATOMIC(net, invalid);
 		goto out;
 	}
 
@@ -869,6 +890,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 
 	NF_CT_ASSERT(skb->nfct);
 
+	ret = syn_proxy_pre_call(protonum, skb, ct, ctinfo);
+	if (ret != NF_ACCEPT)
+		goto out;
 	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
 	if (ret <= 0) {
 		/* Invalid: inverse of the return code tells
@@ -1476,6 +1500,17 @@ s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
 			u32 seq);
 EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
 
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+unsigned int (*syn_proxy_pre_hook)(struct sk_buff *skb, struct nf_conn *ct,
+				   enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL(syn_proxy_pre_hook);
+
+unsigned int (*syn_proxy_post_hook)(struct sk_buff *skb, struct nf_conn *ct,
+				    enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL(syn_proxy_post_hook);
+#endif
+
 int nf_conntrack_init(struct net *net)
 {
 	int ret;
@@ -1496,6 +1531,12 @@ int nf_conntrack_init(struct net *net)
 
 		/* Howto get NAT offsets */
 		rcu_assign_pointer(nf_ct_nat_offset, NULL);
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+		rcu_assign_pointer(syn_proxy_pre_hook, NULL);
+		rcu_assign_pointer(syn_proxy_post_hook, NULL);
+#endif
 	}
 	return 0;
 
diff --git a/net/netfilter/xt_SYNPROXY.c b/net/netfilter/xt_SYNPROXY.c
new file mode 100644
index 0000000..1a55f33
--- /dev/null
+++ b/net/netfilter/xt_SYNPROXY.c
@@ -0,0 +1,679 @@
+/* (C) 2010- Changli Gao <xiaosuo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * It bases on ipt_REJECT.c
+ */
+#define pr_fmt(fmt) "SYNPROXY: " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/unaligned/access_ok.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Changli Gao <xiaosuo@gmail.com>");
+MODULE_DESCRIPTION("Xtables: \"SYNPROXY\" target for IPv4");
+MODULE_ALIAS("ipt_SYNPROXY");
+
+enum {
+	TCP_SEND_FLAG_NOTRACE	= 0x1,
+	TCP_SEND_FLAG_SYNCOOKIE	= 0x2,
+	TCP_SEND_FLAG_ACK2SYN	= 0x4,
+};
+
+struct syn_proxy_state {
+	u16	seq_inited;
+	__be16	window;
+	u32	seq_diff;
+};
+
+static int get_mtu(const struct dst_entry *dst)
+{
+	int mtu;
+
+	mtu = dst_mtu(dst);
+	if (mtu)
+		return mtu;
+
+	return dst->dev ? dst->dev->mtu : 0;
+}
+
+static int get_advmss(const struct dst_entry *dst)
+{
+	int advmss;
+
+	advmss = dst_metric(dst, RTAX_ADVMSS);
+	if (advmss)
+		return advmss;
+	advmss = get_mtu(dst);
+	if (advmss)
+		return advmss - (sizeof(struct iphdr) + sizeof(struct tcphdr));
+
+	return TCP_MSS_DEFAULT;
+}
+
+static int syn_proxy_route(struct sk_buff *skb, struct net *net, u16 *pmss)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt;
+	struct flowi fl = {};
+	unsigned int type;
+	int flags = 0;
+	int err;
+	u16 mss;
+
+	type = inet_addr_type(net, iph->saddr);
+	if (type != RTN_LOCAL) {
+		type = inet_addr_type(net, iph->daddr);
+		if (type == RTN_LOCAL)
+			flags |= FLOWI_FLAG_ANYSRC;
+	}
+
+	if (type == RTN_LOCAL) {
+		fl.nl_u.ip4_u.daddr = iph->daddr;
+		fl.nl_u.ip4_u.saddr = iph->saddr;
+		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+		fl.flags = flags;
+		err = ip_route_output_key(net, &rt, &fl);
+		if (err)
+			goto out;
+
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* non-local src, find valid iif to satisfy
+		 * rp-filter when calling ip_route_input. */
+		fl.nl_u.ip4_u.daddr = iph->saddr;
+		err = ip_route_output_key(net, &rt, &fl);
+		if (err)
+			goto out;
+
+		err = ip_route_input(skb, iph->daddr, iph->saddr,
+				     RT_TOS(iph->tos), rt->dst.dev);
+		if (err) {
+			dst_release(&rt->dst);
+			goto out;
+		}
+		if (pmss) {
+			mss = get_advmss(&rt->dst);
+			if (*pmss > mss)
+				*pmss = mss;
+		}
+		dst_release(&rt->dst);
+	}
+
+	err = skb_dst(skb)->error;
+	if (!err && pmss) {
+		mss = get_advmss(skb_dst(skb));
+		if (*pmss > mss)
+			*pmss = mss;
+	}
+
+out:
+	return err;
+}
+
+static int tcp_send(__be32 src, __be32 dst, __be16 sport, __be16 dport,
+		    u32 seq, u32 ack_seq, __be16 window, u16 mss, u8 tcp_flags,
+		    u8 tos, struct net_device *dev, int flags,
+		    struct sk_buff *oskb)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+	struct tcphdr *th;
+	int err, len;
+
+	len = sizeof(*th);
+	if (mss)
+		len += TCPOLEN_MSS;
+
+	skb = NULL;
+	/* caller must give me a large enough oskb */
+	if (oskb) {
+		unsigned char *odata = oskb->data;
+
+		if (skb_recycle_check(oskb, 0)) {
+			oskb->data = odata;
+			skb_reset_tail_pointer(oskb);
+			skb = oskb;
+			pr_debug("recycle skb\n");
+		}
+	}
+	if (!skb) {
+		skb = alloc_skb(LL_MAX_HEADER + sizeof(*iph) + len, GFP_ATOMIC);
+		if (!skb) {
+			err = -ENOMEM;
+			goto out;
+		}
+		skb_reserve(skb, LL_MAX_HEADER);
+	}
+
+	skb_reset_network_header(skb);
+	if (!(flags & TCP_SEND_FLAG_ACK2SYN) || skb != oskb) {
+		iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
+		iph->version	= 4;
+		iph->ihl	= sizeof(*iph) / 4;
+		iph->tos	= tos;
+		/* tot_len is set in ip_local_out() */
+		iph->id		= 0;
+		iph->frag_off	= htons(IP_DF);
+		iph->protocol	= IPPROTO_TCP;
+		iph->saddr	= src;
+		iph->daddr	= dst;
+		th = (struct tcphdr *)skb_put(skb, len);
+		th->source	= sport;
+		th->dest	= dport;
+	} else {
+		iph = (struct iphdr *)skb->data;
+		iph->id		= 0;
+		iph->frag_off	= htons(IP_DF);
+		skb_put(skb, iph->ihl * 4 + len);
+		th = (struct tcphdr *)(skb->data + iph->ihl * 4);
+	}
+
+	th->seq		= htonl(seq);
+	th->ack_seq	= htonl(ack_seq);
+	tcp_flag_byte(th) = tcp_flags;
+	th->doff	= len / 4;
+	th->window	= window;
+	th->urg_ptr	= 0;
+
+	skb->protocol = htons(ETH_P_IP);
+	if ((flags & TCP_SEND_FLAG_SYNCOOKIE) && mss)
+		err = syn_proxy_route(skb, dev_net(dev), &mss);
+	else
+		err = syn_proxy_route(skb, dev_net(dev), NULL);
+	if (err)
+		goto err_out;
+
+	if ((flags & TCP_SEND_FLAG_SYNCOOKIE)) {
+		if (mss) {
+			th->seq = htonl(__cookie_v4_init_sequence(dst, src,
+								  dport, sport,
+								  ack_seq - 1,
+								  &mss));
+		} else {
+			mss = TCP_MSS_DEFAULT;
+			th->seq = htonl(__cookie_v4_init_sequence(dst, src,
+								  dport, sport,
+								  ack_seq - 1,
+								  &mss));
+			mss = 0;
+		}
+	}
+
+	if (mss)
+		* (__force __be32 *)(th + 1) = htonl((TCPOPT_MSS << 24) |
+						     (TCPOLEN_MSS << 16) |
+						     mss);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	th->check = ~tcp_v4_check(len, src, dst, 0);
+	skb->csum_start = (unsigned char *)th - skb->head;
+	skb->csum_offset = offsetof(struct tcphdr, check);
+
+	if (!(flags & TCP_SEND_FLAG_ACK2SYN) || skb != oskb)
+		iph->ttl	= dst_metric(skb_dst(skb), RTAX_HOPLIMIT);
+
+	if (skb->len > get_mtu(skb_dst(skb))) {
+		if (printk_ratelimit())
+			pr_warning("%s has smaller mtu: %d\n",
+				   skb_dst(skb)->dev->name,
+				   get_mtu(skb_dst(skb)));
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if ((flags & TCP_SEND_FLAG_NOTRACE)) {
+		skb->nfct = &nf_ct_untracked_get()->ct_general;
+		skb->nfctinfo = IP_CT_NEW;
+		nf_conntrack_get(skb->nfct);
+	}
+
+	pr_debug("ip_local_out: %pI4n:%hu -> %pI4n:%hu (seq=%u, "
+		 "ack_seq=%u mss=%hu flags=%hhx)\n", &src, ntohs(th->source),
+		 &dst, ntohs(th->dest), ntohl(th->seq), ack_seq, mss,
+		 tcp_flags);
+
+	err = ip_local_out(skb);
+	if (err > 0)
+		err = net_xmit_errno(err);
+
+	pr_debug("ip_local_out: return with %d\n", err);
+out:
+	if (oskb && oskb != skb)
+		kfree_skb(oskb);
+
+	return err;
+
+err_out:
+	kfree_skb(skb);
+	goto out;
+}
+
+static int get_mss(u8 *data, int len)
+{
+	u8 olen;
+
+	while (len >= TCPOLEN_MSS) {
+		switch (data[0]) {
+		case TCPOPT_EOL:
+			return 0;
+		case TCPOPT_NOP:
+			data++;
+			len--;
+			break;
+		case TCPOPT_MSS:
+			if (data[1] != TCPOLEN_MSS)
+				return -EINVAL;
+			return get_unaligned_be16(data + 2);
+		default:
+			olen = data[1];
+			if (olen < 2 || olen > len)
+				return -EINVAL;
+			data += olen;
+			len -= olen;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static DEFINE_PER_CPU(struct syn_proxy_state, syn_proxy_state);
+
+/* syn_proxy_pre isn't under the protection of nf_conntrack_proto_tcp.c */
+static unsigned int syn_proxy_pre(struct sk_buff *skb, struct nf_conn *ct,
+				  enum ip_conntrack_info ctinfo)
+{
+	struct syn_proxy_state *state;
+	struct iphdr *iph;
+	struct tcphdr *th, _th;
+
+	/* only support IPv4 now */
+	iph = ip_hdr(skb);
+	if (iph->version != 4)
+		return NF_ACCEPT;
+
+	th = skb_header_pointer(skb, iph->ihl * 4, sizeof(_th), &_th);
+	if (th == NULL)
+		return NF_DROP;
+
+	if (!ct || !nf_ct_is_confirmed(ct)) {
+		int ret;
+
+		if (!th->syn && th->ack) {
+			u16 mss;
+			struct sk_buff *rec_skb;
+
+			mss = cookie_v4_check_sequence(iph, th,
+						       ntohl(th->ack_seq) - 1);
+			if (!mss)
+				return NF_ACCEPT;
+
+			pr_debug("%pI4n:%hu -> %pI4n:%hu(mss=%hu)\n",
+				 &iph->saddr, ntohs(th->source),
+				 &iph->daddr, ntohs(th->dest), mss);
+
+			if (skb_tailroom(skb) < TCPOLEN_MSS &&
+			    skb->len < iph->ihl * 4 + sizeof(*th) + TCPOLEN_MSS)
+				rec_skb = NULL;
+			else
+				rec_skb = skb;
+
+			local_bh_disable();
+			state = &__get_cpu_var(syn_proxy_state);
+			state->seq_inited = 1;
+			state->window = th->window;
+			state->seq_diff = ntohl(th->ack_seq) - 1;
+			if (rec_skb)
+				tcp_send(iph->saddr, iph->daddr, 0, 0,
+					 ntohl(th->seq) - 1, 0, th->window,
+					 mss, TCPHDR_SYN, 0, skb->dev,
+					 TCP_SEND_FLAG_ACK2SYN, rec_skb);
+			else
+				tcp_send(iph->saddr, iph->daddr, th->source,
+					 th->dest, ntohl(th->seq) - 1, 0,
+					 th->window, mss, TCPHDR_SYN,
+					 iph->tos, skb->dev, 0, NULL);
+			state->seq_inited = 0;
+			local_bh_enable();
+
+			if (!rec_skb)
+				kfree_skb(skb);
+
+			return NF_STOLEN;
+		}
+
+		if (!ct || !th->syn || th->ack)
+			return NF_ACCEPT;
+
+		ret = NF_ACCEPT;
+		local_bh_disable();
+		state = &__get_cpu_var(syn_proxy_state);
+		if (state->seq_inited) {
+			struct syn_proxy_state *nstate;
+
+			nstate = nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY,
+					       GFP_ATOMIC);
+			if (nstate != NULL) {
+				nstate->seq_inited = 0;
+				nstate->window = state->window;
+				nstate->seq_diff = state->seq_diff;
+				pr_debug("seq_diff: %u\n", nstate->seq_diff);
+			} else {
+				ret = NF_DROP;
+			}
+		}
+		local_bh_enable();
+
+		return ret;
+	}
+
+	state = nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY);
+	if (!state)
+		return NF_ACCEPT;
+
+	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+		__be32 newack;
+
+		/* don't need to mangle duplicate SYN packets */
+		if (th->syn && !th->ack)
+			return NF_ACCEPT;
+		if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*th)))
+			return NF_DROP;
+		th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
+		newack = htonl(ntohl(th->ack_seq) - state->seq_diff);
+		inet_proto_csum_replace4(&th->check, skb, th->ack_seq, newack,
+					 0);
+		pr_debug("alter ack seq: %u -> %u\n",
+			 ntohl(th->ack_seq), ntohl(newack));
+		th->ack_seq = newack;
+	} else {
+		/* Simultaneous open ? Oh, no. The connection between
+		 * client and us is established. */
+		if (th->syn && !th->ack)
+			return NF_DROP;
+	}
+
+	return NF_ACCEPT;
+}
+
+static unsigned int syn_proxy_mangle_pkt(struct sk_buff *skb, struct iphdr *iph,
+					 struct tcphdr *th, u32 seq_diff)
+{
+	__be32 new;
+	int olen;
+
+	if (skb->len < (iph->ihl + th->doff) * 4)
+		return NF_DROP;
+	if (!skb_make_writable(skb, (iph->ihl + th->doff) * 4))
+		return NF_DROP;
+	iph = (struct iphdr *)(skb->data);
+	th = (struct tcphdr *)(skb->data + iph->ihl * 4);
+
+	new = tcp_flag_word(th) & (~TCP_FLAG_SYN);
+	inet_proto_csum_replace4(&th->check, skb, tcp_flag_word(th), new, 0);
+	tcp_flag_word(th) = new;
+
+	new = htonl(ntohl(th->seq) + seq_diff);
+	inet_proto_csum_replace4(&th->check, skb, th->seq, new, 0);
+	pr_debug("alter seq: %u -> %u\n", ntohl(th->seq), ntohl(new));
+	th->seq = new;
+
+	olen = th->doff - sizeof(*th) / 4;
+	if (olen) {
+		__be32 *opt;
+
+		opt = (__force __be32 *)(th + 1);
+#define TCPOPT_EOL_WORD ((TCPOPT_EOL << 24) + (TCPOPT_EOL << 16) + \
+			 (TCPOPT_EOL << 8) + TCPOPT_EOL)
+		inet_proto_csum_replace4(&th->check, skb, *opt, TCPOPT_EOL_WORD,
+					 0);
+		*opt = TCPOPT_EOL_WORD;
+	}
+
+	return NF_ACCEPT;
+}
+
+static unsigned int syn_proxy_post(struct sk_buff *skb, struct nf_conn *ct,
+				   enum ip_conntrack_info ctinfo)
+{
+	struct syn_proxy_state *state;
+	struct iphdr *iph;
+	struct tcphdr *th;
+
+	/* untraced packets don't have NF_CT_EXT_SYNPROXY ext, as they don't
+	 * enter syn_proxy_pre() */
+	state = nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY);
+	if (state == NULL)
+		return NF_ACCEPT;
+
+	iph = ip_hdr(skb);
+	if (!skb_make_writable(skb, iph->ihl * 4 + sizeof(*th)))
+		return NF_DROP;
+	th = (struct tcphdr *)(skb->data + iph->ihl * 4);
+	if (!state->seq_inited) {
+		if (th->syn) {
+			/* It must be from original direction, as the ones
+			 * from the other side are dropped in function
+			 * syn_proxy_pre() */
+			if (!th->ack)
+				return NF_ACCEPT;
+
+			pr_debug("SYN-ACK %pI4n:%hu -> %pI4n:%hu "
+				 "(seq=%u ack_seq=%u)\n",
+				 &iph->saddr, ntohs(th->source), &iph->daddr,
+				 ntohs(th->dest), ntohl(th->seq),
+				 ntohl(th->ack_seq));
+
+			/* SYN-ACK from reply direction with the protection
+			 * of conntrack */
+			spin_lock_bh(&ct->lock);
+			if (!state->seq_inited) {
+				state->seq_inited = 1;
+				pr_debug("update seq_diff %u -> %u\n",
+					 state->seq_diff,
+					 state->seq_diff - ntohl(th->seq));
+				state->seq_diff -= ntohl(th->seq);
+			}
+			spin_unlock_bh(&ct->lock);
+			tcp_send(iph->daddr, iph->saddr, th->dest, th->source,
+				 ntohl(th->ack_seq),
+				 ntohl(th->seq) + 1 + state->seq_diff,
+				 state->window, 0, TCPHDR_ACK, iph->tos,
+				 skb->dev, 0, NULL);
+
+			return syn_proxy_mangle_pkt(skb, iph, th,
+						    state->seq_diff + 1);
+		} else {
+			__be32 newseq;
+
+			if (!th->rst)
+				return NF_ACCEPT;
+			newseq = htonl(state->seq_diff + 1);
+			inet_proto_csum_replace4(&th->check, skb, th->seq,
+						 newseq, 0);
+			pr_debug("alter RST seq: %u -> %u\n",
+				 ntohl(th->seq), ntohl(newseq));
+			th->seq = newseq;
+
+			return NF_ACCEPT;
+		}
+	}
+
+	/* ct should be in ESTABLISHED state, but if the ack packets from
+	 * us are lost. */
+	if (th->syn) {
+		if (!th->ack)
+			return NF_ACCEPT;
+
+		tcp_send(iph->daddr, iph->saddr, th->dest, th->source,
+			 ntohl(th->ack_seq),
+			 ntohl(th->seq) + 1 + state->seq_diff,
+			 state->window, 0, TCPHDR_ACK, iph->tos,
+			 skb->dev, 0, NULL);
+
+		return syn_proxy_mangle_pkt(skb, iph, th, state->seq_diff + 1);
+	}
+
+	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+		__be32 newseq;
+
+		newseq = htonl(ntohl(th->seq) + state->seq_diff);
+		inet_proto_csum_replace4(&th->check, skb, th->seq, newseq, 0);
+		pr_debug("alter seq: %u -> %u\n", ntohl(th->seq),
+			 ntohl(newseq));
+		th->seq = newseq;
+	}
+
+	return NF_ACCEPT;
+}
+
+static unsigned int tcp_process(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	int err;
+	u16 mss;
+
+	iph = ip_hdr(skb);
+	if (iph->frag_off & htons(IP_OFFSET))
+		goto out;
+	if (!pskb_may_pull(skb, iph->ihl * 4 + sizeof(*th)))
+		goto out;
+	th = (const struct tcphdr *)(skb->data + iph->ihl * 4);
+	if ((tcp_flag_byte(th) &
+	     (TCPHDR_FIN | TCPHDR_RST | TCPHDR_ACK | TCPHDR_SYN)) != TCPHDR_SYN)
+		goto out;
+
+	if (nf_ip_checksum(skb, NF_INET_PRE_ROUTING, iph->ihl * 4, IPPROTO_TCP))
+		goto out;
+	mss = 0;
+	if (th->doff > sizeof(*th) / 4) {
+		if (!pskb_may_pull(skb, (iph->ihl + th->doff) * 4))
+			goto out;
+		err = get_mss((u8 *)(th + 1), th->doff * 4 - sizeof(*th));
+		if (err < 0)
+			goto out;
+		if (err != 0)
+			mss = err;
+	} else if (th->doff != sizeof(*th) / 4)
+		goto out;
+
+	tcp_send(iph->daddr, iph->saddr, th->dest, th->source, 0,
+		 ntohl(th->seq) + 1, 0, mss, TCPHDR_SYN | TCPHDR_ACK,
+		 iph->tos, skb->dev,
+		 TCP_SEND_FLAG_NOTRACE | TCP_SEND_FLAG_SYNCOOKIE, skb);
+
+	return NF_STOLEN;
+
+out:
+	return NF_DROP;
+}
+
+static unsigned int synproxy_tg(struct sk_buff *skb,
+				const struct xt_action_param *par)
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	int ret;
+
+	/* received from lo */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct)
+		return IPT_CONTINUE;
+
+	local_bh_disable();
+	if (!__get_cpu_var(syn_proxy_state).seq_inited)
+		ret = tcp_process(skb);
+	else
+		ret = IPT_CONTINUE;
+	local_bh_enable();
+
+	return ret;
+}
+
+static int synproxy_tg_check(const struct xt_tgchk_param *par)
+{
+	int ret;
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0)
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+
+	return ret;
+}
+
+static void synproxy_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target synproxy_tg_reg __read_mostly = {
+	.name		= "SYNPROXY",
+	.family		= NFPROTO_IPV4,
+	.target		= synproxy_tg,
+	.table		= "raw",
+	.hooks		= 1 << NF_INET_PRE_ROUTING,
+	.proto		= IPPROTO_TCP,
+	.checkentry	= synproxy_tg_check,
+	.destroy	= synproxy_tg_destroy,
+	.me		= THIS_MODULE,
+};
+
+static struct nf_ct_ext_type syn_proxy_state_ext __read_mostly = {
+	.len	= sizeof(struct syn_proxy_state),
+	.align	= __alignof__(struct syn_proxy_state),
+	.id	= NF_CT_EXT_SYNPROXY,
+};
+
+static int __init synproxy_tg_init(void)
+{
+	int err;
+
+	rcu_assign_pointer(syn_proxy_pre_hook, syn_proxy_pre);
+	rcu_assign_pointer(syn_proxy_post_hook, syn_proxy_post);
+	err = nf_ct_extend_register(&syn_proxy_state_ext);
+	if (err)
+		goto err_out;
+	err = xt_register_target(&synproxy_tg_reg);
+	if (err)
+		goto err_out2;
+
+	return err;
+
+err_out2:
+	nf_ct_extend_unregister(&syn_proxy_state_ext);
+err_out:
+	rcu_assign_pointer(syn_proxy_post_hook, NULL);
+	rcu_assign_pointer(syn_proxy_pre_hook, NULL);
+	rcu_barrier();
+
+	return err;
+}
+
+static void __exit synproxy_tg_exit(void)
+{
+	xt_unregister_target(&synproxy_tg_reg);
+	nf_ct_extend_unregister(&syn_proxy_state_ext);
+	rcu_assign_pointer(syn_proxy_post_hook, NULL);
+	rcu_assign_pointer(syn_proxy_pre_hook, NULL);
+	rcu_barrier();
+}
+
+module_init(synproxy_tg_init);
+module_exit(synproxy_tg_exit);

^ permalink raw reply related

* Fwd: Possible bug in net/ipv4/route.c?
From: YOSHIFUJI Hideaki @ 2010-07-02  2:49 UTC (permalink / raw)
  To: netdev@vger.kernel.org, linux-kernel

Switch to netdev.

--yoshfuji

-------- Original Message --------
Subject: Possible bug in net/ipv4/route.c?
Date: Thu, 1 Jul 2010 16:00:29 -0700
From: Sol Kavy <skavy@ubicom.com>
To: <linux-kernel@vger.kernel.org>
CC: Greg Ren <gren@ubicom.com>, Guojun Jin <gjin@ubicom.com>,        Murat Sezgin <msezgin@ubicom.com>, Sener Ilgen <silgen@ubicom.com>

Found Linux: 2.6.28
Arch: Ubicom32 <not yet pushed>
Project: uCLinux based Router
Test: Bit torrent Stress Test

Note: The top of Linus git net/ipv4/route.c appears to have the same issue.

The following is a patch for clearing out IP options area in an input skb during link failure processing.  Without this patch, the icmp_send() can result in a call to ip_options_echo() where the common buffer area of the skb is incorrectly interpreted.  Depending on the previous use of the skb->cb[], the interpreted option length values can cause stack corruption by copying more than 40 bytes to the output options.

In our case, a driver is using the skb->cb[] area to hold driver specific data.   The driver is not zeroing out the area after use.  I can see three basic solutions:

1) Drivers are not allowed to use the skb->cb[] area at all.  Ubicom should modify the driver to use a different approach.

2) The layer using skb->cb[] should clear this area after use and before handing the skb to another layer.  Ubicom should modify the driver to clear the skb->cb[] area before sending it up the line.

3) Any layer that "uses" the skb->cb[] area must clear the area before use.  In which case, the proposed patch would fix the problem for the ipv4_link_failure().  I believe that this is the correct fix because I see ip_rcv() clears the skb->cb[] before using it.

Can someone confirm that this is the appropriate fix?  If this is documented somewhere, please direct me to the documentation.

Please send email to sol@ubicom.com in addition to posting your response.

Thanks,

Sol Kavy/Murat Sezgin
Ubicom, Inc.

Patch:  

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 125ee64..d13805f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1606,6 +1606,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

+       /*
+         * Since link failure can be called with skbs from many layers (see arp)
+         * the cb area of the skb must be cleared before use. Because the cb area 
+         * can be formatted according to the caller layer's cb area format and it may cause
+         * corruptions when it is handled in a different network layer.
+         */
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
        rt = skb->rtable;

The packet is enqueud by:
do_IRQ()->do_softirq()->__do_softirq()->net_rx_action()->ubi32_eth_napi_poll()->ubi32_eth_receive()->__vlan_hwaccel_rx()->netif_receive_skb()->br_handle_frame()->nf_hook_slow()->br_nf_pre_routing_finish()->br_nfr_pre_routing_finish_bridge()->neight_resolve_output()->__neigh_event_send().

The packet is then dequeued by: 
do_IRQ() -> irq_exit() -> do_softirq() -> run_timer_softirq() -> neigh_timer_handler() -> arp_error_report() -> ipv4_link_failure() -> icmp_send() -> ip_options_echo().

Because the Ubicom Ethernet driver overwrites the common buffer area, the enqueued packet contains garbage when casted as an IP options data structure.  This results in ip_options_echo() miss reading the option length information and overwriting memory.  By clearing the skb->cb[] before processing the icmp_send() against the packet, we ensure that ip_options_echo() does not corrupt memory.   




^ permalink raw reply related

* Re: [net-next-2.6 PATCH] x86: Drop CONFIG_MCORE2 check around setting of NET_IP_ALIGN
From: H. Peter Anvin @ 2010-07-02  1:15 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: davem, netdev, gospo, bphilips, Andi Kleen, Thomas Gleixner,
	Ingo Molnar, x86, Alexander Duyck
In-Reply-To: <20100701232742.15934.49030.stgit@localhost.localdomain>

On 07/01/2010 04:28 PM, Jeff Kirsher wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch removes the CONFIG_MCORE2 check from around NET_IP_ALIGN.  It is
> based on a suggestion from Andi Kleen.  The assumption is that there are
> not any x86 cores where unaligned access is really slow, and this change
> would allow for a performance improvement to still exist on configurations
> that are not necessarily optimized for Core 2.
> 
> Cc: Andi Kleen <ak@linux.intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: x86@kernel.org
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Acked-by: H. Peter Anvin <hpa@zytor.com>

	-hpa

^ permalink raw reply

* Re: pull request: wireless-next-2.6 2010-07-01
From: David Miller @ 2010-07-02  0:35 UTC (permalink / raw)
  To: linville; +Cc: linux-wireless, netdev, linux-kernel
In-Reply-To: <20100701181526.GA2356@tuxdriver.com>

From: "John W. Linville" <linville@tuxdriver.com>
Date: Thu, 1 Jul 2010 14:15:27 -0400

> Two weeks since the last request, plenty of new stuff intended for
> 2.6.36...
> 
> Included are the usual bunch of driver updates, including a big
> dump from the rt2x00 team.  This also includes cfg80211 support
> for libertas, a flurry of (mostly trivial) stuff from me, and a
> wireless-2.6 pull to resolve some patch dependencies.
> 
> Please let me know if there are problems!

This failed to pull cleanly, I got a conflict in
drivers/net/wireless/libertas/host.h  It was the
usual "__packed" vs. "__attribute__((packed))" thing.

I resolved it but I wonder if this happened because you
did the wireless-2.6 --> wireless-next-2.6 merge here.

^ permalink raw reply

* Re: [net-next-2.6 PATCH] x86: Drop CONFIG_MCORE2 check around setting of NET_IP_ALIGN
From: Stephen Hemminger @ 2010-07-02  0:26 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: davem, netdev, gospo, bphilips, Andi Kleen, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, x86, Alexander Duyck
In-Reply-To: <20100701232742.15934.49030.stgit@localhost.localdomain>

On Thu, 01 Jul 2010 16:28:27 -0700
Jeff Kirsher <jeffrey.t.kirsher@intel.com> wrote:

> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch removes the CONFIG_MCORE2 check from around NET_IP_ALIGN.  It is
> based on a suggestion from Andi Kleen.  The assumption is that there are
> not any x86 cores where unaligned access is really slow, and this change
> would allow for a performance improvement to still exist on configurations
> that are not necessarily optimized for Core 2.
> 
> Cc: Andi Kleen <ak@linux.intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: x86@kernel.org
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---

This is a good idea, but warnig it may end up masking broken
hardware. Developers of new drivers will end up never
exercising unaligned DMA, resulting in hardware 
that doesn't work on platforms that have NET_IP_ALIGN set
to 2.

-- 

^ permalink raw reply

* [net-next-2.6 PATCH] ixgbe: use NETIF_F_LRO
From: Jeff Kirsher @ 2010-07-01 23:58 UTC (permalink / raw)
  To: davem
  Cc: netdev, gospo, bphilips, Stanislaw Gruszka, Don Skidmore,
	Jeff Kirsher

From: Stanislaw Gruszka <sgruszka@redhat.com>

Both ETH_FLAG_LRO and NETIF_F_LRO have the same value, but NETIF_F_LRO
is intended to use with netdev->features.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Don Skidmore <donald.c.skidmore@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_ethtool.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index b50b5ea..5275e9c 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -2237,7 +2237,7 @@ static int ixgbe_set_flags(struct net_device *netdev, u32 data)
 				break;
 			}
 		} else if (!adapter->rx_itr_setting) {
-			netdev->features &= ~ETH_FLAG_LRO;
+			netdev->features &= ~NETIF_F_LRO;
 			if (data & ETH_FLAG_LRO)
 				e_info("rx-usecs set to 0, "
 					"LRO/RSC cannot be enabled.\n");


^ permalink raw reply related

* [net-next-2.6 PATCH 5/5] igb: Add comment
From: Jeff Kirsher @ 2010-07-01 23:39 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, bphilips, Greg Rose, Jeff Kirsher
In-Reply-To: <20100701233733.16171.4629.stgit@localhost.localdomain>

From: Greg Rose <gregory.v.rose@intel.com>

Add explanatory comment to avoid confusion when a pointer is set
to the second word of an array instead of the customary cast of a
pointer to the beginning of the array.

Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb_main.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index d811462..9cb04e2 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -4974,6 +4974,10 @@ static void igb_vf_reset_msg(struct igb_adapter *adapter, u32 vf)
 
 static int igb_set_vf_mac_addr(struct igb_adapter *adapter, u32 *msg, int vf)
 {
+	/*
+	 * The VF MAC Address is stored in a packed array of bytes
+	 * starting at the second 32 bit word of the msg array
+	 */
 	unsigned char *addr = (char *)&msg[1];
 	int err = -1;
 


^ permalink raw reply related

* [net-next-2.6 PATCH 4/5] igb: correct link test not being run when link is down
From: Jeff Kirsher @ 2010-07-01 23:39 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, bphilips, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20100701233733.16171.4629.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

The igb online link test was always reporting pass because instead of
checking for if_running it was checking for netif_carrier_ok.

This change corrects the test so that it is run if the interface is running
instead of checking for netif carrier ok.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Emil Tantilov <emil.s.tantilov@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb_ethtool.c |    8 +++-----
 1 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index f2ebf92..26bf6a1 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -1823,12 +1823,10 @@ static void igb_diag_test(struct net_device *netdev,
 		dev_info(&adapter->pdev->dev, "online testing starting\n");
 
 		/* PHY is powered down when interface is down */
-		if (!netif_carrier_ok(netdev)) {
+		if (if_running && igb_link_test(adapter, &data[4]))
+			eth_test->flags |= ETH_TEST_FL_FAILED;
+		else
 			data[4] = 0;
-		} else {
-			if (igb_link_test(adapter, &data[4]))
-				eth_test->flags |= ETH_TEST_FL_FAILED;
-		}
 
 		/* Online tests aren't run; pass by default */
 		data[0] = 0;


^ permalink raw reply related

* [net-next-2.6 PATCH 3/5] igb: Fix Tx hangs seen when loading igb with max_vfs > 7.
From: Jeff Kirsher @ 2010-07-01 23:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, bphilips, Emil Tantilov, Jeff Kirsher
In-Reply-To: <20100701233733.16171.4629.stgit@localhost.localdomain>

From: Emil Tantilov <emil.s.tantilov@intel.com>

Check the value of max_vfs at the time of assignment of vfs_allocated_count.

The previous check in igb_probe_vfs was too late as by that time the rx/tx
rings were initialized with the wrong offset.

Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb_main.c |    5 +----
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index e79689e..d811462 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -2091,9 +2091,6 @@ static void __devinit igb_probe_vfs(struct igb_adapter * adapter)
 #ifdef CONFIG_PCI_IOV
 	struct pci_dev *pdev = adapter->pdev;
 
-	if (adapter->vfs_allocated_count > 7)
-		adapter->vfs_allocated_count = 7;
-
 	if (adapter->vfs_allocated_count) {
 		adapter->vf_data = kcalloc(adapter->vfs_allocated_count,
 		                           sizeof(struct vf_data_storage),
@@ -2258,7 +2255,7 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)
 
 #ifdef CONFIG_PCI_IOV
 	if (hw->mac.type == e1000_82576)
-		adapter->vfs_allocated_count = max_vfs;
+		adapter->vfs_allocated_count = (max_vfs > 7) ? 7 : max_vfs;
 
 #endif /* CONFIG_PCI_IOV */
 	adapter->rss_queues = min_t(u32, IGB_MAX_RX_QUEUES, num_online_cpus());


^ permalink raw reply related

* [net-next-2.6 PATCH 2/5] igb: Use only a single Tx queue in SR-IOV mode
From: Jeff Kirsher @ 2010-07-01 23:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, bphilips, stable, Greg Rose, Jeff Kirsher
In-Reply-To: <20100701233733.16171.4629.stgit@localhost.localdomain>

From: Greg Rose <gregory.v.rose@intel.com>

The 82576 expects the second rx queue in any pool to receive L2 switch
loop back packets sent from the second tx queue in another pool.  The
82576 VF driver does not enable the second rx queue so if the PF driver
sends packets destined to a VF from its second tx queue then the VF
driver will never see them.  In SR-IOV mode limit the number of tx queues
used by the PF driver to one. This patch fixes a bug reported in which
the PF cannot communciate with the VF and should be considered for 2.6.34
stable.

CC: stable@kernel.org
Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb_main.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 3881918..e79689e 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -630,9 +630,6 @@ static void igb_cache_ring_register(struct igb_adapter *adapter)
 			for (; i < adapter->rss_queues; i++)
 				adapter->rx_ring[i]->reg_idx = rbase_offset +
 				                               Q_IDX_82576(i);
-			for (; j < adapter->rss_queues; j++)
-				adapter->tx_ring[j]->reg_idx = rbase_offset +
-				                               Q_IDX_82576(j);
 		}
 	case e1000_82575:
 	case e1000_82580:
@@ -996,7 +993,10 @@ static void igb_set_interrupt_capability(struct igb_adapter *adapter)
 
 	/* Number of supported queues. */
 	adapter->num_rx_queues = adapter->rss_queues;
-	adapter->num_tx_queues = adapter->rss_queues;
+	if (adapter->vfs_allocated_count)
+		adapter->num_tx_queues = 1;
+	else
+		adapter->num_tx_queues = adapter->rss_queues;
 
 	/* start with one vector for every rx queue */
 	numvecs = adapter->num_rx_queues;


^ permalink raw reply related

* [net-next-2.6 PATCH 1/5] igb: fix PHY config access on 82580
From: Jeff Kirsher @ 2010-07-01 23:37 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, bphilips, Nicholas Nunley, Jeff Kirsher

From: Nick Nunley <nicholasx.d.nunley@intel.com>

82580 NICs can have up to 4 functions. This fixes phy accesses
to use the correct locks for functions 2 and 3.

Signed-off-by: Nicholas Nunley <nicholasx.d.nunley@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/e1000_82575.c   |    8 ++++++++
 drivers/net/igb/e1000_defines.h |    2 ++
 2 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/drivers/net/igb/e1000_82575.c b/drivers/net/igb/e1000_82575.c
index 86438b5..06251a9 100644
--- a/drivers/net/igb/e1000_82575.c
+++ b/drivers/net/igb/e1000_82575.c
@@ -295,6 +295,10 @@ static s32 igb_acquire_phy_82575(struct e1000_hw *hw)
 
 	if (hw->bus.func == E1000_FUNC_1)
 		mask = E1000_SWFW_PHY1_SM;
+	else if (hw->bus.func == E1000_FUNC_2)
+		mask = E1000_SWFW_PHY2_SM;
+	else if (hw->bus.func == E1000_FUNC_3)
+		mask = E1000_SWFW_PHY3_SM;
 
 	return igb_acquire_swfw_sync_82575(hw, mask);
 }
@@ -312,6 +316,10 @@ static void igb_release_phy_82575(struct e1000_hw *hw)
 
 	if (hw->bus.func == E1000_FUNC_1)
 		mask = E1000_SWFW_PHY1_SM;
+	else if (hw->bus.func == E1000_FUNC_2)
+		mask = E1000_SWFW_PHY2_SM;
+	else if (hw->bus.func == E1000_FUNC_3)
+		mask = E1000_SWFW_PHY3_SM;
 
 	igb_release_swfw_sync_82575(hw, mask);
 }
diff --git a/drivers/net/igb/e1000_defines.h b/drivers/net/igb/e1000_defines.h
index 24d9be6..90bc29d 100644
--- a/drivers/net/igb/e1000_defines.h
+++ b/drivers/net/igb/e1000_defines.h
@@ -164,6 +164,8 @@
 #define E1000_SWFW_EEP_SM   0x1
 #define E1000_SWFW_PHY0_SM  0x2
 #define E1000_SWFW_PHY1_SM  0x4
+#define E1000_SWFW_PHY2_SM  0x20
+#define E1000_SWFW_PHY3_SM  0x40
 
 /* FACTPS Definitions */
 /* Device Control */


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox