Netdev List
 help / color / mirror / Atom feed
* [net-next 6/6] ixgbevf: Update descriptor macros to accept pointers and drop _ADV suffix
From: Jeff Kirsher @ 2012-07-18  2:20 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Greg Rose, Jeff Kirsher
In-Reply-To: <1342578045-17778-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change updates the descriptor macros to accept pointers, updates the
name to drop the _ADV suffix, and includes the IXGBEVF name in the macro.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |   12 ++++++------
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   18 +++++++++---------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index f92daca..1f13765 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -164,12 +164,12 @@ struct ixgbevf_q_vector {
 	((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \
 	(R)->next_to_clean - (R)->next_to_use - 1)
 
-#define IXGBE_RX_DESC_ADV(R, i)	    \
-	(&(((union ixgbe_adv_rx_desc *)((R).desc))[i]))
-#define IXGBE_TX_DESC_ADV(R, i)	    \
-	(&(((union ixgbe_adv_tx_desc *)((R).desc))[i]))
-#define IXGBE_TX_CTXTDESC_ADV(R, i)	    \
-	(&(((struct ixgbe_adv_tx_context_desc *)((R).desc))[i]))
+#define IXGBEVF_RX_DESC(R, i)	    \
+	(&(((union ixgbe_adv_rx_desc *)((R)->desc))[i]))
+#define IXGBEVF_TX_DESC(R, i)	    \
+	(&(((union ixgbe_adv_tx_desc *)((R)->desc))[i]))
+#define IXGBEVF_TX_CTXTDESC(R, i)	    \
+	(&(((struct ixgbe_adv_tx_context_desc *)((R)->desc))[i]))
 
 #define IXGBE_MAX_JUMBO_FRAME_SIZE        16128
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 8e022c6..c98cdf7 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -195,7 +195,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 
 	i = tx_ring->next_to_clean;
 	eop = tx_ring->tx_buffer_info[i].next_to_watch;
-	eop_desc = IXGBE_TX_DESC_ADV(*tx_ring, eop);
+	eop_desc = IXGBEVF_TX_DESC(tx_ring, eop);
 
 	while ((eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)) &&
 	       (count < tx_ring->count)) {
@@ -206,7 +206,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 			goto cont_loop;
 		for ( ; !cleaned; count++) {
 			struct sk_buff *skb;
-			tx_desc = IXGBE_TX_DESC_ADV(*tx_ring, i);
+			tx_desc = IXGBEVF_TX_DESC(tx_ring, i);
 			tx_buffer_info = &tx_ring->tx_buffer_info[i];
 			cleaned = (i == eop);
 			skb = tx_buffer_info->skb;
@@ -235,7 +235,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 
 cont_loop:
 		eop = tx_ring->tx_buffer_info[i].next_to_watch;
-		eop_desc = IXGBE_TX_DESC_ADV(*tx_ring, eop);
+		eop_desc = IXGBEVF_TX_DESC(tx_ring, eop);
 	}
 
 	tx_ring->next_to_clean = i;
@@ -339,7 +339,7 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_adapter *adapter,
 	bi = &rx_ring->rx_buffer_info[i];
 
 	while (cleaned_count--) {
-		rx_desc = IXGBE_RX_DESC_ADV(*rx_ring, i);
+		rx_desc = IXGBEVF_RX_DESC(rx_ring, i);
 		skb = bi->skb;
 		if (!skb) {
 			skb = netdev_alloc_skb(adapter->netdev,
@@ -405,7 +405,7 @@ static bool ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
 
 	i = rx_ring->next_to_clean;
-	rx_desc = IXGBE_RX_DESC_ADV(*rx_ring, i);
+	rx_desc = IXGBEVF_RX_DESC(rx_ring, i);
 	staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
 	rx_buffer_info = &rx_ring->rx_buffer_info[i];
 
@@ -432,7 +432,7 @@ static bool ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 		if (i == rx_ring->count)
 			i = 0;
 
-		next_rxd = IXGBE_RX_DESC_ADV(*rx_ring, i);
+		next_rxd = IXGBEVF_RX_DESC(rx_ring, i);
 		prefetch(next_rxd);
 		cleaned_count++;
 
@@ -2437,7 +2437,7 @@ static int ixgbevf_tso(struct ixgbevf_adapter *adapter,
 		i = tx_ring->next_to_use;
 
 		tx_buffer_info = &tx_ring->tx_buffer_info[i];
-		context_desc = IXGBE_TX_CTXTDESC_ADV(*tx_ring, i);
+		context_desc = IXGBEVF_TX_CTXTDESC(tx_ring, i);
 
 		/* VLAN MACLEN IPLEN */
 		if (tx_flags & IXGBE_TX_FLAGS_VLAN)
@@ -2497,7 +2497,7 @@ static bool ixgbevf_tx_csum(struct ixgbevf_adapter *adapter,
 	    (tx_flags & IXGBE_TX_FLAGS_VLAN)) {
 		i = tx_ring->next_to_use;
 		tx_buffer_info = &tx_ring->tx_buffer_info[i];
-		context_desc = IXGBE_TX_CTXTDESC_ADV(*tx_ring, i);
+		context_desc = IXGBEVF_TX_CTXTDESC(tx_ring, i);
 
 		if (tx_flags & IXGBE_TX_FLAGS_VLAN)
 			vlan_macip_lens |= (tx_flags &
@@ -2700,7 +2700,7 @@ static void ixgbevf_tx_queue(struct ixgbevf_adapter *adapter,
 	i = tx_ring->next_to_use;
 	while (count--) {
 		tx_buffer_info = &tx_ring->tx_buffer_info[i];
-		tx_desc = IXGBE_TX_DESC_ADV(*tx_ring, i);
+		tx_desc = IXGBEVF_TX_DESC(tx_ring, i);
 		tx_desc->read.buffer_addr = cpu_to_le64(tx_buffer_info->dma);
 		tx_desc->read.cmd_type_len =
 			cpu_to_le32(cmd_type_len | tx_buffer_info->length);
-- 
1.7.10.4

^ permalink raw reply related

* [net-next 5/6] ixgbe: Cleanup logic for MRQC and MTQC configuration
From: Jeff Kirsher @ 2012-07-18  2:20 UTC (permalink / raw)
  To: davem
  Cc: Alexander Duyck, netdev, gospo, sassmann, John Fastabend,
	Jeff Kirsher
In-Reply-To: <1342578045-17778-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change is meant to make the code much more readable for MTQC and MRQC
configuration.

The big change is that I simplified much of the logic so that we are
essentially handling just 4 cases and their variants. In the cases where
RSS is disabled we are actually just programming the RETA table with all
1s resulting in a single queue RSS. In the case of SR-IOV I am treating
that as a subset of VMDq. This all results in the following configuration
for the hardware:
         DCB
         En       Dis
VMDq En  VMDQ/DCB VMDq/RSS
     Dis DCB/RSS  RSS

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Stephen Ko <stephen.s.ko@intel.com>
Tested-by: Ross Brattain <ross.b.brattain@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  116 ++++++++++++++-----------
 1 file changed, 66 insertions(+), 50 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 32c8cd6..2b4b791 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2719,8 +2719,7 @@ void ixgbe_configure_tx_ring(struct ixgbe_adapter *adapter,
 static void ixgbe_setup_mtqc(struct ixgbe_adapter *adapter)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
-	u32 rttdcs;
-	u32 reg;
+	u32 rttdcs, mtqc;
 	u8 tcs = netdev_get_num_tc(adapter->netdev);
 
 	if (hw->mac.type == ixgbe_mac_82598EB)
@@ -2732,28 +2731,32 @@ static void ixgbe_setup_mtqc(struct ixgbe_adapter *adapter)
 	IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, rttdcs);
 
 	/* set transmit pool layout */
-	switch (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) {
-	case (IXGBE_FLAG_SRIOV_ENABLED):
-		IXGBE_WRITE_REG(hw, IXGBE_MTQC,
-				(IXGBE_MTQC_VT_ENA | IXGBE_MTQC_64VF));
-		break;
-	default:
-		if (!tcs)
-			reg = IXGBE_MTQC_64Q_1PB;
-		else if (tcs <= 4)
-			reg = IXGBE_MTQC_RT_ENA | IXGBE_MTQC_4TC_4TQ;
+	if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) {
+		mtqc = IXGBE_MTQC_VT_ENA;
+		if (tcs > 4)
+			mtqc |= IXGBE_MTQC_RT_ENA | IXGBE_MTQC_8TC_8TQ;
+		else if (tcs > 1)
+			mtqc |= IXGBE_MTQC_RT_ENA | IXGBE_MTQC_4TC_4TQ;
+		else if (adapter->ring_feature[RING_F_RSS].indices == 4)
+			mtqc |= IXGBE_MTQC_32VF;
 		else
-			reg = IXGBE_MTQC_RT_ENA | IXGBE_MTQC_8TC_8TQ;
+			mtqc |= IXGBE_MTQC_64VF;
+	} else {
+		if (tcs > 4)
+			mtqc = IXGBE_MTQC_RT_ENA | IXGBE_MTQC_8TC_8TQ;
+		else if (tcs > 1)
+			mtqc = IXGBE_MTQC_RT_ENA | IXGBE_MTQC_4TC_4TQ;
+		else
+			mtqc = IXGBE_MTQC_64Q_1PB;
+	}
 
-		IXGBE_WRITE_REG(hw, IXGBE_MTQC, reg);
+	IXGBE_WRITE_REG(hw, IXGBE_MTQC, mtqc);
 
-		/* Enable Security TX Buffer IFG for multiple pb */
-		if (tcs) {
-			reg = IXGBE_READ_REG(hw, IXGBE_SECTXMINIFG);
-			reg |= IXGBE_SECTX_DCB;
-			IXGBE_WRITE_REG(hw, IXGBE_SECTXMINIFG, reg);
-		}
-		break;
+	/* Enable Security TX Buffer IFG for multiple pb */
+	if (tcs) {
+		u32 sectx = IXGBE_READ_REG(hw, IXGBE_SECTXMINIFG);
+		sectx |= IXGBE_SECTX_DCB;
+		IXGBE_WRITE_REG(hw, IXGBE_SECTXMINIFG, sectx);
 	}
 
 	/* re-enable the arbiter */
@@ -2886,11 +2889,18 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter)
 	u32 mrqc = 0, reta = 0;
 	u32 rxcsum;
 	int i, j;
-	u8 tcs = netdev_get_num_tc(adapter->netdev);
-	int maxq = adapter->ring_feature[RING_F_RSS].indices;
+	u16 rss_i = adapter->ring_feature[RING_F_RSS].indices;
+
+	if (!(adapter->flags & IXGBE_FLAG_RSS_ENABLED))
+		rss_i = 1;
 
-	if (tcs)
-		maxq = min(maxq, adapter->num_tx_queues / tcs);
+	/*
+	 * Program table for at least 2 queues w/ SR-IOV so that VFs can
+	 * make full use of any rings they may have.  We will use the
+	 * PSRTYPE register to control how many rings we use within the PF.
+	 */
+	if ((adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) && (rss_i < 2))
+		rss_i = 2;
 
 	/* Fill out hash function seeds */
 	for (i = 0; i < 10; i++)
@@ -2898,7 +2908,7 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter)
 
 	/* Fill out redirection table */
 	for (i = 0, j = 0; i < 128; i++, j++) {
-		if (j == maxq)
+		if (j == rss_i)
 			j = 0;
 		/* reta = 4-byte sliding window of
 		 * 0x00..(indices-1)(indices-1)00..etc. */
@@ -2912,35 +2922,36 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter)
 	rxcsum |= IXGBE_RXCSUM_PCSD;
 	IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum);
 
-	if (adapter->hw.mac.type == ixgbe_mac_82598EB &&
-	    (adapter->flags & IXGBE_FLAG_RSS_ENABLED)) {
-		mrqc = IXGBE_MRQC_RSSEN;
+	if (adapter->hw.mac.type == ixgbe_mac_82598EB) {
+		if (adapter->flags & IXGBE_FLAG_RSS_ENABLED)
+			mrqc = IXGBE_MRQC_RSSEN;
 	} else {
-		int mask = adapter->flags & (IXGBE_FLAG_RSS_ENABLED
-					     | IXGBE_FLAG_SRIOV_ENABLED);
-
-		switch (mask) {
-		case (IXGBE_FLAG_RSS_ENABLED):
-			if (!tcs)
-				mrqc = IXGBE_MRQC_RSSEN;
-			else if (tcs <= 4)
-				mrqc = IXGBE_MRQC_RTRSS4TCEN;
+		u8 tcs = netdev_get_num_tc(adapter->netdev);
+
+		if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) {
+			if (tcs > 4)
+				mrqc = IXGBE_MRQC_VMDQRT8TCEN;	/* 8 TCs */
+			else if (tcs > 1)
+				mrqc = IXGBE_MRQC_VMDQRT4TCEN;	/* 4 TCs */
+			else if (adapter->ring_feature[RING_F_RSS].indices == 4)
+				mrqc = IXGBE_MRQC_VMDQRSS32EN;
 			else
+				mrqc = IXGBE_MRQC_VMDQRSS64EN;
+		} else {
+			if (tcs > 4)
 				mrqc = IXGBE_MRQC_RTRSS8TCEN;
-			break;
-		case (IXGBE_FLAG_SRIOV_ENABLED):
-			mrqc = IXGBE_MRQC_VMDQEN;
-			break;
-		default:
-			break;
+			else if (tcs > 1)
+				mrqc = IXGBE_MRQC_RTRSS4TCEN;
+			else
+				mrqc = IXGBE_MRQC_RSSEN;
 		}
 	}
 
 	/* Perform hash on these packet types */
-	mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4
-	      | IXGBE_MRQC_RSS_FIELD_IPV4_TCP
-	      | IXGBE_MRQC_RSS_FIELD_IPV6
-	      | IXGBE_MRQC_RSS_FIELD_IPV6_TCP;
+	mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4 |
+		IXGBE_MRQC_RSS_FIELD_IPV4_TCP |
+		IXGBE_MRQC_RSS_FIELD_IPV6 |
+		IXGBE_MRQC_RSS_FIELD_IPV6_TCP;
 
 	if (adapter->flags2 & IXGBE_FLAG2_RSS_FIELD_IPV4_UDP)
 		mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4_UDP;
@@ -3103,8 +3114,13 @@ static void ixgbe_setup_psrtype(struct ixgbe_adapter *adapter)
 	if (hw->mac.type == ixgbe_mac_82598EB)
 		return;
 
-	if (adapter->flags & IXGBE_FLAG_RSS_ENABLED)
-		psrtype |= (adapter->num_rx_queues_per_pool << 29);
+	if (adapter->flags & IXGBE_FLAG_RSS_ENABLED) {
+		int rss_i = adapter->ring_feature[RING_F_RSS].indices;
+		if (rss_i > 3)
+			psrtype |= 2 << 29;
+		else if (rss_i > 1)
+			psrtype |= 1 << 29;
+	}
 
 	for (p = 0; p < adapter->num_rx_pools; p++)
 		IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(adapter->num_vfs + p),
-- 
1.7.10.4

^ permalink raw reply related

* [net-next 4/6] ixgbe: Update the logic for ixgbe_cache_ring_dcb and DCB RSS configuration
From: Jeff Kirsher @ 2012-07-18  2:20 UTC (permalink / raw)
  To: davem
  Cc: Alexander Duyck, netdev, gospo, sassmann, John Fastabend,
	Jeff Kirsher
In-Reply-To: <1342578045-17778-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change cleans up some of the logic in an attempt to try and simplify
things for how we are configuring DCB w/ RSS.

In this patch I basically did 3 things.  I updated the logic for getting
the first register index.  I applied the fact that all TCs get the same
number of queues to simplify the looping logic in caching the DCB ring
register.  Finally I updated how we configure the RQTC register to match
the fact that all TCs are assigned the same number of queues.

Cc: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Tested-by: Ross Brattain <ross.b.brattain@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c  |   80 ++++++++++++-------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   12 ++--
 2 files changed, 42 insertions(+), 50 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
index c03d771..4c3822f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -42,42 +42,37 @@ static void ixgbe_get_first_reg_idx(struct ixgbe_adapter *adapter, u8 tc,
 
 	switch (hw->mac.type) {
 	case ixgbe_mac_82598EB:
-		*tx = tc << 2;
-		*rx = tc << 3;
+		/* TxQs/TC: 4	RxQs/TC: 8 */
+		*tx = tc << 2; /* 0, 4,  8, 12, 16, 20, 24, 28 */
+		*rx = tc << 3; /* 0, 8, 16, 24, 32, 40, 48, 56 */
 		break;
 	case ixgbe_mac_82599EB:
 	case ixgbe_mac_X540:
 		if (num_tcs > 4) {
-			if (tc < 3) {
-				*tx = tc << 5;
-				*rx = tc << 4;
-			} else if (tc <  5) {
-				*tx = ((tc + 2) << 4);
-				*rx = tc << 4;
-			} else if (tc < num_tcs) {
-				*tx = ((tc + 8) << 3);
-				*rx = tc << 4;
-			}
+			/*
+			 * TCs    : TC0/1 TC2/3 TC4-7
+			 * TxQs/TC:    32    16     8
+			 * RxQs/TC:    16    16    16
+			 */
+			*rx = tc << 4;
+			if (tc < 3)
+				*tx = tc << 5;		/*   0,  32,  64 */
+			else if (tc < 5)
+				*tx = (tc + 2) << 4;	/*  80,  96 */
+			else
+				*tx = (tc + 8) << 3;	/* 104, 112, 120 */
 		} else {
-			*rx =  tc << 5;
-			switch (tc) {
-			case 0:
-				*tx =  0;
-				break;
-			case 1:
-				*tx = 64;
-				break;
-			case 2:
-				*tx = 96;
-				break;
-			case 3:
-				*tx = 112;
-				break;
-			default:
-				break;
-			}
+			/*
+			 * TCs    : TC0 TC1 TC2/3
+			 * TxQs/TC:  64  32    16
+			 * RxQs/TC:  32  32    32
+			 */
+			*rx = tc << 5;
+			if (tc < 2)
+				*tx = tc << 6;		/*  0,  64 */
+			else
+				*tx = (tc + 4) << 4;	/* 96, 112 */
 		}
-		break;
 	default:
 		break;
 	}
@@ -90,25 +85,26 @@ static void ixgbe_get_first_reg_idx(struct ixgbe_adapter *adapter, u8 tc,
  * Cache the descriptor ring offsets for DCB to the assigned rings.
  *
  **/
-static inline bool ixgbe_cache_ring_dcb(struct ixgbe_adapter *adapter)
+static bool ixgbe_cache_ring_dcb(struct ixgbe_adapter *adapter)
 {
 	struct net_device *dev = adapter->netdev;
-	int i, j, k;
+	unsigned int tx_idx, rx_idx;
+	int tc, offset, rss_i, i;
 	u8 num_tcs = netdev_get_num_tc(dev);
 
-	if (!num_tcs)
+	/* verify we have DCB queueing enabled before proceeding */
+	if (num_tcs <= 1)
 		return false;
 
-	for (i = 0, k = 0; i < num_tcs; i++) {
-		unsigned int tx_s, rx_s;
-		u16 count = dev->tc_to_txq[i].count;
+	rss_i = adapter->ring_feature[RING_F_RSS].indices;
 
-		ixgbe_get_first_reg_idx(adapter, i, &tx_s, &rx_s);
-		for (j = 0; j < count; j++, k++) {
-			adapter->tx_ring[k]->reg_idx = tx_s + j;
-			adapter->rx_ring[k]->reg_idx = rx_s + j;
-			adapter->tx_ring[k]->dcb_tc = i;
-			adapter->rx_ring[k]->dcb_tc = i;
+	for (tc = 0, offset = 0; tc < num_tcs; tc++, offset += rss_i) {
+		ixgbe_get_first_reg_idx(adapter, tc, &tx_idx, &rx_idx);
+		for (i = 0; i < rss_i; i++, tx_idx++, rx_idx++) {
+			adapter->tx_ring[offset + i]->reg_idx = tx_idx;
+			adapter->rx_ring[offset + i]->reg_idx = rx_idx;
+			adapter->tx_ring[offset + i]->dcb_tc = tc;
+			adapter->rx_ring[offset + i]->dcb_tc = tc;
 		}
 	}
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 7f2aa22..32c8cd6 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -3608,20 +3608,16 @@ static void ixgbe_configure_dcb(struct ixgbe_adapter *adapter)
 
 	/* Enable RSS Hash per TC */
 	if (hw->mac.type != ixgbe_mac_82598EB) {
-		int i;
-		u32 reg = 0;
-		u8 msb = 0;
-		u8 rss_i = adapter->netdev->tc_to_txq[0].count - 1;
+		u32 msb = 0;
+		u16 rss_i = adapter->ring_feature[RING_F_RSS].indices - 1;
 
 		while (rss_i) {
 			msb++;
 			rss_i >>= 1;
 		}
 
-		for (i = 0; i < MAX_TRAFFIC_CLASS; i++)
-			reg |= msb << IXGBE_RQTC_SHIFT_TC(i);
-
-		IXGBE_WRITE_REG(hw, IXGBE_RQTC, reg);
+		/* write msb to all 8 TCs in one write */
+		IXGBE_WRITE_REG(hw, IXGBE_RQTC, msb * 0x11111111);
 	}
 }
 #endif
-- 
1.7.10.4

^ permalink raw reply related

* [net-next 3/6] ixgbe: Move configuration of set_real_num_rx/tx_queues into open
From: Jeff Kirsher @ 2012-07-18  2:20 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1342578045-17778-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

It makes much more sense for us to configure the real number of Tx and Rx
queues in the ixgbe_open call than it does in ixgbe_set_num_queues.  By
setting the number in ixgbe_open we can avoid a number of unecessary
updates and only have to make the calls once.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c  |   58 ++++++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   36 ++++++++++-----
 2 files changed, 38 insertions(+), 56 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
index d308e71..c03d771 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -349,7 +349,7 @@ static bool ixgbe_set_rss_queues(struct ixgbe_adapter *adapter)
  * fallthrough conditions.
  *
  **/
-static int ixgbe_set_num_queues(struct ixgbe_adapter *adapter)
+static void ixgbe_set_num_queues(struct ixgbe_adapter *adapter)
 {
 	/* Start with base case */
 	adapter->num_rx_queues = 1;
@@ -358,29 +358,14 @@ static int ixgbe_set_num_queues(struct ixgbe_adapter *adapter)
 	adapter->num_rx_queues_per_pool = 1;
 
 	if (ixgbe_set_sriov_queues(adapter))
-		goto done;
+		return;
 
 #ifdef CONFIG_IXGBE_DCB
 	if (ixgbe_set_dcb_queues(adapter))
-		goto done;
+		return;
 
 #endif
-	if (ixgbe_set_rss_queues(adapter))
-		goto done;
-
-	/* fallback to base case */
-	adapter->num_rx_queues = 1;
-	adapter->num_tx_queues = 1;
-
-done:
-	if ((adapter->netdev->reg_state == NETREG_UNREGISTERED) ||
-	    (adapter->netdev->reg_state == NETREG_UNREGISTERING))
-		return 0;
-
-	/* Notify the stack of the (possibly) reduced queue counts. */
-	netif_set_real_num_tx_queues(adapter->netdev, adapter->num_tx_queues);
-	return netif_set_real_num_rx_queues(adapter->netdev,
-					    adapter->num_rx_queues);
+	ixgbe_set_rss_queues(adapter);
 }
 
 static void ixgbe_acquire_msix_vectors(struct ixgbe_adapter *adapter,
@@ -710,11 +695,10 @@ static void ixgbe_reset_interrupt_capability(struct ixgbe_adapter *adapter)
  * Attempt to configure the interrupts using the best available
  * capabilities of the hardware and the kernel.
  **/
-static int ixgbe_set_interrupt_capability(struct ixgbe_adapter *adapter)
+static void ixgbe_set_interrupt_capability(struct ixgbe_adapter *adapter)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
-	int err = 0;
-	int vector, v_budget;
+	int vector, v_budget, err;
 
 	/*
 	 * It's easy to be greedy for MSI-X vectors, but it really
@@ -747,7 +731,7 @@ static int ixgbe_set_interrupt_capability(struct ixgbe_adapter *adapter)
 		ixgbe_acquire_msix_vectors(adapter, v_budget);
 
 		if (adapter->flags & IXGBE_FLAG_MSIX_ENABLED)
-			goto out;
+			return;
 	}
 
 	adapter->flags &= ~IXGBE_FLAG_DCB_ENABLED;
@@ -762,25 +746,17 @@ static int ixgbe_set_interrupt_capability(struct ixgbe_adapter *adapter)
 	if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)
 		ixgbe_disable_sriov(adapter);
 
-	err = ixgbe_set_num_queues(adapter);
-	if (err)
-		return err;
-
+	ixgbe_set_num_queues(adapter);
 	adapter->num_q_vectors = 1;
 
 	err = pci_enable_msi(adapter->pdev);
-	if (!err) {
-		adapter->flags |= IXGBE_FLAG_MSI_ENABLED;
-	} else {
+	if (err) {
 		netif_printk(adapter, hw, KERN_DEBUG, adapter->netdev,
 			     "Unable to allocate MSI interrupt, "
 			     "falling back to legacy.  Error: %d\n", err);
-		/* reset err */
-		err = 0;
+		return;
 	}
-
-out:
-	return err;
+	adapter->flags |= IXGBE_FLAG_MSI_ENABLED;
 }
 
 /**
@@ -798,15 +774,10 @@ int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter)
 	int err;
 
 	/* Number of supported queues */
-	err = ixgbe_set_num_queues(adapter);
-	if (err)
-		return err;
+	ixgbe_set_num_queues(adapter);
 
-	err = ixgbe_set_interrupt_capability(adapter);
-	if (err) {
-		e_dev_err("Unable to setup interrupt capabilities\n");
-		goto err_set_interrupt;
-	}
+	/* Set interrupt mode */
+	ixgbe_set_interrupt_capability(adapter);
 
 	err = ixgbe_alloc_q_vectors(adapter);
 	if (err) {
@@ -826,7 +797,6 @@ int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter)
 
 err_alloc_q_vectors:
 	ixgbe_reset_interrupt_capability(adapter);
-err_set_interrupt:
 	return err;
 }
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 373342f..7f2aa22 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -4798,10 +4798,26 @@ static int ixgbe_open(struct net_device *netdev)
 	if (err)
 		goto err_req_irq;
 
+	/* Notify the stack of the actual queue counts. */
+	err = netif_set_real_num_tx_queues(netdev,
+					   adapter->num_rx_pools > 1 ? 1 :
+					   adapter->num_tx_queues);
+	if (err)
+		goto err_set_queues;
+
+
+	err = netif_set_real_num_rx_queues(netdev,
+					   adapter->num_rx_pools > 1 ? 1 :
+					   adapter->num_rx_queues);
+	if (err)
+		goto err_set_queues;
+
 	ixgbe_up_complete(adapter);
 
 	return 0;
 
+err_set_queues:
+	ixgbe_free_irq(adapter);
 err_req_irq:
 	ixgbe_free_all_rx_resources(adapter);
 err_setup_rx:
@@ -4864,23 +4880,19 @@ static int ixgbe_resume(struct pci_dev *pdev)
 
 	pci_wake_from_d3(pdev, false);
 
-	rtnl_lock();
-	err = ixgbe_init_interrupt_scheme(adapter);
-	rtnl_unlock();
-	if (err) {
-		e_dev_err("Cannot initialize interrupts for device\n");
-		return err;
-	}
-
 	ixgbe_reset(adapter);
 
 	IXGBE_WRITE_REG(&adapter->hw, IXGBE_WUS, ~0);
 
-	if (netif_running(netdev)) {
+	rtnl_lock();
+	err = ixgbe_init_interrupt_scheme(adapter);
+	if (!err && netif_running(netdev))
 		err = ixgbe_open(netdev);
-		if (err)
-			return err;
-	}
+
+	rtnl_unlock();
+
+	if (err)
+		return err;
 
 	netif_device_attach(netdev);
 
-- 
1.7.10.4

^ permalink raw reply related

* [net-next 2/6] ixgbe: Handle failures in the ixgbe_setup_rx/tx_resources calls
From: Jeff Kirsher @ 2012-07-18  2:20 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1342578045-17778-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

Previously we were exiting without cleaning up the memory internally on the
ixgbe_setup_rx_resources and ixgbe_setup_tx_resources calls.  Instead of
forcing the caller to clean things up for us we should instead just unwind
the rings and free the memory as we go.  This way we can more gracefully
clean up the rings in the event of an allocation failure.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 17f46f0..373342f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -4549,10 +4549,16 @@ static int ixgbe_setup_all_tx_resources(struct ixgbe_adapter *adapter)
 		err = ixgbe_setup_tx_resources(adapter->tx_ring[i]);
 		if (!err)
 			continue;
+
 		e_err(probe, "Allocation for Tx Queue %u failed\n", i);
-		break;
+		goto err_setup_tx;
 	}
 
+	return 0;
+err_setup_tx:
+	/* rewind the index freeing the rings as we go */
+	while (i--)
+		ixgbe_free_tx_resources(adapter->tx_ring[i]);
 	return err;
 }
 
@@ -4627,10 +4633,16 @@ static int ixgbe_setup_all_rx_resources(struct ixgbe_adapter *adapter)
 		err = ixgbe_setup_rx_resources(adapter->rx_ring[i]);
 		if (!err)
 			continue;
+
 		e_err(probe, "Allocation for Rx Queue %u failed\n", i);
-		break;
+		goto err_setup_rx;
 	}
 
+	return 0;
+err_setup_rx:
+	/* rewind the index freeing the rings as we go */
+	while (i--)
+		ixgbe_free_rx_resources(adapter->rx_ring[i]);
 	return err;
 }
 
@@ -4791,10 +4803,10 @@ static int ixgbe_open(struct net_device *netdev)
 	return 0;
 
 err_req_irq:
-err_setup_rx:
 	ixgbe_free_all_rx_resources(adapter);
-err_setup_tx:
+err_setup_rx:
 	ixgbe_free_all_tx_resources(adapter);
+err_setup_tx:
 	ixgbe_reset(adapter);
 
 	return err;
-- 
1.7.10.4

^ permalink raw reply related

* [net-next 0/6][pull request] Intel Wired LAN Driver Updates
From: Jeff Kirsher @ 2012-07-18  2:20 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, sassmann

This series contains updates to ixgbe & ixgbevf.

The following are changes since commit 5abf7f7e0f6bdbfcac737f636497d7016d9507eb:
  ipv4: fix rcu splat
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-next master

Alexander Duyck (6):
  ixgbe: Ping the VFs on link status change to trigger link change
  ixgbe: Handle failures in the ixgbe_setup_rx/tx_resources calls
  ixgbe: Move configuration of set_real_num_rx/tx_queues into open
  ixgbe: Update the logic for ixgbe_cache_ring_dcb and DCB RSS
    configuration
  ixgbe: Cleanup logic for MRQC and MTQC configuration
  ixgbevf: Update descriptor macros to accept pointers and drop _ADV
    suffix

 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c      |  138 ++++++---------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  190 +++++++++++++--------
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |   12 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   18 +-
 4 files changed, 183 insertions(+), 175 deletions(-)

-- 
1.7.10.4

^ permalink raw reply

* [net-next 1/6] ixgbe: Ping the VFs on link status change to trigger link change
From: Jeff Kirsher @ 2012-07-18  2:20 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1342578045-17778-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

When the link status changes on the PF we need to notify the VFs. In order
to do this we should ping all of the VFs in order to trigger a link status
change on them as well.

This fixes issues in which the PF would reset, but the VF didn't because the
NAK flag was not set in the VF mailbox.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ee230f5..17f46f0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5390,6 +5390,9 @@ static void ixgbe_watchdog_link_is_up(struct ixgbe_adapter *adapter)
 
 	netif_carrier_on(netdev);
 	ixgbe_check_vf_rate_limit(adapter);
+
+	/* ping all the active vfs to let them know link has changed */
+	ixgbe_ping_all_vfs(adapter);
 }
 
 /**
@@ -5419,6 +5422,9 @@ static void ixgbe_watchdog_link_is_down(struct ixgbe_adapter *adapter)
 
 	e_info(drv, "NIC Link is Down\n");
 	netif_carrier_off(netdev);
+
+	/* ping all the active vfs to let them know link has changed */
+	ixgbe_ping_all_vfs(adapter);
 }
 
 /**
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH v2] b44: add 64 bit stats
From: Kevin Groeneveld @ 2012-07-18  2:02 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20120716.230806.242760837075045729.davem@davemloft.net>

On Tue, Jul 17, 2012 at 2:08 AM, David Miller <davem@davemloft.net> wrote:
> This patch was corrupted by your email client and is therefore
> unusable.

If I resend the patch should I bump the version number in the subject?

Should I include "Acked-by" lines that people have posted?

I keep sending myself test messages with the patch but the white space
is always mangled.  I am not sure if Thunderbird is mangling it in the
sent message or the received message... :(

^ permalink raw reply

* Re: [RFC PATCH] net: cgroup: null ptr dereference in netprio cgroup during init
From: Gao feng @ 2012-07-18  1:59 UTC (permalink / raw)
  To: John Fastabend; +Cc: davem, nhorman, mark.d.rustad, netdev, eric.dumazet
In-Reply-To: <20120718003316.2979.49278.stgit@jf-dev1-dcblab>

于 2012年07月18日 08:33, John Fastabend 写道:
> When the netprio cgroup is built in the kernel cgroup_init will call
> cgrp_create which eventually calls update_netdev_tables. This is
> being called before do_initcalls() so a null ptr dereference occurs
> on init_net.
> 
> This patch adds a check on init_net.count to verify the structure
> has been initialized. The failure was introduced here,
> 
> commit ef209f15980360f6945873df3cd710c5f62f2a3e
> Author: Gao feng <gaofeng@cn.fujitsu.com>
> Date:   Wed Jul 11 21:50:15 2012 +0000
> 
>     net: cgroup: fix access the unallocated memory in netprio cgroup
> 
> Tested with ping with netprio_cgroup as a module and built in.
> 
> Marked RFC for now I think DaveM might have a reason why this needs
> some improvement.
> 
> Reported-by: Mark Rustad <mark.d.rustad@intel.com>
> Cc: Neil Horman <nhorman@tuxdriver.com>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Gao feng <gaofeng@cn.fujitsu.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
> 
>  net/core/netprio_cgroup.c |    3 +++
>  1 files changed, 3 insertions(+), 0 deletions(-)
> 
> diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
> index b2e9caa..e9fd7fd 100644
> --- a/net/core/netprio_cgroup.c
> +++ b/net/core/netprio_cgroup.c
> @@ -116,6 +116,9 @@ static int update_netdev_tables(void)
>  	u32 max_len;
>  	struct netprio_map *map;


Thanks John.
It's my mistake.

Can we make sure init_net.count is zero here?
I can't find any place that initializes it to zero.

>  
> +	if (!atomic_read(&init_net.count))
> +		return ret;
> +
>  	rtnl_lock();
>  	max_len = atomic_read(&max_prioidx) + 1;
>  	for_each_netdev(&init_net, dev) {
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH 0/5] Long term PMTU/redirect storage in ipv4.
From: Julian Anastasov @ 2012-07-18  1:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120717.150920.1324071045620152376.davem@davemloft.net>


	Hello,

On Tue, 17 Jul 2012, David Miller wrote:

> From: Julian Anastasov <ja@ssi.bg>
> Date: Wed, 18 Jul 2012 01:14:04 +0300 (EEST)
> 
> > 	Aha, I see. Something around fnhe_oldest() and its
> > daddr arg does not look good. If the goal is to hijack
> > some entry, probably for another daddr and comparing it with
> > tcpm_new(), may be we should remove this daddr arg and fully
> > reset all parameters such as fnhe_pmtu, fnhe_gw, fnhe_expires
> > because the find_or_create_fnhe() callers modify only specific
> > fields, we should not end up with wrong gateway inherited from
> > another daddr, for example.
> 
> Better would be to use a seqlock when reading its values.
> 
> Either way, patches welcome :-)

	I created a patch with seqlock usage. This version
uses a global seqlock because I'm not sure if 2048 locks
per NH are a good idea. This is only compile tested.
After comments, maybe I will have to resubmit in a separate message.


Subject: [PATCH] ipv4: use seqlock for nh_exceptions

From: Julian Anastasov <ja@ssi.bg>

	Use global seqlock for the nh_exceptions. Call
fnhe_oldest with the right hash chain. Correct the diff
value for dst_set_expires.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 include/net/ip_fib.h |    2 +-
 net/ipv4/route.c     |  117 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 76 insertions(+), 43 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e9ee1ca..2daf096 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -51,7 +51,7 @@ struct fib_nh_exception {
 	struct fib_nh_exception __rcu	*fnhe_next;
 	__be32				fnhe_daddr;
 	u32				fnhe_pmtu;
-	u32				fnhe_gw;
+	__be32				fnhe_gw;
 	unsigned long			fnhe_expires;
 	unsigned long			fnhe_stamp;
 };
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f67e702..e037c73 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1334,8 +1334,9 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 }
 
 static DEFINE_SPINLOCK(fnhe_lock);
+static DEFINE_SEQLOCK(fnhe_seqlock);
 
-static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 {
 	struct fib_nh_exception *fnhe, *oldest;
 
@@ -1358,47 +1359,76 @@ static inline u32 fnhe_hashfun(__be32 daddr)
 	return hval & (FNHE_HASH_SIZE - 1);
 }
 
-static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
+static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
+				  u32 pmtu, unsigned long expires)
 {
 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
 	struct fib_nh_exception *fnhe;
 	int depth;
 	u32 hval;
 
-	if (!hash) {
-		hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
-						   GFP_ATOMIC);
-		if (!hash)
-			return NULL;
-	}
+	if (!hash)
+		goto start;
 
+repeat:
 	hval = fnhe_hashfun(daddr);
 	hash += hval;
 
 	depth = 0;
+	write_seqlock_bh(&fnhe_seqlock);
 	for (fnhe = rcu_dereference(hash->chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 		if (fnhe->fnhe_daddr == daddr)
-			goto out;
+			break;
 		depth++;
 	}
 
-	if (depth > FNHE_RECLAIM_DEPTH) {
-		fnhe = fnhe_oldest(hash + hval, daddr);
-		goto out_daddr;
-	}
-	fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
-	if (!fnhe)
-		return NULL;
+	if (fnhe) {
+		if (gw)
+			fnhe->fnhe_gw = gw;
+		if (pmtu) {
+			fnhe->fnhe_pmtu = pmtu;
+			fnhe->fnhe_expires = expires;
+		}
+	} else {
+		if (depth > FNHE_RECLAIM_DEPTH)
+			fnhe = fnhe_oldest(hash);
+		else {
+			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+			if (!fnhe) {
+				write_sequnlock_bh(&fnhe_seqlock);
+				return;
+			}
 
-	fnhe->fnhe_next = hash->chain;
-	rcu_assign_pointer(hash->chain, fnhe);
+			fnhe->fnhe_next = hash->chain;
+			rcu_assign_pointer(hash->chain, fnhe);
+		}
+		fnhe->fnhe_daddr = daddr;
+		fnhe->fnhe_gw = gw;
+		fnhe->fnhe_pmtu = pmtu;
+		fnhe->fnhe_expires = expires;
+	}
 
-out_daddr:
-	fnhe->fnhe_daddr = daddr;
-out:
 	fnhe->fnhe_stamp = jiffies;
-	return fnhe;
+	write_sequnlock_bh(&fnhe_seqlock);
+	return;
+
+start:
+	spin_lock_bh(&fnhe_lock);
+	hash = nh->nh_exceptions;
+	if (hash) {
+		spin_unlock_bh(&fnhe_lock);
+		goto repeat;
+	}
+	nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
+				    GFP_ATOMIC);
+	if (!nh->nh_exceptions) {
+		spin_unlock_bh(&fnhe_lock);
+		return;
+	}
+	hash = nh->nh_exceptions;
+	spin_unlock_bh(&fnhe_lock);
+	goto repeat;
 }
 
 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
@@ -1452,13 +1482,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 		} else {
 			if (fib_lookup(net, fl4, &res) == 0) {
 				struct fib_nh *nh = &FIB_RES_NH(res);
-				struct fib_nh_exception *fnhe;
 
-				spin_lock_bh(&fnhe_lock);
-				fnhe = find_or_create_fnhe(nh, fl4->daddr);
-				if (fnhe)
-					fnhe->fnhe_gw = new_gw;
-				spin_unlock_bh(&fnhe_lock);
+				update_or_create_fnhe(nh, fl4->daddr, new_gw,
+						      0, 0);
 			}
 			rt->rt_gateway = new_gw;
 			rt->rt_flags |= RTCF_REDIRECTED;
@@ -1663,15 +1689,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 
 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
-		struct fib_nh_exception *fnhe;
 
-		spin_lock_bh(&fnhe_lock);
-		fnhe = find_or_create_fnhe(nh, fl4->daddr);
-		if (fnhe) {
-			fnhe->fnhe_pmtu = mtu;
-			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
-		}
-		spin_unlock_bh(&fnhe_lock);
+		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+				      jiffies + ip_rt_mtu_expires);
 	}
 	rt->rt_pmtu = mtu;
 	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
@@ -1898,6 +1918,7 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
 {
 	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
 	struct fib_nh_exception *fnhe;
+	unsigned int seq;
 	u32 hval;
 
 	hval = fnhe_hashfun(daddr);
@@ -1905,17 +1926,29 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 		if (fnhe->fnhe_daddr == daddr) {
-			if (fnhe->fnhe_pmtu) {
-				unsigned long expires = fnhe->fnhe_expires;
-				unsigned long diff = jiffies - expires;
+			__be32 fnhe_daddr, gw;
+			u32 pmtu;
+			unsigned long expires;
+
+			do {
+				seq = read_seqbegin(&fnhe_seqlock);
+				fnhe_daddr = fnhe->fnhe_daddr;
+				gw = fnhe->fnhe_gw;
+				pmtu = fnhe->fnhe_pmtu;
+				expires = fnhe->fnhe_expires;
+			} while (read_seqretry(&fnhe_seqlock, seq));
+			if (daddr != fnhe_daddr)
+				break;
+			if (pmtu) {
+				unsigned long diff = expires - jiffies;
 
 				if (time_before(jiffies, expires)) {
-					rt->rt_pmtu = fnhe->fnhe_pmtu;
+					rt->rt_pmtu = pmtu;
 					dst_set_expires(&rt->dst, diff);
 				}
 			}
-			if (fnhe->fnhe_gw)
-				rt->rt_gateway = fnhe->fnhe_gw;
+			if (gw)
+				rt->rt_gateway = gw;
 			fnhe->fnhe_stamp = jiffies;
 			break;
 		}
-- 
1.7.3.4

^ permalink raw reply related

* [RFC PATCH] net: cgroup: null ptr dereference in netprio cgroup during init
From: John Fastabend @ 2012-07-18  0:33 UTC (permalink / raw)
  To: davem, gaofeng, nhorman; +Cc: mark.d.rustad, netdev, eric.dumazet

When the netprio cgroup is built into the kernel, cgroup_init will call
cgrp_create, which eventually calls update_netdev_tables. This is
being called before do_initcalls(), so a null ptr dereference occurs
on init_net.

This patch adds a check on init_net.count to verify the structure
has been initialized. The failure was introduced here,

commit ef209f15980360f6945873df3cd710c5f62f2a3e
Author: Gao feng <gaofeng@cn.fujitsu.com>
Date:   Wed Jul 11 21:50:15 2012 +0000

    net: cgroup: fix access the unallocated memory in netprio cgroup

Tested with ping with netprio_cgroup as a module and built in.

Marked RFC for now; I think DaveM might have a reason why this needs
some improvement.

Reported-by: Mark Rustad <mark.d.rustad@intel.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 net/core/netprio_cgroup.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index b2e9caa..e9fd7fd 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -116,6 +116,9 @@ static int update_netdev_tables(void)
 	u32 max_len;
 	struct netprio_map *map;
 
+	if (!atomic_read(&init_net.count))
+		return ret;
+
 	rtnl_lock();
 	max_len = atomic_read(&max_prioidx) + 1;
 	for_each_netdev(&init_net, dev) {

^ permalink raw reply related

* Re: [GIT PULL net] IPVS
From: Simon Horman @ 2012-07-18  0:01 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer
In-Reply-To: <20120717101531.GD3812@1984>

On Tue, Jul 17, 2012 at 12:15:31PM +0200, Pablo Neira Ayuso wrote:
> On Tue, Jul 10, 2012 at 03:05:03PM +0200, Pablo Neira Ayuso wrote:
> > Hi Simon,
> > 
> > On Tue, Jul 10, 2012 at 06:20:03PM +0900, Simon Horman wrote:
> > > On Mon, Apr 30, 2012 at 11:27:22AM +0200, Pablo Neira Ayuso wrote:
> > > > On Fri, Apr 27, 2012 at 09:53:54AM +0900, Simon Horman wrote:
> > > > > Hi Pablo,
> > > > > 
> > > > > please consider the following 5 changes for 3.4, they are all bug fixes.
> > > > > I would also like these changes considered for stable.
> > > > 
> > > > Please, ping me again once these have hit Linus tree to ask for
> > > > -stable submission.
> > > 
> > > Sorry for letting this slip through the cracks.
> > > 
> > > Please consider the following commits which are in Linus's tree for stable.
> > > Or I can submit them directly if that is easier.
> > > 
> > > There are 7 patches listed below. The first 5 were the patches in this
> > > pull request. The last two were patches in a git pull request
> > > a few days earlier.
> > 
> > That's fine, I can make it, but you have to include what stable
> > releases this will be applied, eg. patch 1 to releases 3.4 and 3.2.
> > 
> > I think -stable maintainers will ask for that.
> 
> Ping?

Sorry, I haven't got to this yet.

> > > commit 8537de8a7ab6681cc72fb0411ab1ba7fdba62dd0
> > > Author: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > > Date:   Thu Apr 26 07:47:44 2012 +0200
> > > 
> > >     ipvs: kernel oops - do_ip_vs_get_ctl
> > >     
> > >     Change order of init so netns init is ready
> > >     when register ioctl and netlink.
> > >     
> > >     Ver2
> > >     	Whitespace fixes and __init added.
> > >     
> > >     Reported-by: "Ryan O'Hara" <rohara@redhat.com>
> > >     Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > >     Acked-by: Julian Anastasov <ja@ssi.bg>
> > >     Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > >     Signed-off-by: Simon Horman <horms@verge.net.au>
> > > 
> > > commit 582b8e3eadaec77788c1aa188081a8d5059c42a6
> > > Author: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > > Date:   Thu Apr 26 09:45:35 2012 +0200
> > > 
> > >     ipvs: take care of return value from protocol init_netns
> > >     
> > >     ip_vs_create_timeout_table() can return NULL
> > >     All functions protocol init_netns is affected of this patch.
> > >     
> > >     Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > >     Acked-by: Julian Anastasov <ja@ssi.bg>
> > >     Signed-off-by: Simon Horman <horms@verge.net.au>
> > > 
> > > commit 4b984cd50bc1b6d492175cd77bfabb78e76ffa67
> > > Author: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > > Date:   Thu Apr 26 09:45:34 2012 +0200
> > > 
> > >     ipvs: null check of net->ipvs in lblc(r) shedulers
> > >     
> > >     Avoid crash when registering shedulers after
> > >     the IPVS core initialization for netns fails. Do this by
> > >     checking for present core (net->ipvs).
> > >     
> > >     Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > >     Acked-by: Julian Anastasov <ja@ssi.bg>
> > >     Signed-off-by: Simon Horman <horms@verge.net.au>
> > > 
> > > commit 39f618b4fd95ae243d940ec64c961009c74e3333
> > > Author: Julian Anastasov <ja@ssi.bg>
> > > Date:   Wed Apr 25 00:29:58 2012 +0300
> > > 
> > >     ipvs: reset ipvs pointer in netns
> > >     
> > >     	Make sure net->ipvs is reset on netns cleanup or failed
> > >     initialization. It is needed for IPVS applications to know that
> > >     IPVS core is not loaded in netns.
> > >     
> > >     Signed-off-by: Julian Anastasov <ja@ssi.bg>
> > >     Acked-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > >     Signed-off-by: Simon Horman <horms@verge.net.au>
> > > 
> > > commit 8d08d71ce59438a6ef06be5db07966e0c144b74e
> > > Author: Julian Anastasov <ja@ssi.bg>
> > > Date:   Wed Apr 25 00:29:59 2012 +0300
> > > 
> > >     ipvs: add check in ftp for initialized core
> > >     
> > >     	Avoid crash when registering ip_vs_ftp after
> > >     the IPVS core initialization for netns fails. Do this by
> > >     checking for present core (net->ipvs).
> > >     
> > >     Signed-off-by: Julian Anastasov <ja@ssi.bg>
> > >     Acked-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > >     Signed-off-by: Simon Horman <horms@verge.net.au>
> > > 
> > > commit 8f9b9a2fad47af27e14b037395e03cd8278d96d7
> > > Author: Julian Anastasov <ja@ssi.bg>
> > > Date:   Fri Apr 13 18:08:43 2012 +0300
> > > 
> > >     ipvs: fix crash in ip_vs_control_net_cleanup on unload
> > >     
> > >     	commit 14e405461e664b777e2a5636e10b2ebf36a686ec (2.6.39)
> > >     ("Add __ip_vs_control_{init,cleanup}_sysctl()")
> > >     introduced regression due to wrong __net_init for
> > >     __ip_vs_control_cleanup_sysctl. This leads to crash when
> > >     the ip_vs module is unloaded.
> > >     
> > >     	Fix it by changing __net_init to __net_exit for
> > >     the function that is already renamed to ip_vs_control_net_cleanup_sysctl.
> > >     
> > >     Signed-off-by: Julian Anastasov <ja@ssi.bg>
> > >     Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
> > >     Signed-off-by: Simon Horman <horms@verge.net.au>
> > >     Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > > 
> > > commit 7118c07a844d367560ee91adb2071bde2fabcdbf
> > > Author: Sasha Levin <levinsasha928@gmail.com>
> > > Date:   Sat Apr 14 12:37:46 2012 -0400
> > > 
> > >     ipvs: Verify that IP_VS protocol has been registered
> > >     
> > >     The registration of a protocol might fail, there were no checks
> > >     and all registrations were assumed to be correct. This lead to
> > >     NULL ptr dereferences when apps tried registering.
> > >     
> > >     For example:
> > >     
> > >     [ 1293.226051] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
> > >     [ 1293.227038] IP: [<ffffffff822aacb0>] tcp_register_app+0x60/0xb0
> > >     [ 1293.227038] PGD 391de067 PUD 6c20b067 PMD 0
> > >     [ 1293.227038] Oops: 0000 [#1] PREEMPT SMP
> > >     [ 1293.227038] CPU 1
> > >     [ 1293.227038] Pid: 19609, comm: trinity Tainted: G        W    3.4.0-rc1-next-20120405-sasha-dirty #57
> > >     [ 1293.227038] RIP: 0010:[<ffffffff822aacb0>]  [<ffffffff822aacb0>] tcp_register_app+0x60/0xb0
> > >     [ 1293.227038] RSP: 0018:ffff880038c1dd18  EFLAGS: 00010286
> > >     [ 1293.227038] RAX: ffffffffffffffc0 RBX: 0000000000001500 RCX: 0000000000010000
> > >     [ 1293.227038] RDX: 0000000000000000 RSI: ffff88003a2d5888 RDI: 0000000000000282
> > >     [ 1293.227038] RBP: ffff880038c1dd48 R08: 0000000000000000 R09: 0000000000000000
> > >     [ 1293.227038] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003a2d5668
> > >     [ 1293.227038] R13: ffff88003a2d5988 R14: ffff8800696a8ff8 R15: 0000000000000000
> > >     [ 1293.227038] FS:  00007f01930d9700(0000) GS:ffff88007ce00000(0000) knlGS:0000000000000000
> > >     [ 1293.227038] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> > >     [ 1293.227038] CR2: 0000000000000018 CR3: 0000000065dfc000 CR4: 00000000000406e0
> > >     [ 1293.227038] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > >     [ 1293.227038] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> > >     [ 1293.227038] Process trinity (pid: 19609, threadinfo ffff880038c1c000, task ffff88002dc73000)
> > >     [ 1293.227038] Stack:
> > >     [ 1293.227038]  ffff880038c1dd48 00000000fffffff4 ffff8800696aada0 ffff8800694f5580
> > >     [ 1293.227038]  ffffffff8369f1e0 0000000000001500 ffff880038c1dd98 ffffffff822a716b
> > >     [ 1293.227038]  0000000000000000 ffff8800696a8ff8 0000000000000015 ffff8800694f5580
> > >     [ 1293.227038] Call Trace:
> > >     [ 1293.227038]  [<ffffffff822a716b>] ip_vs_app_inc_new+0xdb/0x180
> > >     [ 1293.227038]  [<ffffffff822a7258>] register_ip_vs_app_inc+0x48/0x70
> > >     [ 1293.227038]  [<ffffffff822b2fea>] __ip_vs_ftp_init+0xba/0x140
> > >     [ 1293.227038]  [<ffffffff821c9060>] ops_init+0x80/0x90
> > >     [ 1293.227038]  [<ffffffff821c90cb>] setup_net+0x5b/0xe0
> > >     [ 1293.227038]  [<ffffffff821c9416>] copy_net_ns+0x76/0x100
> > >     [ 1293.227038]  [<ffffffff810dc92b>] create_new_namespaces+0xfb/0x190
> > >     [ 1293.227038]  [<ffffffff810dca21>] unshare_nsproxy_namespaces+0x61/0x80
> > >     [ 1293.227038]  [<ffffffff810afd1f>] sys_unshare+0xff/0x290
> > >     [ 1293.227038]  [<ffffffff8187622e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
> > >     [ 1293.227038]  [<ffffffff82665539>] system_call_fastpath+0x16/0x1b
> > >     [ 1293.227038] Code: 89 c7 e8 34 91 3b 00 89 de 66 c1 ee 04 31 de 83 e6 0f 48 83 c6 22 48 c1 e6 04 4a 8b 14 26 49 8d 34 34 48 8d 42 c0 48 39 d6 74 13 <66> 39 58 58 74 22 48 8b 48 40 48 8d 41 c0 48 39 ce 75 ed 49 8d
> > >     [ 1293.227038] RIP  [<ffffffff822aacb0>] tcp_register_app+0x60/0xb0
> > >     [ 1293.227038]  RSP <ffff880038c1dd18>
> > >     [ 1293.227038] CR2: 0000000000000018
> > >     [ 1293.379284] ---[ end trace 364ab40c7011a009 ]---
> > >     [ 1293.381182] Kernel panic - not syncing: Fatal exception in interrupt
> > >     
> > >     Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> > >     Acked-by: Julian Anastasov <ja@ssi.bg>
> > >     Signed-off-by: Simon Horman <horms@verge.net.au>
> > >     Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > > 
> 

^ permalink raw reply

* Re: [RFC] r8169 : why SG / TX checksum are default disabled
From: Francois Romieu @ 2012-07-17 23:40 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Hayes Wang
In-Reply-To: <1342564781.2626.1264.camel@edumazet-glaptop>

Eric Dumazet <eric.dumazet@gmail.com> :
[...]
> I was wondering why Scatter Gather (NETIF_F_SG) and TX checksum
> (NETIF_F_IP_CSUM) were disabled by default on r8169.

It was not unconditionally stable. For a number of reasons:
- incompatibility with jumbo frames.
  rtl_chip_infos now allows to take care of it
- lack of support for chipset dependent Tx descriptor 
  rtl_tx_desc_info handles it. There are yet a few unused ipv6 bits. No idea
  how useable they are.
- some false alarms due to different issues and probably some others
  I could hunt for in the archives.

> Couldnt we enable them by default, maybe for a whitelist of "good"
> nics ?

Sure.

Hayes, should we not add into the kernel driver something similar to
the rtl8168_start_xmit::skb_checksum_help stuff in Realtek's 8168 driver ?
There seems to be a bug for (skb->len < 60 && RTL_GIGA_MAC_VER_34).

> (I found that activating them with ethtool automatically enables GSO,
>  and performance with GSO is not good)

It's still an improvement though, isn't it ?

-- 
Ueimor

^ permalink raw reply

* Re: That's pretty much it for 3.5.0
From: John Fastabend @ 2012-07-17 23:27 UTC (permalink / raw)
  To: David Miller
  Cc: mark.d.rustad-ral2JQCrhuEAvxtiuMwx3w,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20120717.151832.1306978935355646723.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

On 7/17/2012 3:18 PM, David Miller wrote:
> From: John Fastabend <john.r.fastabend-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> Date: Tue, 17 Jul 2012 15:13:36 -0700
>
>> Perhaps the easiest way is to check net->count this should be zero
>> until setup_net is called.
>>
>> if (!atomic_read(&init_net.count))
>> 	return ret;
>>
>
> Won't work, setup_net() runs via a pure_initcall().
>

Why not? I must have missed something. cgroup_init() and
cgroup_early_init() both run before _initcall() routines are
called via kernel_init() so this will stop the update in
netprio from occurring.

And I don't see any race elsewhere for this.
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next 0/2] Pull request for 'davem-next.r8169' branch
From: Joe Perches @ 2012-07-17 22:46 UTC (permalink / raw)
  To: Francois Romieu; +Cc: netdev, David Miller, Hayes Wang
In-Reply-To: <cover.1342562326.git.romieu@fr.zoreil.com>

On Wed, 2012-07-18 at 00:09 +0200, Francois Romieu wrote:
> diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c

Hello Francois, just a bit of trivia:

> @@ -865,7 +865,8 @@ static bool rtl_loop_wait(struct rtl8169_private *tp, const struct rtl_cond *c,
>  		if (c->check(tp) == high)
>  			return true;
>  	}
> -	netif_err(tp, drv, tp->dev, c->msg);
> +	netif_err(tp, drv, tp->dev, "%s == %d (loop: %d, delay: %d).\n",
> +		  c->msg, !high, n, d);

Please avoid adding the period to a message before a newline.

$ git grep -E "[^\.]\\\\n\"" drivers/net/ethernet/realtek | wc -l
113
$ git grep -E "\.\\\\n\"" drivers/net/ethernet/realtek | wc -l
12

^ permalink raw reply

* [RFC] r8169 : why SG / TX checksum are default disabled
From: Eric Dumazet @ 2012-07-17 22:39 UTC (permalink / raw)
  To: Francois Romieu; +Cc: netdev, Hayes Wang

Hi Francois

I was wondering why Scatter Gather (NETIF_F_SG) and TX checksum
(NETIF_F_IP_CSUM) were disabled by default on r8169.

Couldn't we enable them by default, maybe for a whitelist of "good"
nics ?

(I found that activating them with ethtool automatically enables GSO,
 and performance with GSO is not good)

Thanks

^ permalink raw reply

* [PATCH net-next] tcp: refine SYN handling in tcp_validate_incoming
From: Eric Dumazet @ 2012-07-17 22:29 UTC (permalink / raw)
  To: Vijay Subramanian; +Cc: David Miller, netdev, Kiran Kumar Kella
In-Reply-To: <1342563706.2626.1228.camel@edumazet-glaptop>

From: Eric Dumazet <edumazet@google.com>

Followup of commit 0c24604b68fc (tcp: implement RFC 5961 4.2)

As reported by Vijay Subramanian, we should send a challenge ACK
instead of a dup ack if a SYN flag is set on a packet received out of
window.

This permits the ratelimiting to work as intended, and to increase
correct SNMP counters.

Suggested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Kiran Kumar Kella <kkiran@broadcom.com>
---
 net/ipv4/tcp_input.c |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8aaec55..fdd49f1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5296,8 +5296,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 		 * an acknowledgment should be sent in reply (unless the RST
 		 * bit is set, if so drop the segment and return)".
 		 */
-		if (!th->rst)
+		if (!th->rst) {
+			if (th->syn)
+				goto syn_challenge;
 			tcp_send_dupack(sk, skb);
+		}
 		goto discard;
 	}
 
@@ -5327,6 +5330,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 	 * RFC 5691 4.2 : Send a challenge ack
 	 */
 	if (th->syn) {
+syn_challenge:
 		if (syn_inerr)
 			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);

^ permalink raw reply related

* [PATCH net-next 2/2] r8169: verbose error message.
From: Francois Romieu @ 2012-07-17 22:09 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hayes Wang
In-Reply-To: <cover.1342562326.git.romieu@fr.zoreil.com>

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
Cc: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/ethernet/realtek/r8169.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 1f27318..be4e00f 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -865,7 +865,8 @@ static bool rtl_loop_wait(struct rtl8169_private *tp, const struct rtl_cond *c,
 		if (c->check(tp) == high)
 			return true;
 	}
-	netif_err(tp, drv, tp->dev, c->msg);
+	netif_err(tp, drv, tp->dev, "%s == %d (loop: %d, delay: %d).\n",
+		  c->msg, !high, n, d);
 	return false;
 }
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH net-next 1/2] r8169: remove rtl_ocpdr_cond.
From: Francois Romieu @ 2012-07-17 22:09 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hayes Wang
In-Reply-To: <cover.1342562326.git.romieu@fr.zoreil.com>

From: Hayes Wang <hayeswang@realtek.com>

It is not needed for mac_ocp_{write / read}. Actually bit 31 of OCPDR
does not change and r8168_mac_ocp_read always returns ~0.

Signed-off-by: Hayes Wang <hayeswang@realtek.com>
Tested-by: Francois Romieu <romieu@fr.zoreil.com>
---
 drivers/net/ethernet/realtek/r8169.c |   12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index c29c5fb..1f27318 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -1043,13 +1043,6 @@ static void rtl_w1w0_phy_ocp(struct rtl8169_private *tp, int reg, int p, int m)
 	r8168_phy_ocp_write(tp, reg, (val | p) & ~m);
 }
 
-DECLARE_RTL_COND(rtl_ocpdr_cond)
-{
-	void __iomem *ioaddr = tp->mmio_addr;
-
-	return RTL_R32(OCPDR) & OCPAR_FLAG;
-}
-
 static void r8168_mac_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data)
 {
 	void __iomem *ioaddr = tp->mmio_addr;
@@ -1058,8 +1051,6 @@ static void r8168_mac_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data)
 		return;
 
 	RTL_W32(OCPDR, OCPAR_FLAG | (reg << 15) | data);
-
-	rtl_udelay_loop_wait_low(tp, &rtl_ocpdr_cond, 25, 10);
 }
 
 static u16 r8168_mac_ocp_read(struct rtl8169_private *tp, u32 reg)
@@ -1071,8 +1062,7 @@ static u16 r8168_mac_ocp_read(struct rtl8169_private *tp, u32 reg)
 
 	RTL_W32(OCPDR, reg << 15);
 
-	return rtl_udelay_loop_wait_high(tp, &rtl_ocpdr_cond, 25, 10) ?
-		RTL_R32(OCPDR) : ~0;
+	return RTL_R32(OCPDR);
 }
 
 #define OCP_STD_PHY_BASE	0xa400
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH net-next 0/2] Pull request for 'davem-next.r8169' branch
From: Francois Romieu @ 2012-07-17 22:09 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hayes Wang

Please pull from branch 'davem-next.r8169' in repository

git://violet.fr.zoreil.com/romieu/linux davem-next.r8169

to get the changes below.

Distance from 'davem-next' (5abf7f7e0f6bdbfcac737f636497d7016d9507eb)
---------------------------------------------------------------------

82e316efbd1c68946c8760f930b81d73e9c4425a
3a83ad12b850c3c5b89fa9008bdd0c0782f0cf68

Diffstat
--------

 drivers/net/ethernet/realtek/r8169.c |   15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

Shortlog
--------

Francois Romieu (1):
      r8169: verbose error message.

Hayes Wang (1):
      r8169: remove rtl_ocpdr_cond.

Patch
-----

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index c29c5fb..be4e00f 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -865,7 +865,8 @@ static bool rtl_loop_wait(struct rtl8169_private *tp, const struct rtl_cond *c,
 		if (c->check(tp) == high)
 			return true;
 	}
-	netif_err(tp, drv, tp->dev, c->msg);
+	netif_err(tp, drv, tp->dev, "%s == %d (loop: %d, delay: %d).\n",
+		  c->msg, !high, n, d);
 	return false;
 }
 
@@ -1043,13 +1044,6 @@ static void rtl_w1w0_phy_ocp(struct rtl8169_private *tp, int reg, int p, int m)
 	r8168_phy_ocp_write(tp, reg, (val | p) & ~m);
 }
 
-DECLARE_RTL_COND(rtl_ocpdr_cond)
-{
-	void __iomem *ioaddr = tp->mmio_addr;
-
-	return RTL_R32(OCPDR) & OCPAR_FLAG;
-}
-
 static void r8168_mac_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data)
 {
 	void __iomem *ioaddr = tp->mmio_addr;
@@ -1058,8 +1052,6 @@ static void r8168_mac_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data)
 		return;
 
 	RTL_W32(OCPDR, OCPAR_FLAG | (reg << 15) | data);
-
-	rtl_udelay_loop_wait_low(tp, &rtl_ocpdr_cond, 25, 10);
 }
 
 static u16 r8168_mac_ocp_read(struct rtl8169_private *tp, u32 reg)
@@ -1071,8 +1063,7 @@ static u16 r8168_mac_ocp_read(struct rtl8169_private *tp, u32 reg)
 
 	RTL_W32(OCPDR, reg << 15);
 
-	return rtl_udelay_loop_wait_high(tp, &rtl_ocpdr_cond, 25, 10) ?
-		RTL_R32(OCPDR) : ~0;
+	return RTL_R32(OCPDR);
 }
 
 #define OCP_STD_PHY_BASE	0xa400
-- 
Ueimor

^ permalink raw reply related

* Re: [PATCH net-next] tcp: implement RFC 5961 4.2
From: Eric Dumazet @ 2012-07-17 22:21 UTC (permalink / raw)
  To: Vijay Subramanian; +Cc: David Miller, netdev, Kiran Kumar Kella
In-Reply-To: <CAGK4HS8MvP6L5Rvy4wJx2KhdTSSHfP7YPT44e9mrV_vsBZJ9jQ@mail.gmail.com>

On Tue, 2012-07-17 at 15:10 -0700, Vijay Subramanian wrote:

> Yes. This is what I had in mind (along with a change to make
> tcp_sequence() return bool). I am not sure if this patch is official
> (or will be picked up by patchwork) but
> for what its worth
> 
> Acked-by: Vijay Subramanian <subramanian.vijay@gmail.com>

Well, I am going to send an official patch with all credits ASAP,

Thanks !

^ permalink raw reply

* [PATCH net-next] bonding: refine IFF_XMIT_DST_RELEASE capability
From: Eric Dumazet @ 2012-07-17 22:19 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Jay Vosburgh, Andy Gospodarek, Tom Herbert

From: Eric Dumazet <edumazet@google.com>

Some workloads greatly benefit from the IFF_XMIT_DST_RELEASE capability
on the output net device, avoiding dirtying the dst refcount.

bonding currently disables IFF_XMIT_DST_RELEASE unconditionally.

If all slaves have the IFF_XMIT_DST_RELEASE bit set, then the
bonding master can also have it in its priv_flags.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jay Vosburgh <fubar@us.ibm.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Tom Herbert <therbert@google.com>
---
 drivers/net/bonding/bond_main.c |    5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1eb3979..3960b1b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1382,6 +1382,7 @@ static void bond_compute_features(struct bonding *bond)
 	netdev_features_t vlan_features = BOND_VLAN_FEATURES;
 	unsigned short max_hard_header_len = ETH_HLEN;
 	int i;
+	unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE;
 
 	read_lock(&bond->lock);
 
@@ -1392,6 +1393,7 @@ static void bond_compute_features(struct bonding *bond)
 		vlan_features = netdev_increment_features(vlan_features,
 			slave->dev->vlan_features, BOND_VLAN_FEATURES);
 
+		dst_release_flag &= slave->dev->priv_flags;
 		if (slave->dev->hard_header_len > max_hard_header_len)
 			max_hard_header_len = slave->dev->hard_header_len;
 	}
@@ -1400,6 +1402,9 @@ done:
 	bond_dev->vlan_features = vlan_features;
 	bond_dev->hard_header_len = max_hard_header_len;
 
+	flags = bond_dev->priv_flags & ~IFF_XMIT_DST_RELEASE;
+	bond_dev->priv_flags = flags | dst_release_flag;
+
 	read_unlock(&bond->lock);
 
 	netdev_change_features(bond_dev);

^ permalink raw reply related

* Re: That's pretty much it for 3.5.0
From: David Miller @ 2012-07-17 22:18 UTC (permalink / raw)
  To: john.r.fastabend; +Cc: mark.d.rustad, netdev, linux-wireless, netfilter-devel
In-Reply-To: <5005E390.7020706@intel.com>

From: John Fastabend <john.r.fastabend@intel.com>
Date: Tue, 17 Jul 2012 15:13:36 -0700

> Perhaps the easiest way is to check net->count; this should be zero
> until setup_net is called.
> 
> if (!atomic_read(&init_net.count))
> 	return ret;
> 

Won't work, setup_net() runs via a pure_initcall().

^ permalink raw reply

* Re: That's pretty much it for 3.5.0
From: John Fastabend @ 2012-07-17 22:13 UTC (permalink / raw)
  To: David Miller; +Cc: mark.d.rustad, netdev, linux-wireless, netfilter-devel
In-Reply-To: <20120717.140241.1599386555723262095.davem@davemloft.net>

On 7/17/2012 2:02 PM, David Miller wrote:
> From: John Fastabend <john.r.fastabend@intel.com>
> Date: Tue, 17 Jul 2012 13:50:16 -0700
>
>> On 7/17/2012 12:24 PM, David Miller wrote:
>>> From: John Fastabend <john.r.fastabend@intel.com>
>>> Date: Tue, 17 Jul 2012 12:09:53 -0700
>>>
>>>> although we don't have an early_init hook for netprio_cgroup so this
>>>> is probably not correct.
>>>
>>> The dependency is actually on net_dev_init (a subsys_initcall) rather
>>> than a pure_initcall.
>>>
>>> net_dev_init is what registers the netdev_net_ops, which in turn
>>> initializes the netdev list in namespaces such as &init_net
>>>
>>
>> Ah, right, thanks. Sorry for the thrash. I guess we need to check if the
>> netdev list in the init_net namespace is initialized.
>
> It's a hack, but we could export and then test dev_boot_phase == 0,
> and if that test is true then skip the init_net device walk in the
> cgroup code.
>
> But I don't like that very much.
>
> The things this code cares about can't even be an issue until
> net_dev_init() runs.
>
> There is a comment warning not to do this in linux/init.h, but we
> could change the module_init() in netprio_cgroup.c to some level which
> runs after subsys_initcall().  When built as a module, linux/init.h
> will translate this into module_init() which is basically the behavior
> we want.
>

Perhaps the easiest way is to check net->count; this should be zero
until setup_net is called.

if (!atomic_read(&init_net.count))
	return ret;

^ permalink raw reply

* Re: [PATCH net-next] tcp: implement RFC 5961 4.2
From: Vijay Subramanian @ 2012-07-17 22:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Kiran Kumar Kella
In-Reply-To: <1342560769.2626.1165.camel@edumazet-glaptop>

> But you probably are right, we could test th->syn here as well.
>
> Something like that ?

> -               if (!th->rst)
> +               if (!th->rst) {
> +                       if (th->syn)
> +                               goto syn_challenge;
>                         tcp_send_dupack(sk, skb);
> +               }
>                 goto discard;
>         }
>
> @@ -5327,6 +5330,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
>          * RFC 5691 4.2 : Send a challenge ack
>          */
>         if (th->syn) {
> +syn_challenge:
>                 if (syn_inerr)
>                         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
>                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
>

Yes. This is what I had in mind (along with a change to make
tcp_sequence() return bool). I am not sure if this patch is official
(or will be picked up by patchwork) but
for what it's worth

Acked-by: Vijay Subramanian <subramanian.vijay@gmail.com>

I will send a separate patch to make tcp_sequence() return bool.
Thanks!
Vijay

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox