Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next-2.6 PATCH 11/20] igb: add a flags value to the ring
From: Jeff Kirsher @ 2009-10-28  1:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds a flags value to the ring that cleans up some of the last
remaining items from the ring in order to help seperate it from the adapter
struct.  By implementing these flags it becomes possible for different rings
to support different functions such as rx checksumming.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h         |   12 ++++-
 drivers/net/igb/igb_ethtool.c |   13 ++++--
 drivers/net/igb/igb_main.c    |   93 +++++++++++++++++++----------------------
 3 files changed, 60 insertions(+), 58 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index 6a67fa2..0c30c5e 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -192,6 +192,8 @@ struct igb_ring {
 	unsigned int total_bytes;
 	unsigned int total_packets;
 
+	u32 flags;
+
 	union {
 		/* TX */
 		struct {
@@ -206,6 +208,13 @@ struct igb_ring {
 	};
 };
 
+#define IGB_RING_FLAG_RX_CSUM        0x00000001 /* RX CSUM enabled */
+#define IGB_RING_FLAG_RX_SCTP_CSUM   0x00000002 /* SCTP CSUM offload enabled */
+
+#define IGB_RING_FLAG_TX_CTX_IDX     0x00000001 /* HW requires context index */
+
+#define IGB_ADVTXD_DCMD (E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS)
+
 #define E1000_RX_DESC_ADV(R, i)	    \
 	(&(((union e1000_adv_rx_desc *)((R).desc))[i]))
 #define E1000_TX_DESC_ADV(R, i)	    \
@@ -245,7 +254,6 @@ struct igb_adapter {
 	/* TX */
 	struct igb_ring *tx_ring;      /* One per active queue */
 	unsigned long tx_queue_len;
-	u32 txd_cmd;
 	u32 gotc;
 	u64 gotc_old;
 	u64 tpt_old;
@@ -303,8 +311,6 @@ struct igb_adapter {
 #define IGB_FLAG_HAS_MSI           (1 << 0)
 #define IGB_FLAG_DCA_ENABLED       (1 << 1)
 #define IGB_FLAG_QUAD_PORT_A       (1 << 2)
-#define IGB_FLAG_NEED_CTX_IDX      (1 << 3)
-#define IGB_FLAG_RX_CSUM_DISABLED  (1 << 4)
 
 enum e1000_state_t {
 	__IGB_TESTING,
diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index f62430b..c44dede 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -279,17 +279,20 @@ static int igb_set_pauseparam(struct net_device *netdev,
 static u32 igb_get_rx_csum(struct net_device *netdev)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
-	return !(adapter->flags & IGB_FLAG_RX_CSUM_DISABLED);
+	return !!(adapter->rx_ring[0].flags & IGB_RING_FLAG_RX_CSUM);
 }
 
 static int igb_set_rx_csum(struct net_device *netdev, u32 data)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
+	int i;
 
-	if (data)
-		adapter->flags &= ~IGB_FLAG_RX_CSUM_DISABLED;
-	else
-		adapter->flags |= IGB_FLAG_RX_CSUM_DISABLED;
+	for (i = 0; i < adapter->num_rx_queues; i++) {
+		if (data)
+			adapter->rx_ring[i].flags |= IGB_RING_FLAG_RX_CSUM;
+		else
+			adapter->rx_ring[i].flags &= ~IGB_RING_FLAG_RX_CSUM;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index bdd7bf0..00f3f2d 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -437,13 +437,21 @@ static int igb_alloc_queues(struct igb_adapter *adapter)
 		ring->count = adapter->tx_ring_count;
 		ring->queue_index = i;
 		ring->pdev = adapter->pdev;
+		/* For 82575, context index must be unique per ring. */
+		if (adapter->hw.mac.type == e1000_82575)
+			ring->flags = IGB_RING_FLAG_TX_CTX_IDX;
 	}
+
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		struct igb_ring *ring = &(adapter->rx_ring[i]);
 		ring->count = adapter->rx_ring_count;
 		ring->queue_index = i;
 		ring->pdev = adapter->pdev;
 		ring->rx_buffer_len = MAXIMUM_ETHERNET_VLAN_SIZE;
+		ring->flags = IGB_RING_FLAG_RX_CSUM; /* enable rx checksum */
+		/* set flag indicating ring supports SCTP checksum offload */
+		if (adapter->hw.mac.type >= e1000_82576)
+			ring->flags |= IGB_RING_FLAG_RX_SCTP_CSUM;
 	}
 
 	igb_cache_ring_register(adapter);
@@ -1517,16 +1525,6 @@ static int __devinit igb_probe(struct pci_dev *pdev,
 
 	igb_get_bus_info_pcie(hw);
 
-	/* set flags */
-	switch (hw->mac.type) {
-	case e1000_82575:
-		adapter->flags |= IGB_FLAG_NEED_CTX_IDX;
-		break;
-	case e1000_82576:
-	default:
-		break;
-	}
-
 	hw->phy.autoneg_wait_to_complete = false;
 	hw->mac.adaptive_ifs = true;
 
@@ -2149,9 +2147,6 @@ static void igb_configure_tx(struct igb_adapter *adapter)
 
 	for (i = 0; i < adapter->num_tx_queues; i++)
 		igb_configure_tx_ring(adapter, &adapter->tx_ring[i]);
-
-	/* Setup Transmit Descriptor Settings for eop descriptor */
-	adapter->txd_cmd = E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS;
 }
 
 /**
@@ -3272,8 +3267,7 @@ set_itr_now:
 #define IGB_TX_FLAGS_VLAN_MASK	0xffff0000
 #define IGB_TX_FLAGS_VLAN_SHIFT	16
 
-static inline int igb_tso_adv(struct igb_adapter *adapter,
-			      struct igb_ring *tx_ring,
+static inline int igb_tso_adv(struct igb_ring *tx_ring,
 			      struct sk_buff *skb, u32 tx_flags, u8 *hdr_len)
 {
 	struct e1000_adv_tx_context_desc *context_desc;
@@ -3335,8 +3329,8 @@ static inline int igb_tso_adv(struct igb_adapter *adapter,
 	mss_l4len_idx |= (l4len << E1000_ADVTXD_L4LEN_SHIFT);
 
 	/* For 82575, context index must be unique per ring. */
-	if (adapter->flags & IGB_FLAG_NEED_CTX_IDX)
-		mss_l4len_idx |= tx_ring->queue_index << 4;
+	if (tx_ring->flags & IGB_RING_FLAG_TX_CTX_IDX)
+		mss_l4len_idx |= tx_ring->reg_idx << 4;
 
 	context_desc->mss_l4len_idx = cpu_to_le32(mss_l4len_idx);
 	context_desc->seqnum_seed = 0;
@@ -3353,9 +3347,8 @@ static inline int igb_tso_adv(struct igb_adapter *adapter,
 	return true;
 }
 
-static inline bool igb_tx_csum_adv(struct igb_adapter *adapter,
-					struct igb_ring *tx_ring,
-					struct sk_buff *skb, u32 tx_flags)
+static inline bool igb_tx_csum_adv(struct igb_ring *tx_ring,
+				   struct sk_buff *skb, u32 tx_flags)
 {
 	struct e1000_adv_tx_context_desc *context_desc;
 	struct pci_dev *pdev = tx_ring->pdev;
@@ -3417,11 +3410,9 @@ static inline bool igb_tx_csum_adv(struct igb_adapter *adapter,
 
 		context_desc->type_tucmd_mlhl = cpu_to_le32(tu_cmd);
 		context_desc->seqnum_seed = 0;
-		if (adapter->flags & IGB_FLAG_NEED_CTX_IDX)
+		if (tx_ring->flags & IGB_RING_FLAG_TX_CTX_IDX)
 			context_desc->mss_l4len_idx =
-				cpu_to_le32(tx_ring->queue_index << 4);
-		else
-			context_desc->mss_l4len_idx = 0;
+				cpu_to_le32(tx_ring->reg_idx << 4);
 
 		buffer_info->time_stamp = jiffies;
 		buffer_info->next_to_watch = i;
@@ -3492,8 +3483,7 @@ static inline int igb_tx_map_adv(struct igb_ring *tx_ring, struct sk_buff *skb,
 	return count + 1;
 }
 
-static inline void igb_tx_queue_adv(struct igb_adapter *adapter,
-				    struct igb_ring *tx_ring,
+static inline void igb_tx_queue_adv(struct igb_ring *tx_ring,
 				    int tx_flags, int count, u32 paylen,
 				    u8 hdr_len)
 {
@@ -3525,10 +3515,11 @@ static inline void igb_tx_queue_adv(struct igb_adapter *adapter,
 		olinfo_status |= E1000_TXD_POPTS_TXSM << 8;
 	}
 
-	if ((adapter->flags & IGB_FLAG_NEED_CTX_IDX) &&
-	    (tx_flags & (IGB_TX_FLAGS_CSUM | IGB_TX_FLAGS_TSO |
+	if ((tx_ring->flags & IGB_RING_FLAG_TX_CTX_IDX) &&
+	    (tx_flags & (IGB_TX_FLAGS_CSUM |
+	                 IGB_TX_FLAGS_TSO |
 			 IGB_TX_FLAGS_VLAN)))
-		olinfo_status |= tx_ring->queue_index << 4;
+		olinfo_status |= tx_ring->reg_idx << 4;
 
 	olinfo_status |= ((paylen - hdr_len) << E1000_ADVTXD_PAYLEN_SHIFT);
 
@@ -3545,7 +3536,7 @@ static inline void igb_tx_queue_adv(struct igb_adapter *adapter,
 			i = 0;
 	}
 
-	tx_desc->read.cmd_type_len |= cpu_to_le32(adapter->txd_cmd);
+	tx_desc->read.cmd_type_len |= cpu_to_le32(IGB_ADVTXD_DCMD);
 	/* Force memory writes to complete before letting h/w
 	 * know there are new descriptors to fetch.  (Only
 	 * applicable for weak-ordered memory model archs,
@@ -3644,17 +3635,17 @@ static netdev_tx_t igb_xmit_frame_ring_adv(struct sk_buff *skb,
 		tx_flags |= IGB_TX_FLAGS_IPV4;
 
 	first = tx_ring->next_to_use;
-	tso = skb_is_gso(skb) ? igb_tso_adv(adapter, tx_ring, skb, tx_flags,
-					      &hdr_len) : 0;
-
-	if (tso < 0) {
-		dev_kfree_skb_any(skb);
-		return NETDEV_TX_OK;
+	if (skb_is_gso(skb)) {
+		tso = igb_tso_adv(tx_ring, skb, tx_flags, &hdr_len);
+		if (tso < 0) {
+			dev_kfree_skb_any(skb);
+			return NETDEV_TX_OK;
+		}
 	}
 
 	if (tso)
 		tx_flags |= IGB_TX_FLAGS_TSO;
-	else if (igb_tx_csum_adv(adapter, tx_ring, skb, tx_flags) &&
+	else if (igb_tx_csum_adv(tx_ring, skb, tx_flags) &&
 	         (skb->ip_summed == CHECKSUM_PARTIAL))
 		tx_flags |= IGB_TX_FLAGS_CSUM;
 
@@ -3664,17 +3655,18 @@ static netdev_tx_t igb_xmit_frame_ring_adv(struct sk_buff *skb,
 	 */
 	count = igb_tx_map_adv(tx_ring, skb, first);
 
-	if (count) {
-		igb_tx_queue_adv(adapter, tx_ring, tx_flags, count,
-			         skb->len, hdr_len);
-		/* Make sure there is space in the ring for the next send. */
-		igb_maybe_stop_tx(netdev, tx_ring, MAX_SKB_FRAGS + 4);
-	} else {
+	if (!count) {
 		dev_kfree_skb_any(skb);
 		tx_ring->buffer_info[first].time_stamp = 0;
 		tx_ring->next_to_use = first;
+		return NETDEV_TX_OK;
 	}
 
+	igb_tx_queue_adv(tx_ring, tx_flags, count, skb->len, hdr_len);
+
+	/* Make sure there is space in the ring for the next send. */
+	igb_maybe_stop_tx(netdev, tx_ring, MAX_SKB_FRAGS + 4);
+
 	return NETDEV_TX_OK;
 }
 
@@ -4800,15 +4792,15 @@ static void igb_receive_skb(struct igb_q_vector *q_vector,
 }
 
 static inline void igb_rx_checksum_adv(struct igb_ring *ring,
-                                       struct igb_adapter *adapter,
 				       u32 status_err, struct sk_buff *skb)
 {
 	skb->ip_summed = CHECKSUM_NONE;
 
 	/* Ignore Checksum bit is set or checksum is disabled through ethtool */
-	if ((status_err & E1000_RXD_STAT_IXSM) ||
-	    (adapter->flags & IGB_FLAG_RX_CSUM_DISABLED))
+	if (!(ring->flags & IGB_RING_FLAG_RX_CSUM) ||
+	     (status_err & E1000_RXD_STAT_IXSM))
 		return;
+
 	/* TCP/UDP checksum error bit is set */
 	if (status_err &
 	    (E1000_RXDEXT_STATERR_TCPE | E1000_RXDEXT_STATERR_IPE)) {
@@ -4817,9 +4809,10 @@ static inline void igb_rx_checksum_adv(struct igb_ring *ring,
 		 * L4E bit is set incorrectly on 64 byte (60 byte w/o crc)
 		 * packets, (aka let the stack check the crc32c)
 		 */
-		if (!((adapter->hw.mac.type == e1000_82576) &&
-		      (skb->len == 60)))
+		if ((skb->len == 60) &&
+		    (ring->flags & IGB_RING_FLAG_RX_SCTP_CSUM))
 			ring->rx_stats.csum_err++;
+
 		/* let the stack verify checksum errors */
 		return;
 	}
@@ -4827,7 +4820,7 @@ static inline void igb_rx_checksum_adv(struct igb_ring *ring,
 	if (status_err & (E1000_RXD_STAT_TCPCS | E1000_RXD_STAT_UDPCS))
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-	dev_dbg(&adapter->pdev->dev, "cksum success: bits %08X\n", status_err);
+	dev_dbg(&ring->pdev->dev, "cksum success: bits %08X\n", status_err);
 }
 
 static inline u16 igb_get_hlen(struct igb_ring *rx_ring,
@@ -4978,7 +4971,7 @@ send_up:
 		total_bytes += skb->len;
 		total_packets++;
 
-		igb_rx_checksum_adv(rx_ring, adapter, staterr, skb);
+		igb_rx_checksum_adv(rx_ring, staterr, skb);
 
 		skb->protocol = eth_type_trans(skb, netdev);
 		skb_record_rx_queue(skb, rx_ring->queue_index);


^ permalink raw reply related

* Re: [PATCH] dcache: better name hash function
From: Stephen Hemminger @ 2009-10-28  1:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Stephen Hemminger, Andrew Morton, Octavian Purdila,
	netdev, linux-kernel, Al Viro
In-Reply-To: <alpine.LFD.2.01.0910271744560.31845@localhost.localdomain>

On Tue, 27 Oct 2009 17:58:53 -0700 (PDT)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Tue, 27 Oct 2009, Stephen Hemminger wrote:
> > 
> > Agreed. Here is the reduced version of the program.
> > To run:
> >   find /home -printf '%f\n' 2>/dev/null | ./htest -n 100
> 
> The timings are very sensitive to random I$ layout at least on Nehalem. 
> The reason seems to be that the inner loop is _so_ tight that just 
> depending on exactly where the loop ends up, you can get subtle 
> interactions with the loop cache.
> 
> Look here:
> 
> 	[torvalds@nehalem ~]$ find /home -printf '%f\n' 2>/dev/null | ./htest -n 100
> 	Algorithm             Time       Ratio       Max   StdDev
> 	full_name_hash       1.141899     1.03      4868 263.37
> 	djb2                 0.980200     1.03      4835 266.05
> 	string10             0.909175     1.03      4850 262.67
> 	string10a            0.673915     1.03      4850 262.67
> 	string10b            0.909374     1.03      4850 262.67
> 	string_hash17        0.966050     1.03      4805 263.68
> 	string_hash31        1.008544     1.03      4807 259.37
> 	fnv32                0.774806     1.03      4817 259.17
> 
> what do you think the difference between 'string10', 'string10a' and 
> 'string10b' are?
> 
> None. None what-so-ever. The source code is identical, and gcc generates 
> identical assembly language. Yet those timings are extremely stable for 
> me, and 'string10b' is 25% faster than the identical string10 and 
> string10a functions.
> 
> The only difference? 'string10a' starts aligned to just 16 bytes, but that 
> in turn happens to mean that the tight inner loop ends up aligned on a 
> 128-byte boundary. And being cacheline aligned just there seems to matters 
> for some subtle micro-architectural reason.
> 
> The reason I noticed this is that I wondered what small modifications to 
> 'string10' would do for performance, and noticed that even _without_ the 
> small modifications, performance fluctuated.
> 
> Lesson? Microbenchmarks like this can be dangerous and misleading. That's 
> _especially_ true if the loop ends up being just tight enough that it can 
> fit in some trace cache or similar. In real life, the name hash is 
> performance-critical, but at the same time almost certainly won't be run 
> in a tight enough loop that you'd ever notice things like that.
> 
> 		Linus

Thanks. I wasn't putting huge amount of stock in the micro benchmark,
was more interested in how the distribution worked out (which is CPU
independent) rather than the time. As long as all usage of name hashing
fold properly, there isn't a lot of reason to change.



-- 

^ permalink raw reply

* [net-next-2.6 PATCH 10/20] igb: move alloc_failed and csum_err stats into per rx-ring stat
From: Jeff Kirsher @ 2009-10-28  1:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

The allocation failed and checksum error stats are currently kept as a
global stat.  If we end up allocating the queues to multiple netdevs then
the global counter doesn't make much sense.  For this reason I felt it
necessary to move the alloc_rx_buff_failed stat into the rx_stats
portion of the rx_ring.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h         |    6 +++---
 drivers/net/igb/igb_ethtool.c |    9 ++++++---
 drivers/net/igb/igb_main.c    |   17 ++++++++---------
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index 00ff274..6a67fa2 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -145,12 +145,15 @@ struct igb_buffer {
 struct igb_tx_queue_stats {
 	u64 packets;
 	u64 bytes;
+	u64 restart_queue;
 };
 
 struct igb_rx_queue_stats {
 	u64 packets;
 	u64 bytes;
 	u64 drops;
+	u64 csum_err;
+	u64 alloc_failed;
 };
 
 struct igb_q_vector {
@@ -241,7 +244,6 @@ struct igb_adapter {
 
 	/* TX */
 	struct igb_ring *tx_ring;      /* One per active queue */
-	unsigned int restart_queue;
 	unsigned long tx_queue_len;
 	u32 txd_cmd;
 	u32 gotc;
@@ -255,8 +257,6 @@ struct igb_adapter {
 	int num_tx_queues;
 	int num_rx_queues;
 
-	u64 hw_csum_err;
-	u32 alloc_rx_buff_failed;
 	u32 gorc;
 	u64 gorc_old;
 	u32 max_frame_size;
diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index c48a555..f62430b 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -84,7 +84,6 @@ static const struct igb_stats igb_gstrings_stats[] = {
 	{ "tx_single_coll_ok", IGB_STAT(stats.scc) },
 	{ "tx_multi_coll_ok", IGB_STAT(stats.mcc) },
 	{ "tx_timeout_count", IGB_STAT(tx_timeout_count) },
-	{ "tx_restart_queue", IGB_STAT(restart_queue) },
 	{ "rx_long_length_errors", IGB_STAT(stats.roc) },
 	{ "rx_short_length_errors", IGB_STAT(stats.ruc) },
 	{ "rx_align_errors", IGB_STAT(stats.algnerrc) },
@@ -95,9 +94,7 @@ static const struct igb_stats igb_gstrings_stats[] = {
 	{ "tx_flow_control_xon", IGB_STAT(stats.xontxc) },
 	{ "tx_flow_control_xoff", IGB_STAT(stats.xofftxc) },
 	{ "rx_long_byte_count", IGB_STAT(stats.gorc) },
-	{ "rx_csum_offload_errors", IGB_STAT(hw_csum_err) },
 	{ "tx_dma_out_of_sync", IGB_STAT(stats.doosync) },
-	{ "alloc_rx_buff_failed", IGB_STAT(alloc_rx_buff_failed) },
 	{ "tx_smbus", IGB_STAT(stats.mgptc) },
 	{ "rx_smbus", IGB_STAT(stats.mgprc) },
 	{ "dropped_smbus", IGB_STAT(stats.mgpdc) },
@@ -2031,6 +2028,8 @@ static void igb_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 			p += ETH_GSTRING_LEN;
 			sprintf(p, "tx_queue_%u_bytes", i);
 			p += ETH_GSTRING_LEN;
+			sprintf(p, "tx_queue_%u_restart", i);
+			p += ETH_GSTRING_LEN;
 		}
 		for (i = 0; i < adapter->num_rx_queues; i++) {
 			sprintf(p, "rx_queue_%u_packets", i);
@@ -2039,6 +2038,10 @@ static void igb_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 			p += ETH_GSTRING_LEN;
 			sprintf(p, "rx_queue_%u_drops", i);
 			p += ETH_GSTRING_LEN;
+			sprintf(p, "rx_queue_%u_csum_err", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "rx_queue_%u_alloc_failed", i);
+			p += ETH_GSTRING_LEN;
 		}
 /*		BUG_ON(p - data != IGB_STATS_LEN * ETH_GSTRING_LEN); */
 		break;
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 04e860d..bdd7bf0 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -3562,8 +3562,6 @@ static inline void igb_tx_queue_adv(struct igb_adapter *adapter,
 static int __igb_maybe_stop_tx(struct net_device *netdev,
 			       struct igb_ring *tx_ring, int size)
 {
-	struct igb_adapter *adapter = netdev_priv(netdev);
-
 	netif_stop_subqueue(netdev, tx_ring->queue_index);
 
 	/* Herbert's original patch had:
@@ -3578,7 +3576,7 @@ static int __igb_maybe_stop_tx(struct net_device *netdev,
 
 	/* A reprieve! */
 	netif_wake_subqueue(netdev, tx_ring->queue_index);
-	++adapter->restart_queue;
+	tx_ring->tx_stats.restart_queue++;
 	return 0;
 }
 
@@ -4734,7 +4732,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 		if (__netif_subqueue_stopped(netdev, tx_ring->queue_index) &&
 		    !(test_bit(__IGB_DOWN, &adapter->state))) {
 			netif_wake_subqueue(netdev, tx_ring->queue_index);
-			++adapter->restart_queue;
+			tx_ring->tx_stats.restart_queue++;
 		}
 	}
 
@@ -4801,7 +4799,8 @@ static void igb_receive_skb(struct igb_q_vector *q_vector,
 		napi_gro_receive(&q_vector->napi, skb);
 }
 
-static inline void igb_rx_checksum_adv(struct igb_adapter *adapter,
+static inline void igb_rx_checksum_adv(struct igb_ring *ring,
+                                       struct igb_adapter *adapter,
 				       u32 status_err, struct sk_buff *skb)
 {
 	skb->ip_summed = CHECKSUM_NONE;
@@ -4820,7 +4819,7 @@ static inline void igb_rx_checksum_adv(struct igb_adapter *adapter,
 		 */
 		if (!((adapter->hw.mac.type == e1000_82576) &&
 		      (skb->len == 60)))
-			adapter->hw_csum_err++;
+			ring->rx_stats.csum_err++;
 		/* let the stack verify checksum errors */
 		return;
 	}
@@ -4979,7 +4978,7 @@ send_up:
 		total_bytes += skb->len;
 		total_packets++;
 
-		igb_rx_checksum_adv(adapter, staterr, skb);
+		igb_rx_checksum_adv(rx_ring, adapter, staterr, skb);
 
 		skb->protocol = eth_type_trans(skb, netdev);
 		skb_record_rx_queue(skb, rx_ring->queue_index);
@@ -5046,7 +5045,7 @@ static void igb_alloc_rx_buffers_adv(struct igb_ring *rx_ring,
 			if (!buffer_info->page) {
 				buffer_info->page = alloc_page(GFP_ATOMIC);
 				if (!buffer_info->page) {
-					adapter->alloc_rx_buff_failed++;
+					rx_ring->rx_stats.alloc_failed++;
 					goto no_buffers;
 				}
 				buffer_info->page_offset = 0;
@@ -5063,7 +5062,7 @@ static void igb_alloc_rx_buffers_adv(struct igb_ring *rx_ring,
 		if (!buffer_info->skb) {
 			skb = netdev_alloc_skb_ip_align(netdev, bufsz);
 			if (!skb) {
-				adapter->alloc_rx_buff_failed++;
+				rx_ring->rx_stats.alloc_failed++;
 				goto no_buffers;
 			}
 


^ permalink raw reply related

* [net-next-2.6 PATCH 09/20] igb: move rx_buffer_len into the ring structure
From: Jeff Kirsher @ 2009-10-28  1:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch moves the rx_buffer_len value into the ring structure.  This allows
greater flexibility and the option of doing things such as supporting packet
split only on some queues, or enabling virtualization.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h      |    3 +--
 drivers/net/igb/igb_main.c |   41 ++++++++++++++++++++++-------------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index de26862..00ff274 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -198,7 +198,7 @@ struct igb_ring {
 		/* RX */
 		struct {
 			struct igb_rx_queue_stats rx_stats;
-			u64 rx_queue_drops;
+			u32 rx_buffer_len;
 		};
 	};
 };
@@ -218,7 +218,6 @@ struct igb_adapter {
 	struct vlan_group *vlgrp;
 	u16 mng_vlan_id;
 	u32 bd_number;
-	u32 rx_buffer_len;
 	u32 wol;
 	u32 en_mng_pt;
 	u16 link_speed;
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index ff16b7a..04e860d 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -443,6 +443,7 @@ static int igb_alloc_queues(struct igb_adapter *adapter)
 		ring->count = adapter->rx_ring_count;
 		ring->queue_index = i;
 		ring->pdev = adapter->pdev;
+		ring->rx_buffer_len = MAXIMUM_ETHERNET_VLAN_SIZE;
 	}
 
 	igb_cache_ring_register(adapter);
@@ -1863,7 +1864,6 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)
 
 	adapter->tx_ring_count = IGB_DEFAULT_TXD;
 	adapter->rx_ring_count = IGB_DEFAULT_RXD;
-	adapter->rx_buffer_len = MAXIMUM_ETHERNET_VLAN_SIZE;
 	adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN;
 	adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN;
 
@@ -2358,8 +2358,8 @@ static void igb_configure_rx_ring(struct igb_adapter *adapter,
 	writel(0, ring->tail);
 
 	/* set descriptor configuration */
-	if (adapter->rx_buffer_len < IGB_RXBUFFER_1024) {
-		srrctl = ALIGN(adapter->rx_buffer_len, 64) <<
+	if (ring->rx_buffer_len < IGB_RXBUFFER_1024) {
+		srrctl = ALIGN(ring->rx_buffer_len, 64) <<
 		         E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
 #if (PAGE_SIZE / 2) > IGB_RXBUFFER_16384
 		srrctl |= IGB_RXBUFFER_16384 >>
@@ -2370,7 +2370,7 @@ static void igb_configure_rx_ring(struct igb_adapter *adapter,
 #endif
 		srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
 	} else {
-		srrctl = ALIGN(adapter->rx_buffer_len, 1024) >>
+		srrctl = ALIGN(ring->rx_buffer_len, 1024) >>
 		         E1000_SRRCTL_BSIZEPKT_SHIFT;
 		srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
 	}
@@ -2619,7 +2619,6 @@ static void igb_free_all_rx_resources(struct igb_adapter *adapter)
  **/
 static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 {
-	struct igb_adapter *adapter = rx_ring->q_vector->adapter;
 	struct igb_buffer *buffer_info;
 	unsigned long size;
 	unsigned int i;
@@ -2632,7 +2631,7 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 		if (buffer_info->dma) {
 			pci_unmap_single(rx_ring->pdev,
 			                 buffer_info->dma,
-					 adapter->rx_buffer_len,
+					 rx_ring->rx_buffer_len,
 					 PCI_DMA_FROMDEVICE);
 			buffer_info->dma = 0;
 		}
@@ -3746,6 +3745,7 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN;
+	u32 rx_buffer_len, i;
 
 	if ((max_frame < ETH_ZLEN + ETH_FCS_LEN) ||
 	    (max_frame > MAX_JUMBO_FRAME_SIZE)) {
@@ -3763,9 +3763,6 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)
 
 	/* igb_down has a dependency on max_frame_size */
 	adapter->max_frame_size = max_frame;
-	if (netif_running(netdev))
-		igb_down(adapter);
-
 	/* NOTE: netdev_alloc_skb reserves 16 bytes, and typically NET_IP_ALIGN
 	 * means we reserve 2 more, this pushes us to allocate from the next
 	 * larger slab size.
@@ -3773,16 +3770,22 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)
 	 */
 
 	if (max_frame <= IGB_RXBUFFER_1024)
-		adapter->rx_buffer_len = IGB_RXBUFFER_1024;
+		rx_buffer_len = IGB_RXBUFFER_1024;
 	else if (max_frame <= MAXIMUM_ETHERNET_VLAN_SIZE)
-		adapter->rx_buffer_len = MAXIMUM_ETHERNET_VLAN_SIZE;
+		rx_buffer_len = MAXIMUM_ETHERNET_VLAN_SIZE;
 	else
-		adapter->rx_buffer_len = IGB_RXBUFFER_128;
+		rx_buffer_len = IGB_RXBUFFER_128;
+
+	if (netif_running(netdev))
+		igb_down(adapter);
 
 	dev_info(&adapter->pdev->dev, "changing MTU from %d to %d\n",
 		 netdev->mtu, new_mtu);
 	netdev->mtu = new_mtu;
 
+	for (i = 0; i < adapter->num_rx_queues; i++)
+		adapter->rx_ring[i].rx_buffer_len = rx_buffer_len;
+
 	if (netif_running(netdev))
 		igb_up(adapter);
 	else
@@ -4828,7 +4831,7 @@ static inline void igb_rx_checksum_adv(struct igb_adapter *adapter,
 	dev_dbg(&adapter->pdev->dev, "cksum success: bits %08X\n", status_err);
 }
 
-static inline u16 igb_get_hlen(struct igb_adapter *adapter,
+static inline u16 igb_get_hlen(struct igb_ring *rx_ring,
                                union e1000_adv_rx_desc *rx_desc)
 {
 	/* HW will not DMA in data larger than the given buffer, even if it
@@ -4837,8 +4840,8 @@ static inline u16 igb_get_hlen(struct igb_adapter *adapter,
 	 */
 	u16 hlen = (le16_to_cpu(rx_desc->wb.lower.lo_dword.hdr_info) &
 	           E1000_RXDADV_HDRBUFLEN_MASK) >> E1000_RXDADV_HDRBUFLEN_SHIFT;
-	if (hlen > adapter->rx_buffer_len)
-		hlen = adapter->rx_buffer_len;
+	if (hlen > rx_ring->rx_buffer_len)
+		hlen = rx_ring->rx_buffer_len;
 	return hlen;
 }
 
@@ -4888,14 +4891,14 @@ static bool igb_clean_rx_irq_adv(struct igb_q_vector *q_vector,
 
 		if (buffer_info->dma) {
 			pci_unmap_single(pdev, buffer_info->dma,
-					 adapter->rx_buffer_len,
+					 rx_ring->rx_buffer_len,
 					 PCI_DMA_FROMDEVICE);
 			buffer_info->dma = 0;
-			if (adapter->rx_buffer_len >= IGB_RXBUFFER_1024) {
+			if (rx_ring->rx_buffer_len >= IGB_RXBUFFER_1024) {
 				skb_put(skb, length);
 				goto send_up;
 			}
-			skb_put(skb, igb_get_hlen(adapter, rx_desc));
+			skb_put(skb, igb_get_hlen(rx_ring, rx_desc));
 		}
 
 		if (length) {
@@ -5034,7 +5037,7 @@ static void igb_alloc_rx_buffers_adv(struct igb_ring *rx_ring,
 	i = rx_ring->next_to_use;
 	buffer_info = &rx_ring->buffer_info[i];
 
-	bufsz = adapter->rx_buffer_len;
+	bufsz = rx_ring->rx_buffer_len;
 
 	while (cleaned_count--) {
 		rx_desc = E1000_RX_DESC_ADV(*rx_ring, i);


^ permalink raw reply related

* [net-next-2.6 PATCH 08/20] igb: add pci device pointer to ring structure
From: Jeff Kirsher @ 2009-10-28  1:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds a pci device pointer to the ring structure.  The main use of
this pointer is for memory mapping/unmapping of the rings.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h         |    5 ++-
 drivers/net/igb/igb_ethtool.c |    4 +-
 drivers/net/igb/igb_main.c    |   72 ++++++++++++++++++++---------------------
 3 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index e52fee4..de26862 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -173,6 +173,7 @@ struct igb_q_vector {
 struct igb_ring {
 	struct igb_q_vector *q_vector; /* backlink to q_vector */
 	void *desc;                    /* descriptor ring memory */
+	struct pci_dev *pdev;          /* pci device for dma mapping */
 	dma_addr_t dma;                /* phys address of the ring */
 	unsigned int size;             /* length of desc. ring in bytes */
 	unsigned int count;            /* number of desc. in the ring */
@@ -325,8 +326,8 @@ extern void igb_down(struct igb_adapter *);
 extern void igb_reinit_locked(struct igb_adapter *);
 extern void igb_reset(struct igb_adapter *);
 extern int igb_set_spd_dplx(struct igb_adapter *, u16);
-extern int igb_setup_tx_resources(struct igb_adapter *, struct igb_ring *);
-extern int igb_setup_rx_resources(struct igb_adapter *, struct igb_ring *);
+extern int igb_setup_tx_resources(struct igb_ring *);
+extern int igb_setup_rx_resources(struct igb_ring *);
 extern void igb_free_tx_resources(struct igb_ring *);
 extern void igb_free_rx_resources(struct igb_ring *);
 extern void igb_update_stats(struct igb_adapter *);
diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index 2929546..c48a555 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -794,7 +794,7 @@ static int igb_set_ringparam(struct net_device *netdev,
 
 		for (i = 0; i < adapter->num_tx_queues; i++) {
 			temp_ring[i].count = new_tx_count;
-			err = igb_setup_tx_resources(adapter, &temp_ring[i]);
+			err = igb_setup_tx_resources(&temp_ring[i]);
 			if (err) {
 				while (i) {
 					i--;
@@ -819,7 +819,7 @@ static int igb_set_ringparam(struct net_device *netdev,
 
 		for (i = 0; i < adapter->num_rx_queues; i++) {
 			temp_ring[i].count = new_rx_count;
-			err = igb_setup_rx_resources(adapter, &temp_ring[i]);
+			err = igb_setup_rx_resources(&temp_ring[i]);
 			if (err) {
 				while (i) {
 					i--;
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 2728f93..ff16b7a 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -436,11 +436,13 @@ static int igb_alloc_queues(struct igb_adapter *adapter)
 		struct igb_ring *ring = &(adapter->tx_ring[i]);
 		ring->count = adapter->tx_ring_count;
 		ring->queue_index = i;
+		ring->pdev = adapter->pdev;
 	}
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		struct igb_ring *ring = &(adapter->rx_ring[i]);
 		ring->count = adapter->rx_ring_count;
 		ring->queue_index = i;
+		ring->pdev = adapter->pdev;
 	}
 
 	igb_cache_ring_register(adapter);
@@ -2002,15 +2004,13 @@ static int igb_close(struct net_device *netdev)
 
 /**
  * igb_setup_tx_resources - allocate Tx resources (Descriptors)
- * @adapter: board private structure
  * @tx_ring: tx descriptor ring (for a specific queue) to setup
  *
  * Return 0 on success, negative on failure
  **/
-int igb_setup_tx_resources(struct igb_adapter *adapter,
-			   struct igb_ring *tx_ring)
+int igb_setup_tx_resources(struct igb_ring *tx_ring)
 {
-	struct pci_dev *pdev = adapter->pdev;
+	struct pci_dev *pdev = tx_ring->pdev;
 	int size;
 
 	size = sizeof(struct igb_buffer) * tx_ring->count;
@@ -2053,7 +2053,7 @@ static int igb_setup_all_tx_resources(struct igb_adapter *adapter)
 	int r_idx;
 
 	for (i = 0; i < adapter->num_tx_queues; i++) {
-		err = igb_setup_tx_resources(adapter, &adapter->tx_ring[i]);
+		err = igb_setup_tx_resources(&adapter->tx_ring[i]);
 		if (err) {
 			dev_err(&adapter->pdev->dev,
 				"Allocation for Tx Queue %u failed\n", i);
@@ -2156,15 +2156,13 @@ static void igb_configure_tx(struct igb_adapter *adapter)
 
 /**
  * igb_setup_rx_resources - allocate Rx resources (Descriptors)
- * @adapter: board private structure
  * @rx_ring:    rx descriptor ring (for a specific queue) to setup
  *
  * Returns 0 on success, negative on failure
  **/
-int igb_setup_rx_resources(struct igb_adapter *adapter,
-			   struct igb_ring *rx_ring)
+int igb_setup_rx_resources(struct igb_ring *rx_ring)
 {
-	struct pci_dev *pdev = adapter->pdev;
+	struct pci_dev *pdev = rx_ring->pdev;
 	int size, desc_len;
 
 	size = sizeof(struct igb_buffer) * rx_ring->count;
@@ -2192,7 +2190,7 @@ int igb_setup_rx_resources(struct igb_adapter *adapter,
 
 err:
 	vfree(rx_ring->buffer_info);
-	dev_err(&adapter->pdev->dev, "Unable to allocate memory for "
+	dev_err(&pdev->dev, "Unable to allocate memory for "
 		"the receive descriptor ring\n");
 	return -ENOMEM;
 }
@@ -2209,7 +2207,7 @@ static int igb_setup_all_rx_resources(struct igb_adapter *adapter)
 	int i, err = 0;
 
 	for (i = 0; i < adapter->num_rx_queues; i++) {
-		err = igb_setup_rx_resources(adapter, &adapter->rx_ring[i]);
+		err = igb_setup_rx_resources(&adapter->rx_ring[i]);
 		if (err) {
 			dev_err(&adapter->pdev->dev,
 				"Allocation for Rx Queue %u failed\n", i);
@@ -2497,14 +2495,13 @@ static void igb_configure_rx(struct igb_adapter *adapter)
  **/
 void igb_free_tx_resources(struct igb_ring *tx_ring)
 {
-	struct pci_dev *pdev = tx_ring->q_vector->adapter->pdev;
-
 	igb_clean_tx_ring(tx_ring);
 
 	vfree(tx_ring->buffer_info);
 	tx_ring->buffer_info = NULL;
 
-	pci_free_consistent(pdev, tx_ring->size, tx_ring->desc, tx_ring->dma);
+	pci_free_consistent(tx_ring->pdev, tx_ring->size,
+	                    tx_ring->desc, tx_ring->dma);
 
 	tx_ring->desc = NULL;
 }
@@ -2523,12 +2520,13 @@ static void igb_free_all_tx_resources(struct igb_adapter *adapter)
 		igb_free_tx_resources(&adapter->tx_ring[i]);
 }
 
-static void igb_unmap_and_free_tx_resource(struct igb_adapter *adapter,
+static void igb_unmap_and_free_tx_resource(struct igb_ring *tx_ring,
 					   struct igb_buffer *buffer_info)
 {
 	buffer_info->dma = 0;
 	if (buffer_info->skb) {
-		skb_dma_unmap(&adapter->pdev->dev, buffer_info->skb,
+		skb_dma_unmap(&tx_ring->pdev->dev,
+		              buffer_info->skb,
 		              DMA_TO_DEVICE);
 		dev_kfree_skb_any(buffer_info->skb);
 		buffer_info->skb = NULL;
@@ -2543,7 +2541,6 @@ static void igb_unmap_and_free_tx_resource(struct igb_adapter *adapter,
  **/
 static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 {
-	struct igb_adapter *adapter = tx_ring->q_vector->adapter;
 	struct igb_buffer *buffer_info;
 	unsigned long size;
 	unsigned int i;
@@ -2554,7 +2551,7 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 
 	for (i = 0; i < tx_ring->count; i++) {
 		buffer_info = &tx_ring->buffer_info[i];
-		igb_unmap_and_free_tx_resource(adapter, buffer_info);
+		igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
 	}
 
 	size = sizeof(struct igb_buffer) * tx_ring->count;
@@ -2591,14 +2588,13 @@ static void igb_clean_all_tx_rings(struct igb_adapter *adapter)
  **/
 void igb_free_rx_resources(struct igb_ring *rx_ring)
 {
-	struct pci_dev *pdev = rx_ring->q_vector->adapter->pdev;
-
 	igb_clean_rx_ring(rx_ring);
 
 	vfree(rx_ring->buffer_info);
 	rx_ring->buffer_info = NULL;
 
-	pci_free_consistent(pdev, rx_ring->size, rx_ring->desc, rx_ring->dma);
+	pci_free_consistent(rx_ring->pdev, rx_ring->size,
+	                    rx_ring->desc, rx_ring->dma);
 
 	rx_ring->desc = NULL;
 }
@@ -2625,7 +2621,6 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 {
 	struct igb_adapter *adapter = rx_ring->q_vector->adapter;
 	struct igb_buffer *buffer_info;
-	struct pci_dev *pdev = adapter->pdev;
 	unsigned long size;
 	unsigned int i;
 
@@ -2635,7 +2630,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 	for (i = 0; i < rx_ring->count; i++) {
 		buffer_info = &rx_ring->buffer_info[i];
 		if (buffer_info->dma) {
-			pci_unmap_single(pdev, buffer_info->dma,
+			pci_unmap_single(rx_ring->pdev,
+			                 buffer_info->dma,
 					 adapter->rx_buffer_len,
 					 PCI_DMA_FROMDEVICE);
 			buffer_info->dma = 0;
@@ -2646,7 +2642,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 			buffer_info->skb = NULL;
 		}
 		if (buffer_info->page_dma) {
-			pci_unmap_page(pdev, buffer_info->page_dma,
+			pci_unmap_page(rx_ring->pdev,
+			               buffer_info->page_dma,
 				       PAGE_SIZE / 2,
 				       PCI_DMA_FROMDEVICE);
 			buffer_info->page_dma = 0;
@@ -3362,9 +3359,10 @@ static inline bool igb_tx_csum_adv(struct igb_adapter *adapter,
 					struct sk_buff *skb, u32 tx_flags)
 {
 	struct e1000_adv_tx_context_desc *context_desc;
-	unsigned int i;
+	struct pci_dev *pdev = tx_ring->pdev;
 	struct igb_buffer *buffer_info;
 	u32 info = 0, tu_cmd = 0;
+	unsigned int i;
 
 	if ((skb->ip_summed == CHECKSUM_PARTIAL) ||
 	    (tx_flags & IGB_TX_FLAGS_VLAN)) {
@@ -3411,7 +3409,7 @@ static inline bool igb_tx_csum_adv(struct igb_adapter *adapter,
 				break;
 			default:
 				if (unlikely(net_ratelimit()))
-					dev_warn(&adapter->pdev->dev,
+					dev_warn(&pdev->dev,
 					    "partial checksum but proto=%x!\n",
 					    skb->protocol);
 				break;
@@ -3443,11 +3441,11 @@ static inline bool igb_tx_csum_adv(struct igb_adapter *adapter,
 #define IGB_MAX_TXD_PWR	16
 #define IGB_MAX_DATA_PER_TXD	(1<<IGB_MAX_TXD_PWR)
 
-static inline int igb_tx_map_adv(struct igb_adapter *adapter,
-				 struct igb_ring *tx_ring, struct sk_buff *skb,
+static inline int igb_tx_map_adv(struct igb_ring *tx_ring, struct sk_buff *skb,
 				 unsigned int first)
 {
 	struct igb_buffer *buffer_info;
+	struct pci_dev *pdev = tx_ring->pdev;
 	unsigned int len = skb_headlen(skb);
 	unsigned int count = 0, i;
 	unsigned int f;
@@ -3455,8 +3453,8 @@ static inline int igb_tx_map_adv(struct igb_adapter *adapter,
 
 	i = tx_ring->next_to_use;
 
-	if (skb_dma_map(&adapter->pdev->dev, skb, DMA_TO_DEVICE)) {
-		dev_err(&adapter->pdev->dev, "TX DMA map failed\n");
+	if (skb_dma_map(&pdev->dev, skb, DMA_TO_DEVICE)) {
+		dev_err(&pdev->dev, "TX DMA map failed\n");
 		return 0;
 	}
 
@@ -3667,7 +3665,7 @@ static netdev_tx_t igb_xmit_frame_ring_adv(struct sk_buff *skb,
 	 * count reflects descriptors mapped, if 0 then mapping error
 	 * has occured and we need to rewind the descriptor queue
 	 */
-	count = igb_tx_map_adv(adapter, tx_ring, skb, first);
+	count = igb_tx_map_adv(tx_ring, skb, first);
 
 	if (count) {
 		igb_tx_queue_adv(adapter, tx_ring, tx_flags, count,
@@ -4710,7 +4708,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 				igb_tx_hwtstamp(adapter, skb);
 			}
 
-			igb_unmap_and_free_tx_resource(adapter, buffer_info);
+			igb_unmap_and_free_tx_resource(tx_ring, buffer_info);
 			tx_desc->wb.status = 0;
 
 			i++;
@@ -4748,7 +4746,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 			 E1000_STATUS_TXOFF)) {
 
 			/* detected Tx unit hang */
-			dev_err(&adapter->pdev->dev,
+			dev_err(&tx_ring->pdev->dev,
 				"Detected Tx Unit Hang\n"
 				"  Tx Queue             <%d>\n"
 				"  TDH                  <%x>\n"
@@ -4851,7 +4849,7 @@ static bool igb_clean_rx_irq_adv(struct igb_q_vector *q_vector,
 	struct net_device *netdev = adapter->netdev;
 	struct igb_ring *rx_ring = q_vector->rx_ring;
 	struct e1000_hw *hw = &adapter->hw;
-	struct pci_dev *pdev = adapter->pdev;
+	struct pci_dev *pdev = rx_ring->pdev;
 	union e1000_adv_rx_desc *rx_desc , *next_rxd;
 	struct igb_buffer *buffer_info , *next_buffer;
 	struct sk_buff *skb;
@@ -5027,7 +5025,6 @@ static void igb_alloc_rx_buffers_adv(struct igb_ring *rx_ring,
 {
 	struct igb_adapter *adapter = rx_ring->q_vector->adapter;
 	struct net_device *netdev = adapter->netdev;
-	struct pci_dev *pdev = adapter->pdev;
 	union e1000_adv_rx_desc *rx_desc;
 	struct igb_buffer *buffer_info;
 	struct sk_buff *skb;
@@ -5054,7 +5051,7 @@ static void igb_alloc_rx_buffers_adv(struct igb_ring *rx_ring,
 				buffer_info->page_offset ^= PAGE_SIZE / 2;
 			}
 			buffer_info->page_dma =
-				pci_map_page(pdev, buffer_info->page,
+				pci_map_page(rx_ring->pdev, buffer_info->page,
 					     buffer_info->page_offset,
 					     PAGE_SIZE / 2,
 					     PCI_DMA_FROMDEVICE);
@@ -5068,7 +5065,8 @@ static void igb_alloc_rx_buffers_adv(struct igb_ring *rx_ring,
 			}
 
 			buffer_info->skb = skb;
-			buffer_info->dma = pci_map_single(pdev, skb->data,
+			buffer_info->dma = pci_map_single(rx_ring->pdev,
+			                                  skb->data,
 							  bufsz,
 							  PCI_DMA_FROMDEVICE);
 		}


^ permalink raw reply related

* [net-next-2.6 PATCH 07/20] igb: change the head and tail offsets into pointers
From: Jeff Kirsher @ 2009-10-28  1:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

Since we are writting to the head/tail pointers frequently we might as well
save ourselves some processing time by converting the head and tail offsets
directly to pointers.  This will shave a few cycles off the rx/tx path and
allows us to move one step closer to the rings being a bit more independant of
each other.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h      |    4 ++--
 drivers/net/igb/igb_main.c |   32 ++++++++++++++++----------------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index 303df02..e52fee4 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -178,8 +178,8 @@ struct igb_ring {
 	unsigned int count;            /* number of desc. in the ring */
 	u16 next_to_use;
 	u16 next_to_clean;
-	u16 head;
-	u16 tail;
+	void __iomem *head;
+	void __iomem *tail;
 	struct igb_buffer *buffer_info; /* array of buffer info structs */
 
 	u8 queue_index;
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index dfca821..2728f93 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -2124,10 +2124,10 @@ static void igb_configure_tx_ring(struct igb_adapter *adapter,
 	                tdba & 0x00000000ffffffffULL);
 	wr32(E1000_TDBAH(reg_idx), tdba >> 32);
 
-	ring->head = E1000_TDH(reg_idx);
-	ring->tail = E1000_TDT(reg_idx);
-	writel(0, hw->hw_addr + ring->tail);
-	writel(0, hw->hw_addr + ring->head);
+	ring->head = hw->hw_addr + E1000_TDH(reg_idx);
+	ring->tail = hw->hw_addr + E1000_TDT(reg_idx);
+	writel(0, ring->head);
+	writel(0, ring->tail);
 
 	txdctl |= IGB_TX_PTHRESH;
 	txdctl |= IGB_TX_HTHRESH << 8;
@@ -2354,10 +2354,10 @@ static void igb_configure_rx_ring(struct igb_adapter *adapter,
 	               ring->count * sizeof(union e1000_adv_rx_desc));
 
 	/* initialize head and tail */
-	ring->head = E1000_RDH(reg_idx);
-	ring->tail = E1000_RDT(reg_idx);
-	writel(0, hw->hw_addr + ring->head);
-	writel(0, hw->hw_addr + ring->tail);
+	ring->head = hw->hw_addr + E1000_RDH(reg_idx);
+	ring->tail = hw->hw_addr + E1000_RDT(reg_idx);
+	writel(0, ring->head);
+	writel(0, ring->tail);
 
 	/* set descriptor configuration */
 	if (adapter->rx_buffer_len < IGB_RXBUFFER_1024) {
@@ -2567,8 +2567,8 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 	tx_ring->next_to_use = 0;
 	tx_ring->next_to_clean = 0;
 
-	writel(0, adapter->hw.hw_addr + tx_ring->head);
-	writel(0, adapter->hw.hw_addr + tx_ring->tail);
+	writel(0, tx_ring->head);
+	writel(0, tx_ring->tail);
 }
 
 /**
@@ -2667,8 +2667,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 
-	writel(0, adapter->hw.hw_addr + rx_ring->head);
-	writel(0, adapter->hw.hw_addr + rx_ring->tail);
+	writel(0, rx_ring->head);
+	writel(0, rx_ring->tail);
 }
 
 /**
@@ -3556,7 +3556,7 @@ static inline void igb_tx_queue_adv(struct igb_adapter *adapter,
 	wmb();
 
 	tx_ring->next_to_use = i;
-	writel(i, adapter->hw.hw_addr + tx_ring->tail);
+	writel(i, tx_ring->tail);
 	/* we need this if more than one processor can write to our tail
 	 * at a time, it syncronizes IO on IA64/Altix systems */
 	mmiowb();
@@ -4761,8 +4761,8 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 				"  jiffies              <%lx>\n"
 				"  desc.status          <%x>\n",
 				tx_ring->queue_index,
-				readl(adapter->hw.hw_addr + tx_ring->head),
-				readl(adapter->hw.hw_addr + tx_ring->tail),
+				readl(tx_ring->head),
+				readl(tx_ring->tail),
 				tx_ring->next_to_use,
 				tx_ring->next_to_clean,
 				tx_ring->buffer_info[i].time_stamp,
@@ -5103,7 +5103,7 @@ no_buffers:
 		 * applicable for weak-ordered memory model archs,
 		 * such as IA-64). */
 		wmb();
-		writel(i, adapter->hw.hw_addr + rx_ring->tail);
+		writel(i, rx_ring->tail);
 	}
 }
 


^ permalink raw reply related

* [net-next-2.6 PATCH 06/20] igb: move SRRCTL register configuration into ring specific config
From: Jeff Kirsher @ 2009-10-28  1:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

The SRRCTL register exists per ring.  Instead of configuring all of them in
the RCTL configuration which is meant to be global it makes more sense to move
this out into the ring specific configuration.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb_main.c |   60 +++++++++++++++++---------------------------
 1 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 24e502d..dfca821 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -2230,8 +2230,6 @@ static void igb_setup_rctl(struct igb_adapter *adapter)
 {
 	struct e1000_hw *hw = &adapter->hw;
 	u32 rctl;
-	u32 srrctl = 0;
-	int i;
 
 	rctl = rd32(E1000_RCTL);
 
@@ -2256,31 +2254,8 @@ static void igb_setup_rctl(struct igb_adapter *adapter)
 	/* enable LPE to prevent packets larger than max_frame_size */
 	rctl |= E1000_RCTL_LPE;
 
-	/* 82575 and greater support packet-split where the protocol
-	 * header is placed in skb->data and the packet data is
-	 * placed in pages hanging off of skb_shinfo(skb)->nr_frags.
-	 * In the case of a non-split, skb->data is linearly filled,
-	 * followed by the page buffers.  Therefore, skb->data is
-	 * sized to hold the largest protocol header.
-	 */
-	/* allocations using alloc_page take too long for regular MTU
-	 * so only enable packet split for jumbo frames */
-	if (adapter->rx_buffer_len < IGB_RXBUFFER_1024) {
-		srrctl = ALIGN(adapter->rx_buffer_len, 64) <<
-		         E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
-#if (PAGE_SIZE / 2) > IGB_RXBUFFER_16384
-		srrctl |= IGB_RXBUFFER_16384 >>
-		          E1000_SRRCTL_BSIZEPKT_SHIFT;
-#else
-		srrctl |= (PAGE_SIZE / 2) >>
-		          E1000_SRRCTL_BSIZEPKT_SHIFT;
-#endif
-		srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
-	} else {
-		srrctl = ALIGN(adapter->rx_buffer_len, 1024) >>
-		         E1000_SRRCTL_BSIZEPKT_SHIFT;
-		srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
-	}
+	/* disable queue 0 to prevent tail write w/o re-config */
+	wr32(E1000_RXDCTL(0), 0);
 
 	/* Attention!!!  For SR-IOV PF driver operations you must enable
 	 * queue drop for all VF and PF queues to prevent head of line blocking
@@ -2291,10 +2266,6 @@ static void igb_setup_rctl(struct igb_adapter *adapter)
 
 		/* set all queue drop enable bits */
 		wr32(E1000_QDE, ALL_QUEUES);
-		srrctl |= E1000_SRRCTL_DROP_EN;
-
-		/* disable queue 0 to prevent tail write w/o re-config */
-		wr32(E1000_RXDCTL(0), 0);
 
 		vmolr = rd32(E1000_VMOLR(adapter->vfs_allocated_count));
 		if (rctl & E1000_RCTL_LPE)
@@ -2304,11 +2275,6 @@ static void igb_setup_rctl(struct igb_adapter *adapter)
 		wr32(E1000_VMOLR(adapter->vfs_allocated_count), vmolr);
 	}
 
-	for (i = 0; i < adapter->num_rx_queues; i++) {
-		int j = adapter->rx_ring[i].reg_idx;
-		wr32(E1000_SRRCTL(j), srrctl);
-	}
-
 	wr32(E1000_RCTL, rctl);
 }
 
@@ -2373,7 +2339,7 @@ static void igb_configure_rx_ring(struct igb_adapter *adapter,
 	struct e1000_hw *hw = &adapter->hw;
 	u64 rdba = ring->dma;
 	int reg_idx = ring->reg_idx;
-	u32 rxdctl;
+	u32 srrctl, rxdctl;
 
 	/* disable the queue */
 	rxdctl = rd32(E1000_RXDCTL(reg_idx));
@@ -2393,6 +2359,26 @@ static void igb_configure_rx_ring(struct igb_adapter *adapter,
 	writel(0, hw->hw_addr + ring->head);
 	writel(0, hw->hw_addr + ring->tail);
 
+	/* set descriptor configuration */
+	if (adapter->rx_buffer_len < IGB_RXBUFFER_1024) {
+		srrctl = ALIGN(adapter->rx_buffer_len, 64) <<
+		         E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
+#if (PAGE_SIZE / 2) > IGB_RXBUFFER_16384
+		srrctl |= IGB_RXBUFFER_16384 >>
+		          E1000_SRRCTL_BSIZEPKT_SHIFT;
+#else
+		srrctl |= (PAGE_SIZE / 2) >>
+		          E1000_SRRCTL_BSIZEPKT_SHIFT;
+#endif
+		srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
+	} else {
+		srrctl = ALIGN(adapter->rx_buffer_len, 1024) >>
+		         E1000_SRRCTL_BSIZEPKT_SHIFT;
+		srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
+	}
+
+	wr32(E1000_SRRCTL(reg_idx), srrctl);
+
 	/* enable receive descriptor fetching */
 	rxdctl = rd32(E1000_RXDCTL(reg_idx));
 	rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;


^ permalink raw reply related

* [net-next-2.6 PATCH 05/20] igb: remove rx_ps_hdr_len
From: Jeff Kirsher @ 2009-10-28  1:50 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch removes the rx_ps_hdr_len which isn't really needed since we can
now use rx_buffer_len less than 1K to indicate that we are in a packet split
mode.  We also don't need it since we always use a half page for the data
buffers when receiving so we always know the size to map/unmap.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h      |    1 
 drivers/net/igb/igb_main.c |   98 +++++++++++++++++---------------------------
 2 files changed, 38 insertions(+), 61 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index 1675f6a..303df02 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -259,7 +259,6 @@ struct igb_adapter {
 	u32 alloc_rx_buff_failed;
 	u32 gorc;
 	u64 gorc_old;
-	u16 rx_ps_hdr_size;
 	u32 max_frame_size;
 	u32 min_frame_size;
 
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 61ef4c2..24e502d 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -1862,7 +1862,6 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)
 	adapter->tx_ring_count = IGB_DEFAULT_TXD;
 	adapter->rx_ring_count = IGB_DEFAULT_RXD;
 	adapter->rx_buffer_len = MAXIMUM_ETHERNET_VLAN_SIZE;
-	adapter->rx_ps_hdr_size = 0; /* disable packet split */
 	adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN;
 	adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN;
 
@@ -2254,12 +2253,8 @@ static void igb_setup_rctl(struct igb_adapter *adapter)
 	 */
 	rctl &= ~(E1000_RCTL_SBP | E1000_RCTL_SZ_256);
 
-	/* enable LPE when to prevent packets larger than max_frame_size */
-		rctl |= E1000_RCTL_LPE;
-
-	/* Setup buffer sizes */
-	srrctl = ALIGN(adapter->rx_buffer_len, 1024)
-	         >> E1000_SRRCTL_BSIZEPKT_SHIFT;
+	/* enable LPE to prevent packets larger than max_frame_size */
+	rctl |= E1000_RCTL_LPE;
 
 	/* 82575 and greater support packet-split where the protocol
 	 * header is placed in skb->data and the packet data is
@@ -2270,13 +2265,20 @@ static void igb_setup_rctl(struct igb_adapter *adapter)
 	 */
 	/* allocations using alloc_page take too long for regular MTU
 	 * so only enable packet split for jumbo frames */
-	if (adapter->netdev->mtu > ETH_DATA_LEN) {
-		adapter->rx_ps_hdr_size = IGB_RXBUFFER_128;
-		srrctl |= adapter->rx_ps_hdr_size <<
-			 E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
+	if (adapter->rx_buffer_len < IGB_RXBUFFER_1024) {
+		srrctl = ALIGN(adapter->rx_buffer_len, 64) <<
+		         E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
+#if (PAGE_SIZE / 2) > IGB_RXBUFFER_16384
+		srrctl |= IGB_RXBUFFER_16384 >>
+		          E1000_SRRCTL_BSIZEPKT_SHIFT;
+#else
+		srrctl |= (PAGE_SIZE / 2) >>
+		          E1000_SRRCTL_BSIZEPKT_SHIFT;
+#endif
 		srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
 	} else {
-		adapter->rx_ps_hdr_size = 0;
+		srrctl = ALIGN(adapter->rx_buffer_len, 1024) >>
+		         E1000_SRRCTL_BSIZEPKT_SHIFT;
 		srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
 	}
 
@@ -2647,14 +2649,9 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 	for (i = 0; i < rx_ring->count; i++) {
 		buffer_info = &rx_ring->buffer_info[i];
 		if (buffer_info->dma) {
-			if (adapter->rx_ps_hdr_size)
-				pci_unmap_single(pdev, buffer_info->dma,
-						 adapter->rx_ps_hdr_size,
-						 PCI_DMA_FROMDEVICE);
-			else
-				pci_unmap_single(pdev, buffer_info->dma,
-						 adapter->rx_buffer_len,
-						 PCI_DMA_FROMDEVICE);
+			pci_unmap_single(pdev, buffer_info->dma,
+					 adapter->rx_buffer_len,
+					 PCI_DMA_FROMDEVICE);
 			buffer_info->dma = 0;
 		}
 
@@ -2662,14 +2659,15 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 			dev_kfree_skb(buffer_info->skb);
 			buffer_info->skb = NULL;
 		}
+		if (buffer_info->page_dma) {
+			pci_unmap_page(pdev, buffer_info->page_dma,
+				       PAGE_SIZE / 2,
+				       PCI_DMA_FROMDEVICE);
+			buffer_info->page_dma = 0;
+		}
 		if (buffer_info->page) {
-			if (buffer_info->page_dma)
-				pci_unmap_page(pdev, buffer_info->page_dma,
-					       PAGE_SIZE / 2,
-					       PCI_DMA_FROMDEVICE);
 			put_page(buffer_info->page);
 			buffer_info->page = NULL;
-			buffer_info->page_dma = 0;
 			buffer_info->page_offset = 0;
 		}
 	}
@@ -3792,19 +3790,10 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)
 
 	if (max_frame <= IGB_RXBUFFER_1024)
 		adapter->rx_buffer_len = IGB_RXBUFFER_1024;
-	else if (max_frame <= IGB_RXBUFFER_2048)
-		adapter->rx_buffer_len = IGB_RXBUFFER_2048;
-	else
-#if (PAGE_SIZE / 2) > IGB_RXBUFFER_16384
-		adapter->rx_buffer_len = IGB_RXBUFFER_16384;
-#else
-		adapter->rx_buffer_len = PAGE_SIZE / 2;
-#endif
-
-	/* adjust allocation if LPE protects us, and we aren't using SBP */
-	if ((max_frame == ETH_FRAME_LEN + ETH_FCS_LEN) ||
-	     (max_frame == MAXIMUM_ETHERNET_VLAN_SIZE))
+	else if (max_frame <= MAXIMUM_ETHERNET_VLAN_SIZE)
 		adapter->rx_buffer_len = MAXIMUM_ETHERNET_VLAN_SIZE;
+	else
+		adapter->rx_buffer_len = IGB_RXBUFFER_128;
 
 	dev_info(&adapter->pdev->dev, "changing MTU from %d to %d\n",
 		 netdev->mtu, new_mtu);
@@ -4864,8 +4853,8 @@ static inline u16 igb_get_hlen(struct igb_adapter *adapter,
 	 */
 	u16 hlen = (le16_to_cpu(rx_desc->wb.lower.lo_dword.hdr_info) &
 	           E1000_RXDADV_HDRBUFLEN_MASK) >> E1000_RXDADV_HDRBUFLEN_SHIFT;
-	if (hlen > adapter->rx_ps_hdr_size)
-		hlen = adapter->rx_ps_hdr_size;
+	if (hlen > adapter->rx_buffer_len)
+		hlen = adapter->rx_buffer_len;
 	return hlen;
 }
 
@@ -4913,23 +4902,16 @@ static bool igb_clean_rx_irq_adv(struct igb_q_vector *q_vector,
 		cleaned = true;
 		cleaned_count++;
 
-		/* this is the fast path for the non-packet split case */
-		if (!adapter->rx_ps_hdr_size) {
-			pci_unmap_single(pdev, buffer_info->dma,
-					 adapter->rx_buffer_len,
-					 PCI_DMA_FROMDEVICE);
-			buffer_info->dma = 0;
-			skb_put(skb, length);
-			goto send_up;
-		}
-
 		if (buffer_info->dma) {
-			u16 hlen = igb_get_hlen(adapter, rx_desc);
 			pci_unmap_single(pdev, buffer_info->dma,
-					 adapter->rx_ps_hdr_size,
+					 adapter->rx_buffer_len,
 					 PCI_DMA_FROMDEVICE);
 			buffer_info->dma = 0;
-			skb_put(skb, hlen);
+			if (adapter->rx_buffer_len >= IGB_RXBUFFER_1024) {
+				skb_put(skb, length);
+				goto send_up;
+			}
+			skb_put(skb, igb_get_hlen(adapter, rx_desc));
 		}
 
 		if (length) {
@@ -4942,8 +4924,7 @@ static bool igb_clean_rx_irq_adv(struct igb_q_vector *q_vector,
 						buffer_info->page_offset,
 						length);
 
-			if ((adapter->rx_buffer_len > (PAGE_SIZE / 2)) ||
-			    (page_count(buffer_info->page) != 1))
+			if (page_count(buffer_info->page) != 1)
 				buffer_info->page = NULL;
 			else
 				get_page(buffer_info->page);
@@ -5070,15 +5051,12 @@ static void igb_alloc_rx_buffers_adv(struct igb_ring *rx_ring,
 	i = rx_ring->next_to_use;
 	buffer_info = &rx_ring->buffer_info[i];
 
-	if (adapter->rx_ps_hdr_size)
-		bufsz = adapter->rx_ps_hdr_size;
-	else
-		bufsz = adapter->rx_buffer_len;
+	bufsz = adapter->rx_buffer_len;
 
 	while (cleaned_count--) {
 		rx_desc = E1000_RX_DESC_ADV(*rx_ring, i);
 
-		if (adapter->rx_ps_hdr_size && !buffer_info->page_dma) {
+		if ((bufsz < IGB_RXBUFFER_1024) && !buffer_info->page_dma) {
 			if (!buffer_info->page) {
 				buffer_info->page = alloc_page(GFP_ATOMIC);
 				if (!buffer_info->page) {
@@ -5110,7 +5088,7 @@ static void igb_alloc_rx_buffers_adv(struct igb_ring *rx_ring,
 		}
 		/* Refresh the desc even if buffer_addrs didn't change because
 		 * each write-back erases this info. */
-		if (adapter->rx_ps_hdr_size) {
+		if (bufsz < IGB_RXBUFFER_1024) {
 			rx_desc->read.pkt_addr =
 			     cpu_to_le64(buffer_info->page_dma);
 			rx_desc->read.hdr_addr = cpu_to_le64(buffer_info->dma);


^ permalink raw reply related

* [net-next-2.6 PATCH 04/20] igb: move the tx and rx ring specific config into seperate functions
From: Jeff Kirsher @ 2009-10-28  1:50 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change makes the tx and rx config a bit cleaner by breaking out the ring
specific configuration from the generic rx and tx configuration.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h      |    6 +
 drivers/net/igb/igb_main.c |  179 ++++++++++++++++++++++++++++----------------
 2 files changed, 117 insertions(+), 68 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index 044ba02..1675f6a 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -87,9 +87,13 @@ struct vf_data_storage {
  *           descriptors until either it has this many to write back, or the
  *           ITR timer expires.
  */
-#define IGB_RX_PTHRESH                    16
+#define IGB_RX_PTHRESH                    (hw->mac.type <= e1000_82576 ? 16 : 8)
 #define IGB_RX_HTHRESH                     8
 #define IGB_RX_WTHRESH                     1
+#define IGB_TX_PTHRESH                     8
+#define IGB_TX_HTHRESH                     1
+#define IGB_TX_WTHRESH                     ((hw->mac.type == e1000_82576 && \
+                                             adapter->msix_entries) ? 0 : 16)
 
 /* this is the size past which hardware will drop packets when setting LPE=0 */
 #define MAXIMUM_ETHERNET_VLAN_SIZE 1522
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 6146f5d..61ef4c2 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -90,6 +90,7 @@ static int igb_open(struct net_device *);
 static int igb_close(struct net_device *);
 static void igb_configure_tx(struct igb_adapter *);
 static void igb_configure_rx(struct igb_adapter *);
+static void igb_setup_tctl(struct igb_adapter *);
 static void igb_setup_rctl(struct igb_adapter *);
 static void igb_clean_all_tx_rings(struct igb_adapter *);
 static void igb_clean_all_rx_rings(struct igb_adapter *);
@@ -1101,8 +1102,10 @@ static void igb_configure(struct igb_adapter *adapter)
 
 	igb_restore_vlan(adapter);
 
-	igb_configure_tx(adapter);
+	igb_setup_tctl(adapter);
 	igb_setup_rctl(adapter);
+
+	igb_configure_tx(adapter);
 	igb_configure_rx(adapter);
 
 	igb_rx_fifo_flush_82575(&adapter->hw);
@@ -2069,49 +2072,16 @@ static int igb_setup_all_tx_resources(struct igb_adapter *adapter)
 }
 
 /**
- * igb_configure_tx - Configure transmit Unit after Reset
- * @adapter: board private structure
- *
- * Configure the Tx unit of the MAC after a reset.
+ * igb_setup_tctl - configure the transmit control registers
+ * @adapter: Board private structure
  **/
-static void igb_configure_tx(struct igb_adapter *adapter)
+static void igb_setup_tctl(struct igb_adapter *adapter)
 {
-	u64 tdba;
 	struct e1000_hw *hw = &adapter->hw;
 	u32 tctl;
-	u32 txdctl, txctrl;
-	int i, j;
-
-	for (i = 0; i < adapter->num_tx_queues; i++) {
-		struct igb_ring *ring = &adapter->tx_ring[i];
-		j = ring->reg_idx;
-		wr32(E1000_TDLEN(j),
-		     ring->count * sizeof(union e1000_adv_tx_desc));
-		tdba = ring->dma;
-		wr32(E1000_TDBAL(j),
-		     tdba & 0x00000000ffffffffULL);
-		wr32(E1000_TDBAH(j), tdba >> 32);
-
-		ring->head = E1000_TDH(j);
-		ring->tail = E1000_TDT(j);
-		writel(0, hw->hw_addr + ring->tail);
-		writel(0, hw->hw_addr + ring->head);
-		txdctl = rd32(E1000_TXDCTL(j));
-		txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
-		wr32(E1000_TXDCTL(j), txdctl);
-
-		/* Turn off Relaxed Ordering on head write-backs.  The
-		 * writebacks MUST be delivered in order or it will
-		 * completely screw up our bookeeping.
-		 */
-		txctrl = rd32(E1000_DCA_TXCTRL(j));
-		txctrl &= ~E1000_DCA_TXCTRL_TX_WB_RO_EN;
-		wr32(E1000_DCA_TXCTRL(j), txctrl);
-	}
 
-	/* disable queue 0 to prevent tail bump w/o re-configuration */
-	if (adapter->vfs_allocated_count)
-		wr32(E1000_TXDCTL(0), 0);
+	/* disable queue 0 which is enabled by default on 82575 and 82576 */
+	wr32(E1000_TXDCTL(0), 0);
 
 	/* Program the Transmit Control Register */
 	tctl = rd32(E1000_TCTL);
@@ -2121,9 +2091,6 @@ static void igb_configure_tx(struct igb_adapter *adapter)
 
 	igb_config_collision_dist(hw);
 
-	/* Setup Transmit Descriptor Settings for eop descriptor */
-	adapter->txd_cmd = E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS;
-
 	/* Enable transmits */
 	tctl |= E1000_TCTL_EN;
 
@@ -2131,6 +2098,64 @@ static void igb_configure_tx(struct igb_adapter *adapter)
 }
 
 /**
+ * igb_configure_tx_ring - Configure transmit ring after Reset
+ * @adapter: board private structure
+ * @ring: tx ring to configure
+ *
+ * Configure a transmit ring after a reset.
+ **/
+static void igb_configure_tx_ring(struct igb_adapter *adapter,
+                                  struct igb_ring *ring)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	u32 txdctl;
+	u64 tdba = ring->dma;
+	int reg_idx = ring->reg_idx;
+
+	/* disable the queue */
+	txdctl = rd32(E1000_TXDCTL(reg_idx));
+	wr32(E1000_TXDCTL(reg_idx),
+	                txdctl & ~E1000_TXDCTL_QUEUE_ENABLE);
+	wrfl();
+	mdelay(10);
+
+	wr32(E1000_TDLEN(reg_idx),
+	                ring->count * sizeof(union e1000_adv_tx_desc));
+	wr32(E1000_TDBAL(reg_idx),
+	                tdba & 0x00000000ffffffffULL);
+	wr32(E1000_TDBAH(reg_idx), tdba >> 32);
+
+	ring->head = E1000_TDH(reg_idx);
+	ring->tail = E1000_TDT(reg_idx);
+	writel(0, hw->hw_addr + ring->tail);
+	writel(0, hw->hw_addr + ring->head);
+
+	txdctl |= IGB_TX_PTHRESH;
+	txdctl |= IGB_TX_HTHRESH << 8;
+	txdctl |= IGB_TX_WTHRESH << 16;
+
+	txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
+	wr32(E1000_TXDCTL(reg_idx), txdctl);
+}
+
+/**
+ * igb_configure_tx - Configure transmit Unit after Reset
+ * @adapter: board private structure
+ *
+ * Configure the Tx unit of the MAC after a reset.
+ **/
+static void igb_configure_tx(struct igb_adapter *adapter)
+{
+	int i;
+
+	for (i = 0; i < adapter->num_tx_queues; i++)
+		igb_configure_tx_ring(adapter, &adapter->tx_ring[i]);
+
+	/* Setup Transmit Descriptor Settings for eop descriptor */
+	adapter->txd_cmd = E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS;
+}
+
+/**
  * igb_setup_rx_resources - allocate Rx resources (Descriptors)
  * @adapter: board private structure
  * @rx_ring:    rx descriptor ring (for a specific queue) to setup
@@ -2334,6 +2359,49 @@ static void igb_configure_vt_default_pool(struct igb_adapter *adapter)
 }
 
 /**
+ * igb_configure_rx_ring - Configure a receive ring after Reset
+ * @adapter: board private structure
+ * @ring: receive ring to be configured
+ *
+ * Configure the Rx unit of the MAC after a reset.
+ **/
+static void igb_configure_rx_ring(struct igb_adapter *adapter,
+                                  struct igb_ring *ring)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	u64 rdba = ring->dma;
+	int reg_idx = ring->reg_idx;
+	u32 rxdctl;
+
+	/* disable the queue */
+	rxdctl = rd32(E1000_RXDCTL(reg_idx));
+	wr32(E1000_RXDCTL(reg_idx),
+	                rxdctl & ~E1000_RXDCTL_QUEUE_ENABLE);
+
+	/* Set DMA base address registers */
+	wr32(E1000_RDBAL(reg_idx),
+	     rdba & 0x00000000ffffffffULL);
+	wr32(E1000_RDBAH(reg_idx), rdba >> 32);
+	wr32(E1000_RDLEN(reg_idx),
+	               ring->count * sizeof(union e1000_adv_rx_desc));
+
+	/* initialize head and tail */
+	ring->head = E1000_RDH(reg_idx);
+	ring->tail = E1000_RDT(reg_idx);
+	writel(0, hw->hw_addr + ring->head);
+	writel(0, hw->hw_addr + ring->tail);
+
+	/* enable receive descriptor fetching */
+	rxdctl = rd32(E1000_RXDCTL(reg_idx));
+	rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
+	rxdctl &= 0xFFF00000;
+	rxdctl |= IGB_RX_PTHRESH;
+	rxdctl |= IGB_RX_HTHRESH << 8;
+	rxdctl |= IGB_RX_WTHRESH << 16;
+	wr32(E1000_RXDCTL(reg_idx), rxdctl);
+}
+
+/**
  * igb_configure_rx - Configure receive Unit after Reset
  * @adapter: board private structure
  *
@@ -2341,10 +2409,8 @@ static void igb_configure_vt_default_pool(struct igb_adapter *adapter)
  **/
 static void igb_configure_rx(struct igb_adapter *adapter)
 {
-	u64 rdba;
 	struct e1000_hw *hw = &adapter->hw;
 	u32 rctl, rxcsum;
-	u32 rxdctl;
 	int i;
 
 	/* disable receives while setting up the descriptors */
@@ -2358,29 +2424,8 @@ static void igb_configure_rx(struct igb_adapter *adapter)
 
 	/* Setup the HW Rx Head and Tail Descriptor Pointers and
 	 * the Base and Length of the Rx Descriptor Ring */
-	for (i = 0; i < adapter->num_rx_queues; i++) {
-		struct igb_ring *ring = &adapter->rx_ring[i];
-		int j = ring->reg_idx;
-		rdba = ring->dma;
-		wr32(E1000_RDBAL(j),
-		     rdba & 0x00000000ffffffffULL);
-		wr32(E1000_RDBAH(j), rdba >> 32);
-		wr32(E1000_RDLEN(j),
-		     ring->count * sizeof(union e1000_adv_rx_desc));
-
-		ring->head = E1000_RDH(j);
-		ring->tail = E1000_RDT(j);
-		writel(0, hw->hw_addr + ring->tail);
-		writel(0, hw->hw_addr + ring->head);
-
-		rxdctl = rd32(E1000_RXDCTL(j));
-		rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
-		rxdctl &= 0xFFF00000;
-		rxdctl |= IGB_RX_PTHRESH;
-		rxdctl |= IGB_RX_HTHRESH << 8;
-		rxdctl |= IGB_RX_WTHRESH << 16;
-		wr32(E1000_RXDCTL(j), rxdctl);
-	}
+	for (i = 0; i < adapter->num_rx_queues; i++)
+		igb_configure_rx_ring(adapter, &adapter->rx_ring[i]);
 
 	if (adapter->num_rx_queues > 1) {
 		u32 random[10];


^ permalink raw reply related

* [net-next-2.6 PATCH 03/20] igb: increase minimum rx buffer size to 1K
From: Jeff Kirsher @ 2009-10-28  1:50 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This update increases the minimum rx buffer size to 1K.  The reason for this
change is to support SR-IOV and avoid any conflicts with the rings being able
to set their own MTU sizes.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h      |    2 --
 drivers/net/igb/igb_main.c |   25 +++----------------------
 2 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index d27dcd1..044ba02 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -96,8 +96,6 @@ struct vf_data_storage {
 
 /* Supported Rx Buffer Sizes */
 #define IGB_RXBUFFER_128   128    /* Used for packet split */
-#define IGB_RXBUFFER_256   256    /* Used for packet split */
-#define IGB_RXBUFFER_512   512
 #define IGB_RXBUFFER_1024  1024
 #define IGB_RXBUFFER_2048  2048
 #define IGB_RXBUFFER_16384 16384
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index e1d1c0c..6146f5d 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -2233,18 +2233,8 @@ static void igb_setup_rctl(struct igb_adapter *adapter)
 		rctl |= E1000_RCTL_LPE;
 
 	/* Setup buffer sizes */
-	switch (adapter->rx_buffer_len) {
-	case IGB_RXBUFFER_256:
-		rctl |= E1000_RCTL_SZ_256;
-		break;
-	case IGB_RXBUFFER_512:
-		rctl |= E1000_RCTL_SZ_512;
-		break;
-	default:
-		srrctl = ALIGN(adapter->rx_buffer_len, 1024)
-		         >> E1000_SRRCTL_BSIZEPKT_SHIFT;
-		break;
-	}
+	srrctl = ALIGN(adapter->rx_buffer_len, 1024)
+	         >> E1000_SRRCTL_BSIZEPKT_SHIFT;
 
 	/* 82575 and greater support packet-split where the protocol
 	 * header is placed in skb->data and the packet data is
@@ -3755,11 +3745,7 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)
 	 * i.e. RXBUFFER_2048 --> size-4096 slab
 	 */
 
-	if (max_frame <= IGB_RXBUFFER_256)
-		adapter->rx_buffer_len = IGB_RXBUFFER_256;
-	else if (max_frame <= IGB_RXBUFFER_512)
-		adapter->rx_buffer_len = IGB_RXBUFFER_512;
-	else if (max_frame <= IGB_RXBUFFER_1024)
+	if (max_frame <= IGB_RXBUFFER_1024)
 		adapter->rx_buffer_len = IGB_RXBUFFER_1024;
 	else if (max_frame <= IGB_RXBUFFER_2048)
 		adapter->rx_buffer_len = IGB_RXBUFFER_2048;
@@ -3770,11 +3756,6 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)
 		adapter->rx_buffer_len = PAGE_SIZE / 2;
 #endif
 
-	/* if sr-iov is enabled we need to force buffer size to 1K or larger */
-	if (adapter->vfs_allocated_count &&
-	    (adapter->rx_buffer_len < IGB_RXBUFFER_1024))
-		adapter->rx_buffer_len = IGB_RXBUFFER_1024;
-
 	/* adjust allocation if LPE protects us, and we aren't using SBP */
 	if ((max_frame == ETH_FRAME_LEN + ETH_FCS_LEN) ||
 	     (max_frame == MAXIMUM_ETHERNET_VLAN_SIZE))


^ permalink raw reply related

* [net-next-2.6 PATCH 02/20] igb: remove rx checksum good counter
From: Jeff Kirsher @ 2009-10-28  1:49 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20091028014858.12470.99520.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

Counting packets with a good checksum can cause a significant amount of cache
line bouncing due to the shared counter being written to by all of the queues.
In order to avoid this I am removing the counter since we still have the
checksum failed counter which will tell us if there are any issues.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h         |    1 -
 drivers/net/igb/igb_ethtool.c |    1 -
 drivers/net/igb/igb_main.c    |    1 -
 3 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index 86492c8..d27dcd1 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -254,7 +254,6 @@ struct igb_adapter {
 	int num_rx_queues;
 
 	u64 hw_csum_err;
-	u64 hw_csum_good;
 	u32 alloc_rx_buff_failed;
 	u32 gorc;
 	u64 gorc_old;
diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index f71276f..2929546 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -95,7 +95,6 @@ static const struct igb_stats igb_gstrings_stats[] = {
 	{ "tx_flow_control_xon", IGB_STAT(stats.xontxc) },
 	{ "tx_flow_control_xoff", IGB_STAT(stats.xofftxc) },
 	{ "rx_long_byte_count", IGB_STAT(stats.gorc) },
-	{ "rx_csum_offload_good", IGB_STAT(hw_csum_good) },
 	{ "rx_csum_offload_errors", IGB_STAT(hw_csum_err) },
 	{ "tx_dma_out_of_sync", IGB_STAT(stats.doosync) },
 	{ "alloc_rx_buff_failed", IGB_STAT(alloc_rx_buff_failed) },
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index c15eb4c..e1d1c0c 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -4827,7 +4827,6 @@ static inline void igb_rx_checksum_adv(struct igb_adapter *adapter,
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 	dev_dbg(&adapter->pdev->dev, "cksum success: bits %08X\n", status_err);
-	adapter->hw_csum_good++;
 }
 
 static inline u16 igb_get_hlen(struct igb_adapter *adapter,


^ permalink raw reply related

* [net-next-2.6 PATCH 01/20] igb: add new data structure for handling interrupts and NAPI
From: Jeff Kirsher @ 2009-10-28  1:49 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher

From: Alexander Duyck <alexander.h.duyck@intel.com>

Add a new igb_q_vector data structure to handle interrupts and NAPI.  This
helps to abstract the rings away from the adapter struct.  In addition it
allows for a bit of consolidation since a tx and rx ring can share a
q_vector.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h         |   46 +-
 drivers/net/igb/igb_ethtool.c |    8 
 drivers/net/igb/igb_main.c    |  872 ++++++++++++++++++++++++-----------------
 3 files changed, 534 insertions(+), 392 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index b805b1c..86492c8 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -55,6 +55,8 @@ struct igb_adapter;
 #define IGB_DEFAULT_ITR                    3 /* dynamic */
 #define IGB_MAX_ITR_USECS              10000
 #define IGB_MIN_ITR_USECS                 10
+#define NON_Q_VECTORS                      1
+#define MAX_Q_VECTORS                      8
 
 /* Transmit and receive queues */
 #define IGB_MAX_RX_QUEUES     (adapter->vfs_allocated_count ? \
@@ -149,25 +151,38 @@ struct igb_rx_queue_stats {
 	u64 drops;
 };
 
-struct igb_ring {
+struct igb_q_vector {
 	struct igb_adapter *adapter; /* backlink */
-	void *desc;                  /* descriptor ring memory */
-	dma_addr_t dma;              /* phys address of the ring */
-	unsigned int size;           /* length of desc. ring in bytes */
-	unsigned int count;          /* number of desc. in the ring */
+	struct igb_ring *rx_ring;
+	struct igb_ring *tx_ring;
+	struct napi_struct napi;
+
+	u32 eims_value;
+	u16 cpu;
+
+	u16 itr_val;
+	u8 set_itr;
+	u8 itr_shift;
+	void __iomem *itr_register;
+
+	char name[IFNAMSIZ + 9];
+};
+
+struct igb_ring {
+	struct igb_q_vector *q_vector; /* backlink to q_vector */
+	void *desc;                    /* descriptor ring memory */
+	dma_addr_t dma;                /* phys address of the ring */
+	unsigned int size;             /* length of desc. ring in bytes */
+	unsigned int count;            /* number of desc. in the ring */
 	u16 next_to_use;
 	u16 next_to_clean;
 	u16 head;
 	u16 tail;
 	struct igb_buffer *buffer_info; /* array of buffer info structs */
 
-	u32 eims_value;
-	u32 itr_val;
-	u16 itr_register;
-	u16 cpu;
+	u8 queue_index;
+	u8 reg_idx;
 
-	u16 queue_index;
-	u16 reg_idx;
 	unsigned int total_bytes;
 	unsigned int total_packets;
 
@@ -181,13 +196,8 @@ struct igb_ring {
 		struct {
 			struct igb_rx_queue_stats rx_stats;
 			u64 rx_queue_drops;
-			struct napi_struct napi;
-			int set_itr;
-			struct igb_ring *buddy;
 		};
 	};
-
-	char name[IFNAMSIZ + 5];
 };
 
 #define E1000_RX_DESC_ADV(R, i)	    \
@@ -254,7 +264,6 @@ struct igb_adapter {
 
 	/* OS defined structs */
 	struct net_device *netdev;
-	struct napi_struct napi;
 	struct pci_dev *pdev;
 	struct cyclecounter cycles;
 	struct timecounter clock;
@@ -272,6 +281,9 @@ struct igb_adapter {
 	struct igb_ring test_rx_ring;
 
 	int msg_enable;
+
+	unsigned int num_q_vectors;
+	struct igb_q_vector *q_vector[MAX_Q_VECTORS];
 	struct msix_entry *msix_entries;
 	u32 eims_enable_mask;
 	u32 eims_other;
diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index dafb25b..f71276f 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -1907,7 +1907,6 @@ static int igb_set_coalesce(struct net_device *netdev,
 			    struct ethtool_coalesce *ec)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
-	struct e1000_hw *hw = &adapter->hw;
 	int i;
 
 	if ((ec->rx_coalesce_usecs > IGB_MAX_ITR_USECS) ||
@@ -1925,8 +1924,11 @@ static int igb_set_coalesce(struct net_device *netdev,
 		adapter->itr = adapter->itr_setting;
 	}
 
-	for (i = 0; i < adapter->num_rx_queues; i++)
-		wr32(adapter->rx_ring[i].itr_register, adapter->itr);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		struct igb_q_vector *q_vector = adapter->q_vector[i];
+		q_vector->itr_val = adapter->itr;
+		q_vector->set_itr = 1;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 2ffe099..c15eb4c 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -111,16 +111,14 @@ static void igb_set_uta(struct igb_adapter *adapter);
 static irqreturn_t igb_intr(int irq, void *);
 static irqreturn_t igb_intr_msi(int irq, void *);
 static irqreturn_t igb_msix_other(int irq, void *);
-static irqreturn_t igb_msix_rx(int irq, void *);
-static irqreturn_t igb_msix_tx(int irq, void *);
+static irqreturn_t igb_msix_ring(int irq, void *);
 #ifdef CONFIG_IGB_DCA
-static void igb_update_rx_dca(struct igb_ring *);
-static void igb_update_tx_dca(struct igb_ring *);
+static void igb_update_dca(struct igb_q_vector *);
 static void igb_setup_dca(struct igb_adapter *);
 #endif /* CONFIG_IGB_DCA */
-static bool igb_clean_tx_irq(struct igb_ring *);
+static bool igb_clean_tx_irq(struct igb_q_vector *);
 static int igb_poll(struct napi_struct *, int);
-static bool igb_clean_rx_irq_adv(struct igb_ring *, int *, int);
+static bool igb_clean_rx_irq_adv(struct igb_q_vector *, int *, int);
 static void igb_alloc_rx_buffers_adv(struct igb_ring *, int);
 static int igb_ioctl(struct net_device *, struct ifreq *, int cmd);
 static void igb_tx_timeout(struct net_device *);
@@ -374,7 +372,7 @@ module_exit(igb_exit_module);
 static void igb_cache_ring_register(struct igb_adapter *adapter)
 {
 	int i;
-	unsigned int rbase_offset = adapter->vfs_allocated_count;
+	u32 rbase_offset = adapter->vfs_allocated_count;
 
 	switch (adapter->hw.mac.type) {
 	case e1000_82576:
@@ -400,6 +398,18 @@ static void igb_cache_ring_register(struct igb_adapter *adapter)
 	}
 }
 
+static void igb_free_queues(struct igb_adapter *adapter)
+{
+	kfree(adapter->tx_ring);
+	kfree(adapter->rx_ring);
+
+	adapter->tx_ring = NULL;
+	adapter->rx_ring = NULL;
+
+	adapter->num_rx_queues = 0;
+	adapter->num_tx_queues = 0;
+}
+
 /**
  * igb_alloc_queues - Allocate memory for all rings
  * @adapter: board private structure to initialize
@@ -414,59 +424,48 @@ static int igb_alloc_queues(struct igb_adapter *adapter)
 	adapter->tx_ring = kcalloc(adapter->num_tx_queues,
 				   sizeof(struct igb_ring), GFP_KERNEL);
 	if (!adapter->tx_ring)
-		return -ENOMEM;
+		goto err;
 
 	adapter->rx_ring = kcalloc(adapter->num_rx_queues,
 				   sizeof(struct igb_ring), GFP_KERNEL);
-	if (!adapter->rx_ring) {
-		kfree(adapter->tx_ring);
-		return -ENOMEM;
-	}
-
-	adapter->rx_ring->buddy = adapter->tx_ring;
+	if (!adapter->rx_ring)
+		goto err;
 
 	for (i = 0; i < adapter->num_tx_queues; i++) {
 		struct igb_ring *ring = &(adapter->tx_ring[i]);
 		ring->count = adapter->tx_ring_count;
-		ring->adapter = adapter;
 		ring->queue_index = i;
 	}
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		struct igb_ring *ring = &(adapter->rx_ring[i]);
 		ring->count = adapter->rx_ring_count;
-		ring->adapter = adapter;
 		ring->queue_index = i;
-		ring->itr_register = E1000_ITR;
-
-		/* set a default napi handler for each rx_ring */
-		netif_napi_add(adapter->netdev, &ring->napi, igb_poll, 64);
 	}
 
 	igb_cache_ring_register(adapter);
-	return 0;
-}
 
-static void igb_free_queues(struct igb_adapter *adapter)
-{
-	int i;
-
-	for (i = 0; i < adapter->num_rx_queues; i++)
-		netif_napi_del(&adapter->rx_ring[i].napi);
+	return 0;
 
-	adapter->num_rx_queues = 0;
-	adapter->num_tx_queues = 0;
+err:
+	igb_free_queues(adapter);
 
-	kfree(adapter->tx_ring);
-	kfree(adapter->rx_ring);
+	return -ENOMEM;
 }
 
 #define IGB_N0_QUEUE -1
-static void igb_assign_vector(struct igb_adapter *adapter, int rx_queue,
-			      int tx_queue, int msix_vector)
+static void igb_assign_vector(struct igb_q_vector *q_vector, int msix_vector)
 {
 	u32 msixbm = 0;
+	struct igb_adapter *adapter = q_vector->adapter;
 	struct e1000_hw *hw = &adapter->hw;
 	u32 ivar, index;
+	int rx_queue = IGB_N0_QUEUE;
+	int tx_queue = IGB_N0_QUEUE;
+
+	if (q_vector->rx_ring)
+		rx_queue = q_vector->rx_ring->reg_idx;
+	if (q_vector->tx_ring)
+		tx_queue = q_vector->tx_ring->reg_idx;
 
 	switch (hw->mac.type) {
 	case e1000_82575:
@@ -474,16 +473,12 @@ static void igb_assign_vector(struct igb_adapter *adapter, int rx_queue,
 		   bitmask for the EICR/EIMS/EIMC registers.  To assign one
 		   or more queues to a vector, we write the appropriate bits
 		   into the MSIXBM register for that vector. */
-		if (rx_queue > IGB_N0_QUEUE) {
+		if (rx_queue > IGB_N0_QUEUE)
 			msixbm = E1000_EICR_RX_QUEUE0 << rx_queue;
-			adapter->rx_ring[rx_queue].eims_value = msixbm;
-		}
-		if (tx_queue > IGB_N0_QUEUE) {
+		if (tx_queue > IGB_N0_QUEUE)
 			msixbm |= E1000_EICR_TX_QUEUE0 << tx_queue;
-			adapter->tx_ring[tx_queue].eims_value =
-				  E1000_EICR_TX_QUEUE0 << tx_queue;
-		}
 		array_wr32(E1000_MSIXBM(0), msix_vector, msixbm);
+		q_vector->eims_value = msixbm;
 		break;
 	case e1000_82576:
 		/* 82576 uses a table-based method for assigning vectors.
@@ -491,35 +486,34 @@ static void igb_assign_vector(struct igb_adapter *adapter, int rx_queue,
 		   a vector number along with a "valid" bit.  Sadly, the layout
 		   of the table is somewhat counterintuitive. */
 		if (rx_queue > IGB_N0_QUEUE) {
-			index = (rx_queue >> 1) + adapter->vfs_allocated_count;
+			index = (rx_queue & 0x7);
 			ivar = array_rd32(E1000_IVAR0, index);
-			if (rx_queue & 0x1) {
-				/* vector goes into third byte of register */
-				ivar = ivar & 0xFF00FFFF;
-				ivar |= (msix_vector | E1000_IVAR_VALID) << 16;
-			} else {
+			if (rx_queue < 8) {
 				/* vector goes into low byte of register */
 				ivar = ivar & 0xFFFFFF00;
 				ivar |= msix_vector | E1000_IVAR_VALID;
+			} else {
+				/* vector goes into third byte of register */
+				ivar = ivar & 0xFF00FFFF;
+				ivar |= (msix_vector | E1000_IVAR_VALID) << 16;
 			}
-			adapter->rx_ring[rx_queue].eims_value= 1 << msix_vector;
 			array_wr32(E1000_IVAR0, index, ivar);
 		}
 		if (tx_queue > IGB_N0_QUEUE) {
-			index = (tx_queue >> 1) + adapter->vfs_allocated_count;
+			index = (tx_queue & 0x7);
 			ivar = array_rd32(E1000_IVAR0, index);
-			if (tx_queue & 0x1) {
-				/* vector goes into high byte of register */
-				ivar = ivar & 0x00FFFFFF;
-				ivar |= (msix_vector | E1000_IVAR_VALID) << 24;
-			} else {
+			if (tx_queue < 8) {
 				/* vector goes into second byte of register */
 				ivar = ivar & 0xFFFF00FF;
 				ivar |= (msix_vector | E1000_IVAR_VALID) << 8;
+			} else {
+				/* vector goes into high byte of register */
+				ivar = ivar & 0x00FFFFFF;
+				ivar |= (msix_vector | E1000_IVAR_VALID) << 24;
 			}
-			adapter->tx_ring[tx_queue].eims_value= 1 << msix_vector;
 			array_wr32(E1000_IVAR0, index, ivar);
 		}
+		q_vector->eims_value = 1 << msix_vector;
 		break;
 	default:
 		BUG();
@@ -540,43 +534,10 @@ static void igb_configure_msix(struct igb_adapter *adapter)
 	struct e1000_hw *hw = &adapter->hw;
 
 	adapter->eims_enable_mask = 0;
-	if (hw->mac.type == e1000_82576)
-		/* Turn on MSI-X capability first, or our settings
-		 * won't stick.  And it will take days to debug. */
-		wr32(E1000_GPIE, E1000_GPIE_MSIX_MODE |
-				   E1000_GPIE_PBA | E1000_GPIE_EIAME |
- 				   E1000_GPIE_NSICR);
-
-	for (i = 0; i < adapter->num_tx_queues; i++) {
-		struct igb_ring *tx_ring = &adapter->tx_ring[i];
-		igb_assign_vector(adapter, IGB_N0_QUEUE, i, vector++);
-		adapter->eims_enable_mask |= tx_ring->eims_value;
-		if (tx_ring->itr_val)
-			writel(tx_ring->itr_val,
-			       hw->hw_addr + tx_ring->itr_register);
-		else
-			writel(1, hw->hw_addr + tx_ring->itr_register);
-	}
-
-	for (i = 0; i < adapter->num_rx_queues; i++) {
-		struct igb_ring *rx_ring = &adapter->rx_ring[i];
-		rx_ring->buddy = NULL;
-		igb_assign_vector(adapter, i, IGB_N0_QUEUE, vector++);
-		adapter->eims_enable_mask |= rx_ring->eims_value;
-		if (rx_ring->itr_val)
-			writel(rx_ring->itr_val,
-			       hw->hw_addr + rx_ring->itr_register);
-		else
-			writel(1, hw->hw_addr + rx_ring->itr_register);
-	}
-
 
 	/* set vector for other causes, i.e. link changes */
 	switch (hw->mac.type) {
 	case e1000_82575:
-		array_wr32(E1000_MSIXBM(0), vector++,
-				      E1000_EIMS_OTHER);
-
 		tmp = rd32(E1000_CTRL_EXT);
 		/* enable MSI-X PBA support*/
 		tmp |= E1000_CTRL_EXT_PBA_CLR;
@@ -586,22 +547,40 @@ static void igb_configure_msix(struct igb_adapter *adapter)
 		tmp |= E1000_CTRL_EXT_IRCA;
 
 		wr32(E1000_CTRL_EXT, tmp);
-		adapter->eims_enable_mask |= E1000_EIMS_OTHER;
+
+		/* enable msix_other interrupt */
+		array_wr32(E1000_MSIXBM(0), vector++,
+		                      E1000_EIMS_OTHER);
 		adapter->eims_other = E1000_EIMS_OTHER;
 
 		break;
 
 	case e1000_82576:
+		/* Turn on MSI-X capability first, or our settings
+		 * won't stick.  And it will take days to debug. */
+		wr32(E1000_GPIE, E1000_GPIE_MSIX_MODE |
+		                E1000_GPIE_PBA | E1000_GPIE_EIAME |
+		                E1000_GPIE_NSICR);
+
+		/* enable msix_other interrupt */
+		adapter->eims_other = 1 << vector;
 		tmp = (vector++ | E1000_IVAR_VALID) << 8;
-		wr32(E1000_IVAR_MISC, tmp);
 
-		adapter->eims_enable_mask = (1 << (vector)) - 1;
-		adapter->eims_other = 1 << (vector - 1);
+		wr32(E1000_IVAR_MISC, tmp);
 		break;
 	default:
 		/* do nothing, since nothing else supports MSI-X */
 		break;
 	} /* switch (hw->mac.type) */
+
+	adapter->eims_enable_mask |= adapter->eims_other;
+
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		struct igb_q_vector *q_vector = adapter->q_vector[i];
+		igb_assign_vector(q_vector, vector++);
+		adapter->eims_enable_mask |= q_vector->eims_value;
+	}
+
 	wrfl();
 }
 
@@ -614,43 +593,40 @@ static void igb_configure_msix(struct igb_adapter *adapter)
 static int igb_request_msix(struct igb_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
+	struct e1000_hw *hw = &adapter->hw;
 	int i, err = 0, vector = 0;
 
-	vector = 0;
-
-	for (i = 0; i < adapter->num_tx_queues; i++) {
-		struct igb_ring *ring = &(adapter->tx_ring[i]);
-		sprintf(ring->name, "%s-tx-%d", netdev->name, i);
-		err = request_irq(adapter->msix_entries[vector].vector,
-				  &igb_msix_tx, 0, ring->name,
-				  &(adapter->tx_ring[i]));
-		if (err)
-			goto out;
-		ring->itr_register = E1000_EITR(0) + (vector << 2);
-		ring->itr_val = 976; /* ~4000 ints/sec */
-		vector++;
-	}
-	for (i = 0; i < adapter->num_rx_queues; i++) {
-		struct igb_ring *ring = &(adapter->rx_ring[i]);
-		if (strlen(netdev->name) < (IFNAMSIZ - 5))
-			sprintf(ring->name, "%s-rx-%d", netdev->name, i);
+	err = request_irq(adapter->msix_entries[vector].vector,
+	                  &igb_msix_other, 0, netdev->name, adapter);
+	if (err)
+		goto out;
+	vector++;
+
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		struct igb_q_vector *q_vector = adapter->q_vector[i];
+
+		q_vector->itr_register = hw->hw_addr + E1000_EITR(vector);
+
+		if (q_vector->rx_ring && q_vector->tx_ring)
+			sprintf(q_vector->name, "%s-TxRx-%u", netdev->name,
+			        q_vector->rx_ring->queue_index);
+		else if (q_vector->tx_ring)
+			sprintf(q_vector->name, "%s-tx-%u", netdev->name,
+			        q_vector->tx_ring->queue_index);
+		else if (q_vector->rx_ring)
+			sprintf(q_vector->name, "%s-rx-%u", netdev->name,
+			        q_vector->rx_ring->queue_index);
 		else
-			memcpy(ring->name, netdev->name, IFNAMSIZ);
+			sprintf(q_vector->name, "%s-unused", netdev->name);
+
 		err = request_irq(adapter->msix_entries[vector].vector,
-				  &igb_msix_rx, 0, ring->name,
-				  &(adapter->rx_ring[i]));
+		                  &igb_msix_ring, 0, q_vector->name,
+		                  q_vector);
 		if (err)
 			goto out;
-		ring->itr_register = E1000_EITR(0) + (vector << 2);
-		ring->itr_val = adapter->itr;
 		vector++;
 	}
 
-	err = request_irq(adapter->msix_entries[vector].vector,
-			  &igb_msix_other, 0, netdev->name, netdev);
-	if (err)
-		goto out;
-
 	igb_configure_msix(adapter);
 	return 0;
 out:
@@ -663,11 +639,44 @@ static void igb_reset_interrupt_capability(struct igb_adapter *adapter)
 		pci_disable_msix(adapter->pdev);
 		kfree(adapter->msix_entries);
 		adapter->msix_entries = NULL;
-	} else if (adapter->flags & IGB_FLAG_HAS_MSI)
+	} else if (adapter->flags & IGB_FLAG_HAS_MSI) {
 		pci_disable_msi(adapter->pdev);
-	return;
+	}
 }
 
+/**
+ * igb_free_q_vectors - Free memory allocated for interrupt vectors
+ * @adapter: board private structure to initialize
+ *
+ * This function frees the memory allocated to the q_vectors.  In addition if
+ * NAPI is enabled it will delete any references to the NAPI struct prior
+ * to freeing the q_vector.
+ **/
+static void igb_free_q_vectors(struct igb_adapter *adapter)
+{
+	int v_idx;
+
+	for (v_idx = 0; v_idx < adapter->num_q_vectors; v_idx++) {
+		struct igb_q_vector *q_vector = adapter->q_vector[v_idx];
+		adapter->q_vector[v_idx] = NULL;
+		netif_napi_del(&q_vector->napi);
+		kfree(q_vector);
+	}
+	adapter->num_q_vectors = 0;
+}
+
+/**
+ * igb_clear_interrupt_scheme - reset the device to a state of no interrupts
+ *
+ * This function resets the device so that it has 0 rx queues, tx queues, and
+ * MSI-X interrupts allocated.
+ */
+static void igb_clear_interrupt_scheme(struct igb_adapter *adapter)
+{
+	igb_free_queues(adapter);
+	igb_free_q_vectors(adapter);
+	igb_reset_interrupt_capability(adapter);
+}
 
 /**
  * igb_set_interrupt_capability - set MSI or MSI-X if supported
@@ -681,11 +690,20 @@ static void igb_set_interrupt_capability(struct igb_adapter *adapter)
 	int numvecs, i;
 
 	/* Number of supported queues. */
-	/* Having more queues than CPUs doesn't make sense. */
 	adapter->num_rx_queues = min_t(u32, IGB_MAX_RX_QUEUES, num_online_cpus());
 	adapter->num_tx_queues = min_t(u32, IGB_MAX_TX_QUEUES, num_online_cpus());
 
-	numvecs = adapter->num_tx_queues + adapter->num_rx_queues + 1;
+	/* start with one vector for every rx queue */
+	numvecs = adapter->num_rx_queues;
+
+	/* if tx handler is seperate add 1 for every tx queue */
+	numvecs += adapter->num_tx_queues;
+
+	/* store the number of vectors reserved for queues */
+	adapter->num_q_vectors = numvecs;
+
+	/* add 1 vector for link status interrupts */
+	numvecs++;
 	adapter->msix_entries = kcalloc(numvecs, sizeof(struct msix_entry),
 					GFP_KERNEL);
 	if (!adapter->msix_entries)
@@ -721,6 +739,7 @@ msi_only:
 #endif
 	adapter->num_rx_queues = 1;
 	adapter->num_tx_queues = 1;
+	adapter->num_q_vectors = 1;
 	if (!pci_enable_msi(adapter->pdev))
 		adapter->flags |= IGB_FLAG_HAS_MSI;
 out:
@@ -730,6 +749,139 @@ out:
 }
 
 /**
+ * igb_alloc_q_vectors - Allocate memory for interrupt vectors
+ * @adapter: board private structure to initialize
+ *
+ * We allocate one q_vector per queue interrupt.  If allocation fails we
+ * return -ENOMEM.
+ **/
+static int igb_alloc_q_vectors(struct igb_adapter *adapter)
+{
+	struct igb_q_vector *q_vector;
+	struct e1000_hw *hw = &adapter->hw;
+	int v_idx;
+
+	for (v_idx = 0; v_idx < adapter->num_q_vectors; v_idx++) {
+		q_vector = kzalloc(sizeof(struct igb_q_vector), GFP_KERNEL);
+		if (!q_vector)
+			goto err_out;
+		q_vector->adapter = adapter;
+		q_vector->itr_shift = (hw->mac.type == e1000_82575) ? 16 : 0;
+		q_vector->itr_register = hw->hw_addr + E1000_EITR(0);
+		q_vector->itr_val = IGB_START_ITR;
+		q_vector->set_itr = 1;
+		netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll, 64);
+		adapter->q_vector[v_idx] = q_vector;
+	}
+	return 0;
+
+err_out:
+	while (v_idx) {
+		v_idx--;
+		q_vector = adapter->q_vector[v_idx];
+		netif_napi_del(&q_vector->napi);
+		kfree(q_vector);
+		adapter->q_vector[v_idx] = NULL;
+	}
+	return -ENOMEM;
+}
+
+static void igb_map_rx_ring_to_vector(struct igb_adapter *adapter,
+                                      int ring_idx, int v_idx)
+{
+	struct igb_q_vector *q_vector;
+
+	q_vector = adapter->q_vector[v_idx];
+	q_vector->rx_ring = &adapter->rx_ring[ring_idx];
+	q_vector->rx_ring->q_vector = q_vector;
+	q_vector->itr_val = adapter->itr;
+}
+
+static void igb_map_tx_ring_to_vector(struct igb_adapter *adapter,
+                                      int ring_idx, int v_idx)
+{
+	struct igb_q_vector *q_vector;
+
+	q_vector = adapter->q_vector[v_idx];
+	q_vector->tx_ring = &adapter->tx_ring[ring_idx];
+	q_vector->tx_ring->q_vector = q_vector;
+	q_vector->itr_val = adapter->itr;
+}
+
+/**
+ * igb_map_ring_to_vector - maps allocated queues to vectors
+ *
+ * This function maps the recently allocated queues to vectors.
+ **/
+static int igb_map_ring_to_vector(struct igb_adapter *adapter)
+{
+	int i;
+	int v_idx = 0;
+
+	if ((adapter->num_q_vectors < adapter->num_rx_queues) ||
+	    (adapter->num_q_vectors < adapter->num_tx_queues))
+		return -ENOMEM;
+
+	if (adapter->num_q_vectors >=
+	    (adapter->num_rx_queues + adapter->num_tx_queues)) {
+		for (i = 0; i < adapter->num_rx_queues; i++)
+			igb_map_rx_ring_to_vector(adapter, i, v_idx++);
+		for (i = 0; i < adapter->num_tx_queues; i++)
+			igb_map_tx_ring_to_vector(adapter, i, v_idx++);
+	} else {
+		for (i = 0; i < adapter->num_rx_queues; i++) {
+			if (i < adapter->num_tx_queues)
+				igb_map_tx_ring_to_vector(adapter, i, v_idx);
+			igb_map_rx_ring_to_vector(adapter, i, v_idx++);
+		}
+		for (; i < adapter->num_tx_queues; i++)
+			igb_map_tx_ring_to_vector(adapter, i, v_idx++);
+	}
+	return 0;
+}
+
+/**
+ * igb_init_interrupt_scheme - initialize interrupts, allocate queues/vectors
+ *
+ * This function initializes the interrupts and allocates all of the queues.
+ **/
+static int igb_init_interrupt_scheme(struct igb_adapter *adapter)
+{
+	struct pci_dev *pdev = adapter->pdev;
+	int err;
+
+	igb_set_interrupt_capability(adapter);
+
+	err = igb_alloc_q_vectors(adapter);
+	if (err) {
+		dev_err(&pdev->dev, "Unable to allocate memory for vectors\n");
+		goto err_alloc_q_vectors;
+	}
+
+	err = igb_alloc_queues(adapter);
+	if (err) {
+		dev_err(&pdev->dev, "Unable to allocate memory for queues\n");
+		goto err_alloc_queues;
+	}
+
+	err = igb_map_ring_to_vector(adapter);
+	if (err) {
+		dev_err(&pdev->dev, "Invalid q_vector to ring mapping\n");
+		goto err_map_queues;
+	}
+
+
+	return 0;
+err_map_queues:
+	igb_free_queues(adapter);
+err_alloc_queues:
+	igb_free_q_vectors(adapter);
+err_alloc_q_vectors:
+	igb_reset_interrupt_capability(adapter);
+	return err;
+}
+
+/**
  * igb_request_irq - initialize interrupts
  *
  * Attempts to configure interrupts using the best available
@@ -738,6 +890,7 @@ out:
 static int igb_request_irq(struct igb_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
+	struct pci_dev *pdev = adapter->pdev;
 	struct e1000_hw *hw = &adapter->hw;
 	int err = 0;
 
@@ -746,18 +899,36 @@ static int igb_request_irq(struct igb_adapter *adapter)
 		if (!err)
 			goto request_done;
 		/* fall back to MSI */
-		igb_reset_interrupt_capability(adapter);
+		igb_clear_interrupt_scheme(adapter);
 		if (!pci_enable_msi(adapter->pdev))
 			adapter->flags |= IGB_FLAG_HAS_MSI;
 		igb_free_all_tx_resources(adapter);
 		igb_free_all_rx_resources(adapter);
+		adapter->num_tx_queues = 1;
 		adapter->num_rx_queues = 1;
-		igb_alloc_queues(adapter);
+		adapter->num_q_vectors = 1;
+		err = igb_alloc_q_vectors(adapter);
+		if (err) {
+			dev_err(&pdev->dev,
+			        "Unable to allocate memory for vectors\n");
+			goto request_done;
+		}
+		err = igb_alloc_queues(adapter);
+		if (err) {
+			dev_err(&pdev->dev,
+			        "Unable to allocate memory for queues\n");
+			igb_free_q_vectors(adapter);
+			goto request_done;
+		}
+		igb_setup_all_tx_resources(adapter);
+		igb_setup_all_rx_resources(adapter);
 	} else {
 		switch (hw->mac.type) {
 		case e1000_82575:
 			wr32(E1000_MSIXBM(0),
-			     (E1000_EICR_RX_QUEUE0 | E1000_EIMS_OTHER));
+			     (E1000_EICR_RX_QUEUE0 |
+			      E1000_EICR_TX_QUEUE0 |
+			      E1000_EIMS_OTHER));
 			break;
 		case e1000_82576:
 			wr32(E1000_IVAR0, E1000_IVAR_VALID);
@@ -769,16 +940,17 @@ static int igb_request_irq(struct igb_adapter *adapter)
 
 	if (adapter->flags & IGB_FLAG_HAS_MSI) {
 		err = request_irq(adapter->pdev->irq, &igb_intr_msi, 0,
-				  netdev->name, netdev);
+				  netdev->name, adapter);
 		if (!err)
 			goto request_done;
+
 		/* fall back to legacy interrupts */
 		igb_reset_interrupt_capability(adapter);
 		adapter->flags &= ~IGB_FLAG_HAS_MSI;
 	}
 
 	err = request_irq(adapter->pdev->irq, &igb_intr, IRQF_SHARED,
-			  netdev->name, netdev);
+			  netdev->name, adapter);
 
 	if (err)
 		dev_err(&adapter->pdev->dev, "Error %d getting interrupt\n",
@@ -790,23 +962,19 @@ request_done:
 
 static void igb_free_irq(struct igb_adapter *adapter)
 {
-	struct net_device *netdev = adapter->netdev;
-
 	if (adapter->msix_entries) {
 		int vector = 0, i;
 
-		for (i = 0; i < adapter->num_tx_queues; i++)
-			free_irq(adapter->msix_entries[vector++].vector,
-				&(adapter->tx_ring[i]));
-		for (i = 0; i < adapter->num_rx_queues; i++)
-			free_irq(adapter->msix_entries[vector++].vector,
-				&(adapter->rx_ring[i]));
+		free_irq(adapter->msix_entries[vector++].vector, adapter);
 
-		free_irq(adapter->msix_entries[vector++].vector, netdev);
-		return;
+		for (i = 0; i < adapter->num_q_vectors; i++) {
+			struct igb_q_vector *q_vector = adapter->q_vector[i];
+			free_irq(adapter->msix_entries[vector++].vector,
+			         q_vector);
+		}
+	} else {
+		free_irq(adapter->pdev->irq, adapter);
 	}
-
-	free_irq(adapter->pdev->irq, netdev);
 }
 
 /**
@@ -967,8 +1135,10 @@ int igb_up(struct igb_adapter *adapter)
 
 	clear_bit(__IGB_DOWN, &adapter->state);
 
-	for (i = 0; i < adapter->num_rx_queues; i++)
-		napi_enable(&adapter->rx_ring[i].napi);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		struct igb_q_vector *q_vector = adapter->q_vector[i];
+		napi_enable(&q_vector->napi);
+	}
 	if (adapter->msix_entries)
 		igb_configure_msix(adapter);
 
@@ -1012,8 +1182,10 @@ void igb_down(struct igb_adapter *adapter)
 	wrfl();
 	msleep(10);
 
-	for (i = 0; i < adapter->num_rx_queues; i++)
-		napi_disable(&adapter->rx_ring[i].napi);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		struct igb_q_vector *q_vector = adapter->q_vector[i];
+		napi_disable(&q_vector->napi);
+	}
 
 	igb_irq_disable(adapter);
 
@@ -1584,9 +1756,8 @@ err_eeprom:
 
 	if (hw->flash_address)
 		iounmap(hw->flash_address);
-
-	igb_free_queues(adapter);
 err_sw_init:
+	igb_clear_interrupt_scheme(adapter);
 	iounmap(hw->hw_addr);
 err_ioremap:
 	free_netdev(netdev);
@@ -1640,9 +1811,7 @@ static void __devexit igb_remove(struct pci_dev *pdev)
 	if (!igb_check_reset_block(&adapter->hw))
 		igb_reset_phy(&adapter->hw);
 
-	igb_reset_interrupt_capability(adapter);
-
-	igb_free_queues(adapter);
+	igb_clear_interrupt_scheme(adapter);
 
 #ifdef CONFIG_PCI_IOV
 	/* reclaim resources allocated to VFs */
@@ -1696,9 +1865,7 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)
 
 	/* This call may decrease the number of queues depending on
 	 * interrupt mode. */
-	igb_set_interrupt_capability(adapter);
-
-	if (igb_alloc_queues(adapter)) {
+	if (igb_init_interrupt_scheme(adapter)) {
 		dev_err(&pdev->dev, "Unable to allocate memory for queues\n");
 		return -ENOMEM;
 	}
@@ -1768,8 +1935,10 @@ static int igb_open(struct net_device *netdev)
 	/* From here on the code is the same as igb_up() */
 	clear_bit(__IGB_DOWN, &adapter->state);
 
-	for (i = 0; i < adapter->num_rx_queues; i++)
-		napi_enable(&adapter->rx_ring[i].napi);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		struct igb_q_vector *q_vector = adapter->q_vector[i];
+		napi_enable(&q_vector->napi);
+	}
 
 	/* Clear any pending interrupts. */
 	rd32(E1000_ICR);
@@ -1858,14 +2027,13 @@ int igb_setup_tx_resources(struct igb_adapter *adapter,
 	if (!tx_ring->desc)
 		goto err;
 
-	tx_ring->adapter = adapter;
 	tx_ring->next_to_use = 0;
 	tx_ring->next_to_clean = 0;
 	return 0;
 
 err:
 	vfree(tx_ring->buffer_info);
-	dev_err(&adapter->pdev->dev,
+	dev_err(&pdev->dev,
 		"Unable to allocate memory for the transmit descriptor ring\n");
 	return -ENOMEM;
 }
@@ -1996,8 +2164,6 @@ int igb_setup_rx_resources(struct igb_adapter *adapter,
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 
-	rx_ring->adapter = adapter;
-
 	return 0;
 
 err:
@@ -2308,7 +2474,7 @@ static void igb_configure_rx(struct igb_adapter *adapter)
  **/
 void igb_free_tx_resources(struct igb_ring *tx_ring)
 {
-	struct pci_dev *pdev = tx_ring->adapter->pdev;
+	struct pci_dev *pdev = tx_ring->q_vector->adapter->pdev;
 
 	igb_clean_tx_ring(tx_ring);
 
@@ -2354,7 +2520,7 @@ static void igb_unmap_and_free_tx_resource(struct igb_adapter *adapter,
  **/
 static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 {
-	struct igb_adapter *adapter = tx_ring->adapter;
+	struct igb_adapter *adapter = tx_ring->q_vector->adapter;
 	struct igb_buffer *buffer_info;
 	unsigned long size;
 	unsigned int i;
@@ -2402,7 +2568,7 @@ static void igb_clean_all_tx_rings(struct igb_adapter *adapter)
  **/
 void igb_free_rx_resources(struct igb_ring *rx_ring)
 {
-	struct pci_dev *pdev = rx_ring->adapter->pdev;
+	struct pci_dev *pdev = rx_ring->q_vector->adapter->pdev;
 
 	igb_clean_rx_ring(rx_ring);
 
@@ -2434,7 +2600,7 @@ static void igb_free_all_rx_resources(struct igb_adapter *adapter)
  **/
 static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 {
-	struct igb_adapter *adapter = rx_ring->adapter;
+	struct igb_adapter *adapter = rx_ring->q_vector->adapter;
 	struct igb_buffer *buffer_info;
 	struct pci_dev *pdev = adapter->pdev;
 	unsigned long size;
@@ -2749,7 +2915,6 @@ static void igb_watchdog_task(struct work_struct *work)
 	struct net_device *netdev = adapter->netdev;
 	struct igb_ring *tx_ring = adapter->tx_ring;
 	u32 link;
-	u32 eics = 0;
 	int i;
 
 	link = igb_has_link(adapter);
@@ -2848,8 +3013,11 @@ link_up:
 
 	/* Cause software interrupt to ensure rx ring is cleaned */
 	if (adapter->msix_entries) {
-		for (i = 0; i < adapter->num_rx_queues; i++)
-			eics |= adapter->rx_ring[i].eims_value;
+		u32 eics = 0;
+		for (i = 0; i < adapter->num_q_vectors; i++) {
+			struct igb_q_vector *q_vector = adapter->q_vector[i];
+			eics |= q_vector->eims_value;
+		}
 		wr32(E1000_EICS, eics);
 	} else {
 		wr32(E1000_ICS, E1000_ICS_RXDMT0);
@@ -2886,25 +3054,37 @@ enum latency_range {
  *      parameter (see igb_param.c)
  *      NOTE:  This function is called only when operating in a multiqueue
  *             receive environment.
- * @rx_ring: pointer to ring
+ * @q_vector: pointer to q_vector
  **/
-static void igb_update_ring_itr(struct igb_ring *rx_ring)
+static void igb_update_ring_itr(struct igb_q_vector *q_vector)
 {
-	int new_val = rx_ring->itr_val;
+	int new_val = q_vector->itr_val;
 	int avg_wire_size = 0;
-	struct igb_adapter *adapter = rx_ring->adapter;
-
-	if (!rx_ring->total_packets)
-		goto clear_counts; /* no packets, so don't do anything */
+	struct igb_adapter *adapter = q_vector->adapter;
 
 	/* For non-gigabit speeds, just fix the interrupt rate at 4000
 	 * ints/sec - ITR timer value of 120 ticks.
 	 */
 	if (adapter->link_speed != SPEED_1000) {
-		new_val = 120;
+		new_val = 976;
 		goto set_itr_val;
 	}
-	avg_wire_size = rx_ring->total_bytes / rx_ring->total_packets;
+
+	if (q_vector->rx_ring && q_vector->rx_ring->total_packets) {
+		struct igb_ring *ring = q_vector->rx_ring;
+		avg_wire_size = ring->total_bytes / ring->total_packets;
+	}
+
+	if (q_vector->tx_ring && q_vector->tx_ring->total_packets) {
+		struct igb_ring *ring = q_vector->tx_ring;
+		avg_wire_size = max_t(u32, avg_wire_size,
+		                      (ring->total_bytes /
+		                       ring->total_packets));
+	}
+
+	/* if avg_wire_size isn't set no work was done */
+	if (!avg_wire_size)
+		goto clear_counts;
 
 	/* Add 24 bytes to size to account for CRC, preamble, and gap */
 	avg_wire_size += 24;
@@ -2919,13 +3099,19 @@ static void igb_update_ring_itr(struct igb_ring *rx_ring)
 		new_val = avg_wire_size / 2;
 
 set_itr_val:
-	if (new_val != rx_ring->itr_val) {
-		rx_ring->itr_val = new_val;
-		rx_ring->set_itr = 1;
+	if (new_val != q_vector->itr_val) {
+		q_vector->itr_val = new_val;
+		q_vector->set_itr = 1;
 	}
 clear_counts:
-	rx_ring->total_bytes = 0;
-	rx_ring->total_packets = 0;
+	if (q_vector->rx_ring) {
+		q_vector->rx_ring->total_bytes = 0;
+		q_vector->rx_ring->total_packets = 0;
+	}
+	if (q_vector->tx_ring) {
+		q_vector->tx_ring->total_bytes = 0;
+		q_vector->tx_ring->total_packets = 0;
+	}
 }
 
 /**
@@ -2942,7 +3128,7 @@ clear_counts:
  *      NOTE:  These calculations are only valid when operating in a single-
  *             queue environment.
  * @adapter: pointer to adapter
- * @itr_setting: current adapter->itr
+ * @itr_setting: current q_vector->itr_val
  * @packets: the number of packets during this measurement interval
  * @bytes: the number of bytes during this measurement interval
  **/
@@ -2994,8 +3180,9 @@ update_itr_done:
 
 static void igb_set_itr(struct igb_adapter *adapter)
 {
+	struct igb_q_vector *q_vector = adapter->q_vector[0];
 	u16 current_itr;
-	u32 new_itr = adapter->itr;
+	u32 new_itr = q_vector->itr_val;
 
 	/* for non-gigabit speeds, just fix the interrupt rate at 4000 */
 	if (adapter->link_speed != SPEED_1000) {
@@ -3009,15 +3196,11 @@ static void igb_set_itr(struct igb_adapter *adapter)
 				    adapter->rx_ring->total_packets,
 				    adapter->rx_ring->total_bytes);
 
-	if (adapter->rx_ring->buddy) {
-		adapter->tx_itr = igb_update_itr(adapter,
-					    adapter->tx_itr,
-					    adapter->tx_ring->total_packets,
-					    adapter->tx_ring->total_bytes);
-		current_itr = max(adapter->rx_itr, adapter->tx_itr);
-	} else {
-		current_itr = adapter->rx_itr;
-	}
+	adapter->tx_itr = igb_update_itr(adapter,
+				    adapter->tx_itr,
+				    adapter->tx_ring->total_packets,
+				    adapter->tx_ring->total_bytes);
+	current_itr = max(adapter->rx_itr, adapter->tx_itr);
 
 	/* conservative mode (itr 3) eliminates the lowest_latency setting */
 	if (adapter->itr_setting == 3 && current_itr == lowest_latency)
@@ -3041,18 +3224,17 @@ static void igb_set_itr(struct igb_adapter *adapter)
 set_itr_now:
 	adapter->rx_ring->total_bytes = 0;
 	adapter->rx_ring->total_packets = 0;
-	if (adapter->rx_ring->buddy) {
-		adapter->rx_ring->buddy->total_bytes = 0;
-		adapter->rx_ring->buddy->total_packets = 0;
-	}
+	adapter->tx_ring->total_bytes = 0;
+	adapter->tx_ring->total_packets = 0;
 
-	if (new_itr != adapter->itr) {
+	if (new_itr != q_vector->itr_val) {
 		/* this attempts to bias the interrupt rate towards Bulk
 		 * by adding intermediate steps when interrupt rate is
 		 * increasing */
-		new_itr = new_itr > adapter->itr ?
-			     max((new_itr * adapter->itr) /
-			         (new_itr + (adapter->itr >> 2)), new_itr) :
+		new_itr = new_itr > q_vector->itr_val ?
+		             max((new_itr * q_vector->itr_val) /
+		                 (new_itr + (q_vector->itr_val >> 2)),
+		                 new_itr) :
 			     new_itr;
 		/* Don't write the value here; it resets the adapter's
 		 * internal timer, and causes us to delay far longer than
@@ -3060,15 +3242,13 @@ set_itr_now:
 		 * value at the beginning of the next interrupt so the timing
 		 * ends up being correct.
 		 */
-		adapter->itr = new_itr;
-		adapter->rx_ring->itr_val = new_itr;
-		adapter->rx_ring->set_itr = 1;
+		q_vector->itr_val = new_itr;
+		q_vector->set_itr = 1;
 	}
 
 	return;
 }
 
-
 #define IGB_TX_FLAGS_CSUM		0x00000001
 #define IGB_TX_FLAGS_VLAN		0x00000002
 #define IGB_TX_FLAGS_TSO		0x00000004
@@ -3781,14 +3961,12 @@ void igb_update_stats(struct igb_adapter *adapter)
 
 static irqreturn_t igb_msix_other(int irq, void *data)
 {
-	struct net_device *netdev = data;
-	struct igb_adapter *adapter = netdev_priv(netdev);
+	struct igb_adapter *adapter = data;
 	struct e1000_hw *hw = &adapter->hw;
 	u32 icr = rd32(E1000_ICR);
-
 	/* reading ICR causes bit 31 of EICR to be cleared */
 
-	if(icr & E1000_ICR_DOUTSYNC) {
+	if (icr & E1000_ICR_DOUTSYNC) {
 		/* HW is reporting DMA is out of sync */
 		adapter->stats.doosync++;
 	}
@@ -3810,119 +3988,79 @@ static irqreturn_t igb_msix_other(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t igb_msix_tx(int irq, void *data)
+static void igb_write_itr(struct igb_q_vector *q_vector)
 {
-	struct igb_ring *tx_ring = data;
-	struct igb_adapter *adapter = tx_ring->adapter;
-	struct e1000_hw *hw = &adapter->hw;
+	u32 itr_val = q_vector->itr_val & 0x7FFC;
 
-#ifdef CONFIG_IGB_DCA
-	if (adapter->flags & IGB_FLAG_DCA_ENABLED)
-		igb_update_tx_dca(tx_ring);
-#endif
+	if (!q_vector->set_itr)
+		return;
 
-	tx_ring->total_bytes = 0;
-	tx_ring->total_packets = 0;
+	if (!itr_val)
+		itr_val = 0x4;
 
-	/* auto mask will automatically reenable the interrupt when we write
-	 * EICS */
-	if (!igb_clean_tx_irq(tx_ring))
-		/* Ring was not completely cleaned, so fire another interrupt */
-		wr32(E1000_EICS, tx_ring->eims_value);
+	if (q_vector->itr_shift)
+		itr_val |= itr_val << q_vector->itr_shift;
 	else
-		wr32(E1000_EIMS, tx_ring->eims_value);
+		itr_val |= 0x8000000;
 
-	return IRQ_HANDLED;
-}
-
-static void igb_write_itr(struct igb_ring *ring)
-{
-	struct e1000_hw *hw = &ring->adapter->hw;
-	if ((ring->adapter->itr_setting & 3) && ring->set_itr) {
-		switch (hw->mac.type) {
-		case e1000_82576:
-			wr32(ring->itr_register, ring->itr_val |
-			     0x80000000);
-			break;
-		default:
-			wr32(ring->itr_register, ring->itr_val |
-			     (ring->itr_val << 16));
-			break;
-		}
-		ring->set_itr = 0;
-	}
+	writel(itr_val, q_vector->itr_register);
+	q_vector->set_itr = 0;
 }
 
-static irqreturn_t igb_msix_rx(int irq, void *data)
+static irqreturn_t igb_msix_ring(int irq, void *data)
 {
-	struct igb_ring *rx_ring = data;
-
-	/* Write the ITR value calculated at the end of the
-	 * previous interrupt.
-	 */
+	struct igb_q_vector *q_vector = data;
 
-	igb_write_itr(rx_ring);
+	/* Write the ITR value calculated from the previous interrupt. */
+	igb_write_itr(q_vector);
 
-	if (napi_schedule_prep(&rx_ring->napi))
-		__napi_schedule(&rx_ring->napi);
+	napi_schedule(&q_vector->napi);
 
-#ifdef CONFIG_IGB_DCA
-	if (rx_ring->adapter->flags & IGB_FLAG_DCA_ENABLED)
-		igb_update_rx_dca(rx_ring);
-#endif
-		return IRQ_HANDLED;
+	return IRQ_HANDLED;
 }
 
 #ifdef CONFIG_IGB_DCA
-static void igb_update_rx_dca(struct igb_ring *rx_ring)
+static void igb_update_dca(struct igb_q_vector *q_vector)
 {
-	u32 dca_rxctrl;
-	struct igb_adapter *adapter = rx_ring->adapter;
+	struct igb_adapter *adapter = q_vector->adapter;
 	struct e1000_hw *hw = &adapter->hw;
 	int cpu = get_cpu();
-	int q = rx_ring->reg_idx;
 
-	if (rx_ring->cpu != cpu) {
-		dca_rxctrl = rd32(E1000_DCA_RXCTRL(q));
-		if (hw->mac.type == e1000_82576) {
-			dca_rxctrl &= ~E1000_DCA_RXCTRL_CPUID_MASK_82576;
-			dca_rxctrl |= dca3_get_tag(&adapter->pdev->dev, cpu) <<
-			              E1000_DCA_RXCTRL_CPUID_SHIFT;
+	if (q_vector->cpu == cpu)
+		goto out_no_update;
+
+	if (q_vector->tx_ring) {
+		int q = q_vector->tx_ring->reg_idx;
+		u32 dca_txctrl = rd32(E1000_DCA_TXCTRL(q));
+		if (hw->mac.type == e1000_82575) {
+			dca_txctrl &= ~E1000_DCA_TXCTRL_CPUID_MASK;
+			dca_txctrl |= dca3_get_tag(&adapter->pdev->dev, cpu);
 		} else {
+			dca_txctrl &= ~E1000_DCA_TXCTRL_CPUID_MASK_82576;
+			dca_txctrl |= dca3_get_tag(&adapter->pdev->dev, cpu) <<
+			              E1000_DCA_TXCTRL_CPUID_SHIFT;
+		}
+		dca_txctrl |= E1000_DCA_TXCTRL_DESC_DCA_EN;
+		wr32(E1000_DCA_TXCTRL(q), dca_txctrl);
+	}
+	if (q_vector->rx_ring) {
+		int q = q_vector->rx_ring->reg_idx;
+		u32 dca_rxctrl = rd32(E1000_DCA_RXCTRL(q));
+		if (hw->mac.type == e1000_82575) {
 			dca_rxctrl &= ~E1000_DCA_RXCTRL_CPUID_MASK;
 			dca_rxctrl |= dca3_get_tag(&adapter->pdev->dev, cpu);
+		} else {
+			dca_rxctrl &= ~E1000_DCA_RXCTRL_CPUID_MASK_82576;
+			dca_rxctrl |= dca3_get_tag(&adapter->pdev->dev, cpu) <<
+			              E1000_DCA_RXCTRL_CPUID_SHIFT;
 		}
 		dca_rxctrl |= E1000_DCA_RXCTRL_DESC_DCA_EN;
 		dca_rxctrl |= E1000_DCA_RXCTRL_HEAD_DCA_EN;
 		dca_rxctrl |= E1000_DCA_RXCTRL_DATA_DCA_EN;
 		wr32(E1000_DCA_RXCTRL(q), dca_rxctrl);
-		rx_ring->cpu = cpu;
-	}
-	put_cpu();
-}
-
-static void igb_update_tx_dca(struct igb_ring *tx_ring)
-{
-	u32 dca_txctrl;
-	struct igb_adapter *adapter = tx_ring->adapter;
-	struct e1000_hw *hw = &adapter->hw;
-	int cpu = get_cpu();
-	int q = tx_ring->reg_idx;
-
-	if (tx_ring->cpu != cpu) {
-		dca_txctrl = rd32(E1000_DCA_TXCTRL(q));
-		if (hw->mac.type == e1000_82576) {
-			dca_txctrl &= ~E1000_DCA_TXCTRL_CPUID_MASK_82576;
-			dca_txctrl |= dca3_get_tag(&adapter->pdev->dev, cpu) <<
-			              E1000_DCA_TXCTRL_CPUID_SHIFT;
-		} else {
-			dca_txctrl &= ~E1000_DCA_TXCTRL_CPUID_MASK;
-			dca_txctrl |= dca3_get_tag(&adapter->pdev->dev, cpu);
-		}
-		dca_txctrl |= E1000_DCA_TXCTRL_DESC_DCA_EN;
-		wr32(E1000_DCA_TXCTRL(q), dca_txctrl);
-		tx_ring->cpu = cpu;
 	}
+	q_vector->cpu = cpu;
+out_no_update:
 	put_cpu();
 }
 
@@ -3937,13 +4075,10 @@ static void igb_setup_dca(struct igb_adapter *adapter)
 	/* Always use CB2 mode, difference is masked in the CB driver. */
 	wr32(E1000_DCA_CTRL, E1000_DCA_CTRL_DCA_MODE_CB2);
 
-	for (i = 0; i < adapter->num_tx_queues; i++) {
-		adapter->tx_ring[i].cpu = -1;
-		igb_update_tx_dca(&adapter->tx_ring[i]);
-	}
-	for (i = 0; i < adapter->num_rx_queues; i++) {
-		adapter->rx_ring[i].cpu = -1;
-		igb_update_rx_dca(&adapter->rx_ring[i]);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		struct igb_q_vector *q_vector = adapter->q_vector[i];
+		q_vector->cpu = -1;
+		igb_update_dca(q_vector);
 	}
 }
 
@@ -3972,7 +4107,7 @@ static int __igb_notify_dca(struct device *dev, void *data)
 	case DCA_PROVIDER_REMOVE:
 		if (adapter->flags & IGB_FLAG_DCA_ENABLED) {
 			/* without this a class_device is left
- 			 * hanging around in the sysfs model */
+			 * hanging around in the sysfs model */
 			dca_remove_requester(dev);
 			dev_info(&adapter->pdev->dev, "DCA disabled\n");
 			adapter->flags &= ~IGB_FLAG_DCA_ENABLED;
@@ -4379,15 +4514,15 @@ static void igb_set_uta(struct igb_adapter *adapter)
  **/
 static irqreturn_t igb_intr_msi(int irq, void *data)
 {
-	struct net_device *netdev = data;
-	struct igb_adapter *adapter = netdev_priv(netdev);
+	struct igb_adapter *adapter = data;
+	struct igb_q_vector *q_vector = adapter->q_vector[0];
 	struct e1000_hw *hw = &adapter->hw;
 	/* read ICR disables interrupts using IAM */
 	u32 icr = rd32(E1000_ICR);
 
-	igb_write_itr(adapter->rx_ring);
+	igb_write_itr(q_vector);
 
-	if(icr & E1000_ICR_DOUTSYNC) {
+	if (icr & E1000_ICR_DOUTSYNC) {
 		/* HW is reporting DMA is out of sync */
 		adapter->stats.doosync++;
 	}
@@ -4398,7 +4533,7 @@ static irqreturn_t igb_intr_msi(int irq, void *data)
 			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
-	napi_schedule(&adapter->rx_ring[0].napi);
+	napi_schedule(&q_vector->napi);
 
 	return IRQ_HANDLED;
 }
@@ -4410,8 +4545,8 @@ static irqreturn_t igb_intr_msi(int irq, void *data)
  **/
 static irqreturn_t igb_intr(int irq, void *data)
 {
-	struct net_device *netdev = data;
-	struct igb_adapter *adapter = netdev_priv(netdev);
+	struct igb_adapter *adapter = data;
+	struct igb_q_vector *q_vector = adapter->q_vector[0];
 	struct e1000_hw *hw = &adapter->hw;
 	/* Interrupt Auto-Mask...upon reading ICR, interrupts are masked.  No
 	 * need for the IMC write */
@@ -4419,14 +4554,14 @@ static irqreturn_t igb_intr(int irq, void *data)
 	if (!icr)
 		return IRQ_NONE;  /* Not our interrupt */
 
-	igb_write_itr(adapter->rx_ring);
+	igb_write_itr(q_vector);
 
 	/* IMS will not auto-mask if INT_ASSERTED is not set, and if it is
 	 * not set, then the adapter didn't send an interrupt */
 	if (!(icr & E1000_ICR_INT_ASSERTED))
 		return IRQ_NONE;
 
-	if(icr & E1000_ICR_DOUTSYNC) {
+	if (icr & E1000_ICR_DOUTSYNC) {
 		/* HW is reporting DMA is out of sync */
 		adapter->stats.doosync++;
 	}
@@ -4438,26 +4573,26 @@ static irqreturn_t igb_intr(int irq, void *data)
 			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
-	napi_schedule(&adapter->rx_ring[0].napi);
+	napi_schedule(&q_vector->napi);
 
 	return IRQ_HANDLED;
 }
 
-static inline void igb_rx_irq_enable(struct igb_ring *rx_ring)
+static inline void igb_ring_irq_enable(struct igb_q_vector *q_vector)
 {
-	struct igb_adapter *adapter = rx_ring->adapter;
+	struct igb_adapter *adapter = q_vector->adapter;
 	struct e1000_hw *hw = &adapter->hw;
 
 	if (adapter->itr_setting & 3) {
-		if (adapter->num_rx_queues == 1)
+		if (!adapter->msix_entries)
 			igb_set_itr(adapter);
 		else
-			igb_update_ring_itr(rx_ring);
+			igb_update_ring_itr(q_vector);
 	}
 
 	if (!test_bit(__IGB_DOWN, &adapter->state)) {
 		if (adapter->msix_entries)
-			wr32(E1000_EIMS, rx_ring->eims_value);
+			wr32(E1000_EIMS, q_vector->eims_value);
 		else
 			igb_irq_enable(adapter);
 	}
@@ -4470,28 +4605,28 @@ static inline void igb_rx_irq_enable(struct igb_ring *rx_ring)
  **/
 static int igb_poll(struct napi_struct *napi, int budget)
 {
-	struct igb_ring *rx_ring = container_of(napi, struct igb_ring, napi);
-	int work_done = 0;
+	struct igb_q_vector *q_vector = container_of(napi,
+	                                             struct igb_q_vector,
+	                                             napi);
+	int tx_clean_complete = 1, work_done = 0;
 
 #ifdef CONFIG_IGB_DCA
-	if (rx_ring->adapter->flags & IGB_FLAG_DCA_ENABLED)
-		igb_update_rx_dca(rx_ring);
+	if (q_vector->adapter->flags & IGB_FLAG_DCA_ENABLED)
+		igb_update_dca(q_vector);
 #endif
-	igb_clean_rx_irq_adv(rx_ring, &work_done, budget);
+	if (q_vector->tx_ring)
+		tx_clean_complete = igb_clean_tx_irq(q_vector);
 
-	if (rx_ring->buddy) {
-#ifdef CONFIG_IGB_DCA
-		if (rx_ring->adapter->flags & IGB_FLAG_DCA_ENABLED)
-			igb_update_tx_dca(rx_ring->buddy);
-#endif
-		if (!igb_clean_tx_irq(rx_ring->buddy))
-			work_done = budget;
-	}
+	if (q_vector->rx_ring)
+		igb_clean_rx_irq_adv(q_vector, &work_done, budget);
+
+	if (!tx_clean_complete)
+		work_done = budget;
 
 	/* If not enough Rx work done, exit the polling mode */
 	if (work_done < budget) {
 		napi_complete(napi);
-		igb_rx_irq_enable(rx_ring);
+		igb_ring_irq_enable(q_vector);
 	}
 
 	return work_done;
@@ -4533,12 +4668,13 @@ static void igb_tx_hwtstamp(struct igb_adapter *adapter, struct sk_buff *skb)
 
 /**
  * igb_clean_tx_irq - Reclaim resources after transmit completes
- * @adapter: board private structure
+ * @q_vector: pointer to q_vector containing needed info
  * returns true if ring is completely cleaned
  **/
-static bool igb_clean_tx_irq(struct igb_ring *tx_ring)
+static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 {
-	struct igb_adapter *adapter = tx_ring->adapter;
+	struct igb_adapter *adapter = q_vector->adapter;
+	struct igb_ring *tx_ring = q_vector->tx_ring;
 	struct net_device *netdev = adapter->netdev;
 	struct e1000_hw *hw = &adapter->hw;
 	struct igb_buffer *buffer_info;
@@ -4646,25 +4782,21 @@ static bool igb_clean_tx_irq(struct igb_ring *tx_ring)
 
 /**
  * igb_receive_skb - helper function to handle rx indications
- * @ring: pointer to receive ring receving this packet
- * @status: descriptor status field as written by hardware
- * @rx_desc: receive descriptor containing vlan and type information.
- * @skb: pointer to sk_buff to be indicated to stack
+ * @q_vector: structure containing interrupt and ring information
+ * @skb: packet to send up
+ * @vlan_tag: vlan tag for packet
  **/
-static void igb_receive_skb(struct igb_ring *ring, u8 status,
-                            union e1000_adv_rx_desc * rx_desc,
-                            struct sk_buff *skb)
-{
-	struct igb_adapter * adapter = ring->adapter;
-	bool vlan_extracted = (adapter->vlgrp && (status & E1000_RXD_STAT_VP));
-
-	skb_record_rx_queue(skb, ring->queue_index);
-	if (vlan_extracted)
-		vlan_gro_receive(&ring->napi, adapter->vlgrp,
-		                 le16_to_cpu(rx_desc->wb.upper.vlan),
-		                 skb);
+static void igb_receive_skb(struct igb_q_vector *q_vector,
+                            struct sk_buff *skb,
+                            u16 vlan_tag)
+{
+	struct igb_adapter *adapter = q_vector->adapter;
+
+	if (vlan_tag)
+		vlan_gro_receive(&q_vector->napi, adapter->vlgrp,
+		                 vlan_tag, skb);
 	else
-		napi_gro_receive(&ring->napi, skb);
+		napi_gro_receive(&q_vector->napi, skb);
 }
 
 static inline void igb_rx_checksum_adv(struct igb_adapter *adapter,
@@ -4712,11 +4844,12 @@ static inline u16 igb_get_hlen(struct igb_adapter *adapter,
 	return hlen;
 }
 
-static bool igb_clean_rx_irq_adv(struct igb_ring *rx_ring,
-				 int *work_done, int budget)
+static bool igb_clean_rx_irq_adv(struct igb_q_vector *q_vector,
+                                 int *work_done, int budget)
 {
-	struct igb_adapter *adapter = rx_ring->adapter;
+	struct igb_adapter *adapter = q_vector->adapter;
 	struct net_device *netdev = adapter->netdev;
+	struct igb_ring *rx_ring = q_vector->rx_ring;
 	struct e1000_hw *hw = &adapter->hw;
 	struct pci_dev *pdev = adapter->pdev;
 	union e1000_adv_rx_desc *rx_desc , *next_rxd;
@@ -4728,6 +4861,7 @@ static bool igb_clean_rx_irq_adv(struct igb_ring *rx_ring,
 	unsigned int i;
 	u32 staterr;
 	u16 length;
+	u16 vlan_tag;
 
 	i = rx_ring->next_to_clean;
 	buffer_info = &rx_ring->buffer_info[i];
@@ -4855,8 +4989,12 @@ send_up:
 		igb_rx_checksum_adv(adapter, staterr, skb);
 
 		skb->protocol = eth_type_trans(skb, netdev);
+		skb_record_rx_queue(skb, rx_ring->queue_index);
+
+		vlan_tag = ((staterr & E1000_RXD_STAT_VP) ?
+		            le16_to_cpu(rx_desc->wb.upper.vlan) : 0);
 
-		igb_receive_skb(rx_ring, staterr, rx_desc, skb);
+		igb_receive_skb(q_vector, skb, vlan_tag);
 
 next_desc:
 		rx_desc->wb.upper.status_error = 0;
@@ -4895,7 +5033,7 @@ next_desc:
 static void igb_alloc_rx_buffers_adv(struct igb_ring *rx_ring,
 				     int cleaned_count)
 {
-	struct igb_adapter *adapter = rx_ring->adapter;
+	struct igb_adapter *adapter = rx_ring->q_vector->adapter;
 	struct net_device *netdev = adapter->netdev;
 	struct pci_dev *pdev = adapter->pdev;
 	union e1000_adv_rx_desc *rx_desc;
@@ -5360,9 +5498,7 @@ static int __igb_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	if (netif_running(netdev))
 		igb_close(netdev);
 
-	igb_reset_interrupt_capability(adapter);
-
-	igb_free_queues(adapter);
+	igb_clear_interrupt_scheme(adapter);
 
 #ifdef CONFIG_PM
 	retval = pci_save_state(pdev);
@@ -5457,9 +5593,7 @@ static int igb_resume(struct pci_dev *pdev)
 	pci_enable_wake(pdev, PCI_D3hot, 0);
 	pci_enable_wake(pdev, PCI_D3cold, 0);
 
-	igb_set_interrupt_capability(adapter);
-
-	if (igb_alloc_queues(adapter)) {
+	if (igb_init_interrupt_scheme(adapter)) {
 		dev_err(&pdev->dev, "Unable to allocate memory for queues\n");
 		return -ENOMEM;
 	}
@@ -5511,22 +5645,16 @@ static void igb_netpoll(struct net_device *netdev)
 	int i;
 
 	if (!adapter->msix_entries) {
+		struct igb_q_vector *q_vector = adapter->q_vector[0];
 		igb_irq_disable(adapter);
-		napi_schedule(&adapter->rx_ring[0].napi);
+		napi_schedule(&q_vector->napi);
 		return;
 	}
 
-	for (i = 0; i < adapter->num_tx_queues; i++) {
-		struct igb_ring *tx_ring = &adapter->tx_ring[i];
-		wr32(E1000_EIMC, tx_ring->eims_value);
-		igb_clean_tx_irq(tx_ring);
-		wr32(E1000_EIMS, tx_ring->eims_value);
-	}
-
-	for (i = 0; i < adapter->num_rx_queues; i++) {
-		struct igb_ring *rx_ring = &adapter->rx_ring[i];
-		wr32(E1000_EIMC, rx_ring->eims_value);
-		napi_schedule(&rx_ring->napi);
+	for (i = 0; i < adapter->num_q_vectors; i++) {
+		struct igb_q_vector *q_vector = adapter->q_vector[i];
+		wr32(E1000_EIMC, q_vector->eims_value);
+		napi_schedule(&q_vector->napi);
 	}
 }
 #endif /* CONFIG_NET_POLL_CONTROLLER */


^ permalink raw reply related

* Re: [RFC PATCH] fib_hash: improve route deletion scaling on interface drop with lots of interfaces
From: David Miller @ 2009-10-28  1:01 UTC (permalink / raw)
  To: bcrl; +Cc: netdev
In-Reply-To: <20091027142426.GB3141@kvack.org>

From: Benjamin LaHaise <bcrl@lhnet.ca>
Date: Tue, 27 Oct 2009 10:24:26 -0400

> On Mon, Oct 26, 2009 at 05:17:49PM -0700, David Miller wrote:
>> > bottleneck.  Next up in the network code is rt_cache_flush().  Comments?
>> 
>> On a real router adding and removing routes is happening a lot
>> whereas interface changes are rare.  You're making a more common
>> operation more expensive for the sake of a less common one.
> 
> It's not a question of more common vs less common, but if the system can 
> recover from an adverse event within a reasonable amount of time.  Tunnel 
> flaps occur in the real world, and this results in the change of state of 
> a large number of interfaces at the same time.  Would it be okay if this 
> is wrapped in a config option?  I agree that the extra overhead is not 
> for everyone.

Having it in a config option is worse, distributions are going
to turn it on so it would be protecting nothing for %99.999 of
folks out there.

^ permalink raw reply

* Re: [PATCH] dcache: better name hash function
From: Linus Torvalds @ 2009-10-28  0:58 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Stephen Hemminger, Andrew Morton, Octavian Purdila,
	netdev, linux-kernel, Al Viro
In-Reply-To: <20091027171035.39e18383@nehalam>

On Tue, 27 Oct 2009, Stephen Hemminger wrote:
> 
> Agreed. Here is the reduced version of the program.
> To run:
>   find /home -printf '%f\n' 2>/dev/null | ./htest -n 100

The timings are very sensitive to random I$ layout at least on Nehalem. 
The reason seems to be that the inner loop is _so_ tight that just 
depending on exactly where the loop ends up, you can get subtle 
interactions with the loop cache.

Look here:

	[torvalds@nehalem ~]$ find /home -printf '%f\n' 2>/dev/null | ./htest -n 100
	Algorithm             Time       Ratio       Max   StdDev
	full_name_hash       1.141899     1.03      4868 263.37
	djb2                 0.980200     1.03      4835 266.05
	string10             0.909175     1.03      4850 262.67
	string10a            0.673915     1.03      4850 262.67
	string10b            0.909374     1.03      4850 262.67
	string_hash17        0.966050     1.03      4805 263.68
	string_hash31        1.008544     1.03      4807 259.37
	fnv32                0.774806     1.03      4817 259.17

what do you think the difference between 'string10', 'string10a' and 
'string10b' are?

None. None what-so-ever. The source code is identical, and gcc generates 
identical assembly language. Yet those timings are extremely stable for 
me, and 'string10b' is 25% faster than the identical string10 and 
string10a functions.

The only difference? 'string10a' starts aligned to just 16 bytes, but that 
in turn happens to mean that the tight inner loop ends up aligned on a 
128-byte boundary. And being cacheline aligned just there seems to matters 
for some subtle micro-architectural reason.

The reason I noticed this is that I wondered what small modifications to 
'string10' would do for performance, and noticed that even _without_ the 
small modifications, performance fluctuated.

Lesson? Microbenchmarks like this can be dangerous and misleading. That's 
_especially_ true if the loop ends up being just tight enough that it can 
fit in some trace cache or similar. In real life, the name hash is 
performance-critical, but at the same time almost certainly won't be run 
in a tight enough loop that you'd ever notice things like that.

		Linus

^ permalink raw reply

* Re: [PATCH] dcache: better name hash function
From: Stephen Hemminger @ 2009-10-28  0:10 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Stephen Hemminger, Andrew Morton, Octavian Purdila,
	netdev, linux-kernel, Al Viro
In-Reply-To: <alpine.LFD.2.01.0910271636100.31845@localhost.localdomain>

[-- Attachment #1: Type: text/plain, Size: 2345 bytes --]

On Tue, 27 Oct 2009 16:41:52 -0700 (PDT)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Tue, 27 Oct 2009, Stephen Hemminger wrote:
> > 
> > Going back to basics. Run tests across different input sets.
> > Dropping off the slow ones like crc, md5, ...
> > Not using jhash because it doesn't have good character at a time
> > interface.
> > 
> > Also, the folding algorithm used matters. Since the kernel
> > already uses hash_long() to fold back to N bits, all the
> > tests were rerun with that.
> 
> Yeah, the 'hash_long()' folding matters for anything that doesn't multiply 
> big some big number to spread the bits out, because otherwise the bits 
> from the last character hashed will always be in the low bits.
> 
> That explains why our current hash looked so bad with your previous code.
> 
> From your numbers, I think we can dismiss 'kr_hash()' as having horrible 
> behavior with names like pppXXX (and that isn't just a special case: it's 
> also noticeably worse for your /home directory case, which means that the 
> bad behavior shows up in practice too, not just in some special cases).
> 
> 'elf' and 'pjw' don't have quite the same bad case, but the stddev for the 
> pppXXX cases are still clearly worse than the other alternatives. They 
> also seem to always be slower than what we already have. 
> 
> The 'fnv32' algorithm gets fairly good behavior, but seems bad on Itanium. 
> Looks like it depends on a fast multiplication unit, and all even your 
> "slow" ULV chip seems to be a Core2 one, so all your x86 targets have 
> that. And our current name hash still actually seems to do better in all 
> cases (maybe I missed some case) even if fnv32 is slightly faster on x86.
> 
> From your list 'string10' seems to get consistently good results and is at 
> or near the top of performance too. It seems to be the one that 
> consistently beats 'full_name_hash()' both in performance and in behavior 
> (string_hash17/31 come close, but aren't as clearly better performing).
> 
> But I haven't actually seen the hashes. Maybe there's something that makes 
> string10 bad?
> 
> Regardless, one thing your new numbers do say is that our current hash 
> really isn't that bad.
> 
> 		Linus

Agreed. Here is the reduced version of the program.
To run:
  find /home -printf '%f\n' 2>/dev/null | ./htest -n 100

[-- Attachment #2: htest.c --]
[-- Type: text/x-c++src, Size: 5219 bytes --]

#include <stdio.h>
#include <stdint.h>
#include <malloc.h>
#include <sys/types.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <getopt.h>

#define init_name_hash()                0

/* partial hash update function. Assume roughly 4 bits per character */
static inline unsigned long
partial_name_hash(unsigned long c, unsigned long prevhash)
{
        return (prevhash + (c << 4) + (c >> 4)) * 11;
}

/* Compute the hash for a name string. */
static unsigned int
full_name_hash(const unsigned char *name, unsigned int len)
{
        unsigned long hash = 0;
        while (len--)
                hash = partial_name_hash(*name++, hash);
        return hash;
}

static unsigned int
djb2(const unsigned char *str, unsigned int len)
{
	unsigned long hash = 5381;

        while (len--)
		hash = ((hash << 5) + hash) + *str++; /* hash * 33 + c */

        return hash;
}

static unsigned int
string_hash31(const unsigned char *name, unsigned int len)
{
        unsigned hash = 0;
        int i;

        for (i = 0; i < len; ++i)
                hash = 31 * hash + name[i];
        return hash;
}

static unsigned int
string_hash17(const unsigned char *name, unsigned int len)
{
        unsigned hash = 0;
        int i;

        for (i = 0; i < len; ++i)
                hash = 17 * hash + name[i];
        return hash;
}

static unsigned int 
string10(const unsigned char *name, unsigned int len)
{
        unsigned hash = 0;
        int i;

        for (i = 0; i < len; ++i)
		hash = hash * 10 + name[i] - '0';

        return hash;
}

static const uint32_t FNV_32_PRIME = 16777619u;
static const uint32_t FNV1_32_INIT = 2166136261u;

static unsigned int fnv32(const unsigned char *key, unsigned int len)
{
	uint32_t hval = FNV1_32_INIT;
	unsigned int i;

	for (i = 0; i < len; i++) {
		hval ^= key[i];
#if defined(NO_FNV_GCC_OPTIMIZATION)
		hval *= FNV_32_PRIME;
#else
		hval += (hval<<1) + (hval<<4) + (hval<<7)
			+ (hval<<8) + (hval<<24);
#endif
	}

	return hval;
}

static unsigned order = 8;
static unsigned iterations = 10000;
static char **names;
static unsigned num_names;

static void read_names(void)
{
	char line[1024];
	unsigned int n = 0;
	
	/* Read input to create name set */
	while (fgets(line, sizeof line, stdin) != NULL) {
		names = realloc(names, (n + 1) * sizeof(char *));
		names[n] = malloc(strlen(line) + 1);
		strcpy(names[n], line);
		++n;
	}
	num_names = n;
}

/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL

/* Unrolled version of hash_long() */
static inline unsigned int fold(unsigned int val, unsigned int bits)
{
	if (sizeof(val) == 8) {
		uint64_t hash = val;
		/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
		uint64_t n = hash;
		n <<= 18;
		hash -= n;
		n <<= 33;
		hash -= n;
		n <<= 3;
		hash += n;
		n <<= 3;
		hash -= n;
		n <<= 4;
		hash += n;
		n <<= 2;
		hash += n;

		/* High bits are more random, so use them. */
		return hash >> (64 - bits);
	} else {
		/* On some cpus multiply is faster, on others gcc will do shifts */
		uint32_t hash = val * GOLDEN_RATIO_PRIME_32;

		/* High bits are more random, so use them. */
		return hash >> (32 - bits);
	}
}

static void measure(const char *name,  
		    unsigned int (*hash)(const unsigned char *, unsigned int))
{
	unsigned int i;
	struct timeval t0, t1;
	unsigned int hashsz = 1 << order;
	unsigned int dist[hashsz];
	unsigned long long probes = 0;
	double t;
	unsigned long m = 0;
	const double avg = num_names / hashsz;
	double ideal = hashsz * (avg * (1 + avg)) / 2;
	double std = 0;

	memset(dist, 0, sizeof(unsigned int) * hashsz);

	gettimeofday(&t0, NULL);
	for (i = 0; i < num_names; i++) {
		unsigned char *name = (unsigned char *) names[i];
		size_t len = strlen(names[i]);
		unsigned int h = 0, j;

		for (j = 0; j < iterations; j++)
			h = hash(name, len);

		/* fold in extra bits */
		++dist[fold(h, order)];
	}
	gettimeofday(&t1, NULL);
	t = (double) (t1.tv_sec - t0.tv_sec);
	t += (double) (t1.tv_usec - t0.tv_usec) / 1000000.;

	for (i = 0; i < hashsz; i++) {
		unsigned int n = dist[i];
		double delta = (n - avg);

		if (n > m) m = n;	/* longest chain */
		
		std += delta * delta;	/* compute standard deviation */
		
		/* number of probes used when accessing 
		   is same as sum of arithmetic series */
		probes += ((uint64_t) n * (1 + n)) / 2;
	}


	printf("%-20s %f", name, t);
	printf(" %8.2f %9lu %6.2f\n",
	       (double) probes / ideal, m, sqrt(std / hashsz));

}
#define MEASURE(func)	measure(#func, &func)

int main(int argc, char **argv)
{
	int f;

	while ((f = getopt(argc, argv, "h:n:")) != -1)
		switch (f) {
		case 'n':
			iterations = strtoul(optarg, NULL, 0);
			break;

		case 'h':
			order = strtoul(optarg, NULL, 0);
			break;

		default:
			fprintf(stderr, 
				"Usage: %s -a -h hash -n testsize\n",
				argv[0]);
			return 1;
		}

	read_names();

	printf("Algorithm             Time       Ratio       Max   StdDev\n");

	MEASURE(full_name_hash);
	MEASURE(djb2);
	MEASURE(string10);
	MEASURE(string_hash17);
	MEASURE(string_hash31);

	MEASURE(fnv32);

        return 0;
}

^ permalink raw reply

* Re: [PATCH] dcache: better name hash function
From: Linus Torvalds @ 2009-10-27 23:41 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Stephen Hemminger, Andrew Morton, Octavian Purdila,
	netdev, linux-kernel, Al Viro
In-Reply-To: <20091027160822.384dc109@nehalam>

On Tue, 27 Oct 2009, Stephen Hemminger wrote:
> 
> Going back to basics. Run tests across different input sets.
> Dropping off the slow ones like crc, md5, ...
> Not using jhash because it doesn't have good character at a time
> interface.
> 
> Also, the folding algorithm used matters. Since the kernel
> already uses hash_long() to fold back to N bits, all the
> tests were rerun with that.

Yeah, the 'hash_long()' folding matters for anything that doesn't multiply 
big some big number to spread the bits out, because otherwise the bits 
from the last character hashed will always be in the low bits.

That explains why our current hash looked so bad with your previous code.

>From your numbers, I think we can dismiss 'kr_hash()' as having horrible 
behavior with names like pppXXX (and that isn't just a special case: it's 
also noticeably worse for your /home directory case, which means that the 
bad behavior shows up in practice too, not just in some special cases).

'elf' and 'pjw' don't have quite the same bad case, but the stddev for the 
pppXXX cases are still clearly worse than the other alternatives. They 
also seem to always be slower than what we already have. 

The 'fnv32' algorithm gets fairly good behavior, but seems bad on Itanium. 
Looks like it depends on a fast multiplication unit, and all even your 
"slow" ULV chip seems to be a Core2 one, so all your x86 targets have 
that. And our current name hash still actually seems to do better in all 
cases (maybe I missed some case) even if fnv32 is slightly faster on x86.

>From your list 'string10' seems to get consistently good results and is at 
or near the top of performance too. It seems to be the one that 
consistently beats 'full_name_hash()' both in performance and in behavior 
(string_hash17/31 come close, but aren't as clearly better performing).

But I haven't actually seen the hashes. Maybe there's something that makes 
string10 bad?

Regardless, one thing your new numbers do say is that our current hash 
really isn't that bad.

		Linus

^ permalink raw reply

* Re: performance regression in virtio-net in 2.6.32-rc4
From: Rusty Russell @ 2009-10-27 23:34 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, kvm, netdev
In-Reply-To: <20091026184835.GB26473@redhat.com>

On Tue, 27 Oct 2009 05:18:35 am Michael S. Tsirkin wrote:
> Hi!
> I noticed a performance regression in virtio net: going from
> 2.6.31 to 2.6.32-rc4 I see this, for guest to host communication:
...
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
> 
>  87380  16384  16384    10.20    7806.48
...
>  87380  16384  16384    10.00    6814.60

Hmm, needs a bisect I'd say.

Thanks,
Rusty.

^ permalink raw reply

* Re: [PATCH 0/5] Candidate fix for increased number of GFP_ATOMIC failures V2
From: reinette chatre @ 2009-10-27 23:34 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Frans Pop, Jiri Kosina, Sven Geggus, Karol Lewandowski,
	Tobias Oetiker, Rafael J. Wysocki, David Miller, Kalle Valo,
	David Rientjes, KOSAKI Motohiro, Abbas, Mohamed, Jens Axboe,
	John W. Linville, Pekka Enberg, Bartlomiej Zolnierkiewicz,
	Greg Kroah-Hartman, Stephan von Krawczynski, Kernel Testers List,
	netdev@vger.kernel.org, linux-kernel@ vger.kernel.org,
	linux-mm@kvack.org
In-Reply-To: <20091027104017.GC8900@csn.ul.ie>

On Tue, 2009-10-27 at 03:40 -0700, Mel Gorman wrote:
> On Thu, Oct 22, 2009 at 08:43:38AM -0700, reinette chatre wrote:
> > On Thu, 2009-10-22 at 07:22 -0700, Mel Gorman wrote:
> > > [Bug #14141] order 2 page allocation failures in iwlagn
> > > 	Commit 4752c93c30441f98f7ed723001b1a5e3e5619829 introduced GFP_ATOMIC
> > > 	allocations within the wireless driver. This has caused large numbers
> > > 	of failure reports to occur as reported by Frans Pop. Fixing this
> > > 	requires changes to the driver if it wants to use GFP_ATOMIC which
> > > 	is in the hands of Mohamed Abbas and Reinette Chatre. However,
> > > 	it is very likely that it has being compounded by core mm changes
> > > 	that this series is aimed at.
> > 
> > Driver has been changed to allocate paged skb for its receive buffers.
> > This reduces amount of memory needed from order-2 to order-1. This work
> > is significant and will thus be in 2.6.33. 
> > 
> 
> What do you want to do for -stable in 2.6.31?
> 

I have just posted two patches to stable. The first is to address a bug
in which buffer loss occurs when there is an allocation failure and the
second is what Frans has been testing that reduces the noise when these
allocations fail. They are:

  iwlwifi: fix potential rx buffer loss
de0bd50845eb5935ce3d503c5d2f565d6cb9ece1 in linux-2.6
  iwlwifi: reduce noise when skb allocation fails
f82a924cc88a5541df1d4b9d38a0968cd077a051 in linux-2.6

Reinette


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: iproute uses too small of a receive buffer
From: Ben Greear @ 2009-10-27 23:30 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: NetDev
In-Reply-To: <20091027162434.6dc31b2d@nehalam>

On 10/27/2009 04:24 PM, Stephen Hemminger wrote:
> On Tue, 27 Oct 2009 16:16:52 -0700
> Ben Greear<greearb@candelatech.com>  wrote:
>
>> I have a very busy system with a bunch of xorp router processes (mis)configured.
>>
>> This thing is rapidly making route changes for whatever reason.
>>
>> The 'ip monitor route' command was failing:
>>
>> [root@i7-dqc-1 ]# ip monitor route
>> netlink receive error No buffer space available (105)
>> Dump terminated
>>
>>
>> It is only using a 32k rcv buffer, and it seems the OS was
>> overdriving it.
>>
>> Please consider making the rcv buffer larger, perhaps something
>> like this (inline is white-space damaged...attachment should apply
>> if deemed useful.):
>>
>> Signed-off-by:  Ben Greear<greearb@candelatech.com>
>>
>> diff --git a/lib/libnetlink.c b/lib/libnetlink.c
>> index b68e2fd..95a7d1d 100644
>> --- a/lib/libnetlink.c
>> +++ b/lib/libnetlink.c
>> @@ -38,7 +38,7 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned subscriptions,
>>    {
>>           socklen_t addr_len;
>>           int sndbuf = 32768;
>> -       int rcvbuf = 32768;
>> +       int rcvbuf = 3276800;
>>
>>           memset(rth, 0, sizeof(*rth));
>>
>>
>> Thanks,
>> Ben
>>
>
> Just having larger buffer isn't guarantee of success. Allocating
> a huge buffer is not going to work on embedded.
>
> Why not have it continue after one error.

Probably the right way is to give a cmd-line arg to set the buffer size
and also continue if the error is ENOBUFs (but print some error out
so users know they have issues).  I can make the attempt if that
sounds good to you.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply

* Re: iproute uses too small of a receive buffer
From: Stephen Hemminger @ 2009-10-27 23:24 UTC (permalink / raw)
  To: Ben Greear; +Cc: NetDev
In-Reply-To: <4AE77F64.3090302@candelatech.com>

On Tue, 27 Oct 2009 16:16:52 -0700
Ben Greear <greearb@candelatech.com> wrote:

> I have a very busy system with a bunch of xorp router processes (mis)configured.
> 
> This thing is rapidly making route changes for whatever reason.
> 
> The 'ip monitor route' command was failing:
> 
> [root@i7-dqc-1 ]# ip monitor route
> netlink receive error No buffer space available (105)
> Dump terminated
> 
> 
> It is only using a 32k rcv buffer, and it seems the OS was
> overdriving it.
> 
> Please consider making the rcv buffer larger, perhaps something
> like this (inline is white-space damaged...attachment should apply
> if deemed useful.):
> 
> Signed-off-by:  Ben Greear <greearb@candelatech.com>
> 
> diff --git a/lib/libnetlink.c b/lib/libnetlink.c
> index b68e2fd..95a7d1d 100644
> --- a/lib/libnetlink.c
> +++ b/lib/libnetlink.c
> @@ -38,7 +38,7 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned subscriptions,
>   {
>          socklen_t addr_len;
>          int sndbuf = 32768;
> -       int rcvbuf = 32768;
> +       int rcvbuf = 3276800;
> 
>          memset(rth, 0, sizeof(*rth));
> 
> 
> Thanks,
> Ben
> 

Just having larger buffer isn't guarantee of success. Allocating
a huge buffer is not going to work on embedded.

Why not have it continue after one error.

-- 

^ permalink raw reply

* Re: [PATCH] Multicast packet reassembly can fail
From: Rick Jones @ 2009-10-27 23:22 UTC (permalink / raw)
  To: Steve Chen; +Cc: netdev
In-Reply-To: <1256683583.3153.389.camel@linux-1lbu>

Steve Chen wrote:
> Multicast packet reassembly can fail
> 
> When multicast connections with multiple fragments are received by the same
> node from more than one Ethernet ports, race condition between fragments
> from each Ethernet port can cause fragment reassembly to fail leading to
> packet drop.  This is because packets from each Ethernet port appears identical
> to the the code that reassembles the Ethernet packet.
> 
> The solution is evaluate the Ethernet interface number in addition to all other
> parameters so that every packet can be uniquely identified.  The existing
> iif field in struct ipq is now used to generate the hash key, and iif is also
> used for comparison in case of hash collision.
> 
> Please note that q->saddr ^ (q->iif << 5) is now being passed into
> ipqhashfn to generate the hash key.  This is borrowed from the routing
> code.
> 
> Signed-off-by: Steve Chen <schen@mvista.com>
> Signed-off-by: Mark Huth <mhuth@mvista.com>

It has been hours since my last good Emily Litella moment so I'll ask - isn't 
the combination of source and dest addr, protocol, IP ID and fragment offset 
supposed to take care of this?  How does the ingress interface have anything to 
do with it?

rick jones

^ permalink raw reply

* iproute uses too small of a receive buffer
From: Ben Greear @ 2009-10-27 23:16 UTC (permalink / raw)
  To: NetDev

[-- Attachment #1: Type: text/plain, Size: 1065 bytes --]

I have a very busy system with a bunch of xorp router processes (mis)configured.

This thing is rapidly making route changes for whatever reason.

The 'ip monitor route' command was failing:

[root@i7-dqc-1 ]# ip monitor route
netlink receive error No buffer space available (105)
Dump terminated


It is only using a 32k rcv buffer, and it seems the OS was
overdriving it.

Please consider making the rcv buffer larger, perhaps something
like this (inline is white-space damaged...attachment should apply
if deemed useful.):

Signed-off-by:  Ben Greear <greearb@candelatech.com>

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index b68e2fd..95a7d1d 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -38,7 +38,7 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned subscriptions,
  {
         socklen_t addr_len;
         int sndbuf = 32768;
-       int rcvbuf = 32768;
+       int rcvbuf = 3276800;

         memset(rth, 0, sizeof(*rth));


Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


[-- Attachment #2: iputils.patch --]
[-- Type: text/plain, Size: 343 bytes --]

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index b68e2fd..95a7d1d 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -38,7 +38,7 @@ int rtnl_open_byproto(struct rtnl_handle *rth, unsigned subscriptions,
 {
 	socklen_t addr_len;
 	int sndbuf = 32768;
-	int rcvbuf = 32768;
+	int rcvbuf = 3276800;
 
 	memset(rth, 0, sizeof(*rth));
 

^ permalink raw reply related

* Re: [PATCH] dcache: better name hash function
From: Stephen Hemminger @ 2009-10-27 23:08 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Stephen Hemminger, Andrew Morton, Octavian Purdila,
	netdev, linux-kernel, Al Viro
In-Reply-To: <alpine.LFD.2.01.0910271017170.31845@localhost.localdomain>

On Tue, 27 Oct 2009 10:32:44 -0700 (PDT)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Tue, 27 Oct 2009, Stephen Hemminger wrote:
> > 
> > Rather than wasting space, or doing expensive, modulus; just folding
> > the higher bits back with XOR redistributes the bits better.
> 
> Please don't make up any new hash functions without having a better input 
> set than the one you seem to use.
> 
> The 'fnv' function I can believe in, because the whole "multiply by big 
> prime number" thing to spread out the bits is a very traditional model. 
> But making up a new hash function based on essentially consecutive names 
> is absolutely the wrong thing to do. You need a much better corpus of path 
> component names for testing.
> 
> > The following seems to give best results (combination of 16bit trick
> > and string17).
> 
> .. and these kinds of games are likely to work badly on some 
> architectures. Don't use 16-bit values, and don't use 'get_unaligned()'. 
> Both tend to work fine on x86, but likely suck on some other 
> architectures.
> 
> Also remember that the critical hash function needs to check for '/' and 
> '\0' while at it, which is one reason why it does things byte-at-a-time. 
> If you try to be smart, you'd need to be smart about the end condition 
> too.
> 
> The loop to optimize is _not_ based on 'name+len', it is this code:
> 
>                 this.name = name;
>                 c = *(const unsigned char *)name;
> 
>                 hash = init_name_hash();
>                 do {
>                         name++;
>                         hash = partial_name_hash(c, hash);
>                         c = *(const unsigned char *)name;
>                 } while (c && (c != '/'));
>                 this.len = name - (const char *) this.name;
>                 this.hash = end_name_hash(hash);
> 
> (which depends on us having already removed all slashed at the head, and 
> knowing that the string is not zero-sized)
> 
> So doing things multiple bytes at a time is certainly still possible, but 
> you would always have to find the slashes/NUL's in there first. Doing that 
> efficiently and portably is not trivial - especially since a lot of 
> critical path components are short.
> 
> (Remember: there may be just a few 'bin' directory names, but if you do 
> performance analysis, 'bin' as a path component is probably hashed a lot 
> more than 'five_slutty_bimbos_and_a_donkey.jpg'. So the relative weighting 
> of importance of the filename should probably include the frequency it 
> shows up in pathname lookup)
> 
> 		Linus


Going back to basics. Run tests across different input sets.
Dropping off the slow ones like crc, md5, ...
Not using jhash because it doesn't have good character at a time
interface.

Also, the folding algorithm used matters. Since the kernel
already uses hash_long() to fold back to N bits, all the
tests were rerun with that.

Test run across names in /usr
Algorithm             Time       Ratio       Max   StdDev
kr_hash              2.481275     1.21      4363 358.98
string10             2.834562     1.15      4168 303.66
fnv32                2.887600     1.18      4317 332.38
string_hash31        3.655745     1.16      4258 314.33
string_hash17        3.816443     1.16      4177 311.28
djb2                 3.883914     1.18      4269 331.75
full_name_hash       4.067633     1.16      4282 312.29
pjw                  6.517316     1.17      4184 316.69
sdbm                 6.945385     1.17      4447 324.32
elf                  7.402180     1.17      4184 316.69


And in /home (mail directories and git)
Algorithm             Time       Ratio       Max   StdDev
kr_hash              2.765015     1.44      7175 701.99
string10             3.136947     1.19      7092 469.73
fnv32                3.162626     1.19      6986 458.48
string_hash31        3.832031     1.19      7053 463.29
string_hash17        4.136220     1.19      7023 469.30
djb2                 4.241706     1.23      7537 512.02
full_name_hash       4.437741     1.19      7000 467.19
pjw                  6.758093     1.20      6970 476.03
sdbm                 7.239758     1.22      7526 494.32
elf                  7.446356     1.20      6970 476.03


And with names like pppXXX
Algorithm             Time       Ratio       Max   StdDev
kr_hash              0.849656     9.26      5520 1121.79
fnv32                1.004682     1.01       453  23.29
string10             1.004729     1.00       395   2.08
string_hash31        1.108335     1.00       409   5.14
string_hash17        1.231257     1.00       410   8.10
djb2                 1.238314     1.01       435  29.88
full_name_hash       1.320822     1.00       422  11.07
elf                  1.994794     1.15       716 151.19
pjw                  2.063958     1.15       716 151.19
sdbm                 2.070033     1.00       408   8.11

* The new test has big table so more cache effects.
* Existing full_name_hash distributes okay if folded correctly.
* fnv32 and string10 are slightly faster

More data (on /usr) from older slower machines:

IA-64
Algorithm             Time       Ratio       Max   StdDev
kr_hash              1.676064     1.17       664  63.81
string_hash17        1.773553     1.12       616  54.40
djb2                 2.103359     1.12       598  54.71
string10             2.103959     1.13       698  56.80
string_hash31        2.108254     1.13       602  55.51
full_name_hash       3.237209     1.13       614  56.74
sdbm                 3.279243     1.12       611  54.78
pjw                  3.314135     1.13       639  56.74
elf                  3.821029     1.13       639  56.74
fnv32                5.619829     1.16       865  62.51

Slow ULV 1Ghz laptop:
Algorithm             Time       Ratio       Max   StdDev
kr_hash              5.754460     1.19      2017 194.64
string10             6.698358     1.15      1638 171.29
sdbm                 8.134431     1.15      1652 170.65
djb2                 8.231058     1.17      1659 184.44
string_hash31        8.447873     1.15      1633 172.13
fnv32                8.552569     1.18      2170 189.61
string_hash17        9.226992     1.16      1616 175.01
full_name_hash       10.555072     1.15      1703 170.45
pjw                  16.193485     1.17      1642 181.45
elf                  19.770414     1.17      1642 181.45

^ permalink raw reply

* [PATCH] Multicast packet reassembly can fail
From: Steve Chen @ 2009-10-27 22:46 UTC (permalink / raw)
  To: netdev

Multicast packet reassembly can fail

When multicast connections with multiple fragments are received by the same
node from more than one Ethernet ports, race condition between fragments
from each Ethernet port can cause fragment reassembly to fail leading to
packet drop.  This is because packets from each Ethernet port appears identical
to the the code that reassembles the Ethernet packet.

The solution is evaluate the Ethernet interface number in addition to all other
parameters so that every packet can be uniquely identified.  The existing
iif field in struct ipq is now used to generate the hash key, and iif is also
used for comparison in case of hash collision.

Please note that q->saddr ^ (q->iif << 5) is now being passed into
ipqhashfn to generate the hash key.  This is borrowed from the routing
code.

Signed-off-by: Steve Chen <schen@mvista.com>
Signed-off-by: Mark Huth <mhuth@mvista.com>

---

 net/ipv4/ip_fragment.c |   24 +++++++++++++++++-------
 1 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 575f9bd..2de0035 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -90,6 +90,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 struct ip4_create_arg {
 	struct iphdr *iph;
 	u32 user;
+	int iif;
 };
 
 static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
@@ -104,7 +105,8 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
 	struct ipq *ipq;
 
 	ipq = container_of(q, struct ipq, q);
-	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
+	return ipqhashfn(ipq->id, ipq->saddr ^ (ipq->iif << 5), ipq->daddr,
+			 ipq->protocol);
 }
 
 static int ip4_frag_match(struct inet_frag_queue *q, void *a)
@@ -117,6 +119,7 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
 			qp->saddr == arg->iph->saddr &&
 			qp->daddr == arg->iph->daddr &&
 			qp->protocol == arg->iph->protocol &&
+			qp->iif == arg->iif &&
 			qp->user == arg->user);
 }
 
@@ -140,6 +143,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 	qp->saddr = arg->iph->saddr;
 	qp->daddr = arg->iph->daddr;
 	qp->user = arg->user;
+	qp->iif = arg->iif;
 	qp->peer = sysctl_ipfrag_max_dist ?
 		inet_getpeer(arg->iph->saddr, 1) : NULL;
 }
@@ -219,7 +223,8 @@ out:
 /* Find the correct entry in the "incomplete datagrams" queue for
  * this IP datagram, and create new one, if nothing is found.
  */
-static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
+static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user,
+				  int iif)
 {
 	struct inet_frag_queue *q;
 	struct ip4_create_arg arg;
@@ -227,9 +232,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
 
 	arg.iph = iph;
 	arg.user = user;
+	arg.iif = iif;
 
 	read_lock(&ip4_frags.lock);
-	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
+	hash = ipqhashfn(iph->id, iph->saddr & (iif << 5), iph->daddr,
+			 iph->protocol);
 
 	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
 	if (q == NULL)
@@ -433,10 +440,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		qp->q.fragments = skb;
 
 	dev = skb->dev;
-	if (dev) {
-		qp->iif = dev->ifindex;
+	if (dev)
 		skb->dev = NULL;
-	}
+
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	atomic_add(skb->truesize, &qp->q.net->mem);
@@ -572,6 +578,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 {
 	struct ipq *qp;
 	struct net *net;
+	int iif  = 0;
 
 	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
@@ -580,8 +587,12 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
 		ip_evictor(net);
 
+	if (skb->dev)
+		iif = skb->dev->ifindex;
+
 	/* Lookup (or create) queue header */
-	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
+	qp = ip_find(net, ip_hdr(skb), user, iif);
+	if (qp != NULL) {
 		int ret;
 
 		spin_lock(&qp->q.lock);



^ permalink raw reply related

* RE: [PATCH -next] netxen: fix builds for SYSFS=n or MODULES=n
From: Dhananjay Phadke @ 2009-10-27 22:09 UTC (permalink / raw)
  To: Randy Dunlap, Stephen Rothwell, netdev
  Cc: linux-next@vger.kernel.org, LKML, davem@davemloft.net
In-Reply-To: <20091026150945.3f35a811.randy.dunlap@oracle.com>

Sorry, seems like I am keeping you busy with the build errors.
Thanks for fixing it again (may be I need to setup all different build configs).

Acked-by: Dhananjay Phadke <dhananjay@netxen.com>

-----Original Message-----
From: Randy Dunlap [mailto:randy.dunlap@oracle.com] 
Sent: Monday, October 26, 2009 3:10 PM
To: Stephen Rothwell; netdev
Cc: linux-next@vger.kernel.org; LKML; davem@davemloft.net; Dhananjay Phadke
Subject: [PATCH -next] netxen: fix builds for SYSFS=n or MODULES=n

From: Randy Dunlap <randy.dunlap@oracle.com>

When CONFIG_MODULES=n:
drivers/net/netxen/netxen_nic_main.c:2751: error: dereferencing pointer to incomplete type
drivers/net/netxen/netxen_nic_main.c:2764: error: dereferencing pointer to incomplete type

Also needs addition of <linux/sysfs.h> for sysfs function prototypes or
stubs when CONFIG_SYSFS=n.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
---
 drivers/net/netxen/netxen_nic_main.c |    9 +++++++++
 1 file changed, 9 insertions(+)

--- linux-next-20091026.orig/drivers/net/netxen/netxen_nic_main.c
+++ linux-next-20091026/drivers/net/netxen/netxen_nic_main.c
@@ -34,6 +34,7 @@
 #include <net/ip.h>
 #include <linux/ipv6.h>
 #include <linux/inetdevice.h>
+#include <linux/sysfs.h>
 
 MODULE_DESCRIPTION("NetXen Multi port (1/10) Gigabit Network Driver");
 MODULE_LICENSE("GPL");
@@ -2500,6 +2501,7 @@ static struct bin_attribute bin_attr_mem
 	.write = netxen_sysfs_write_mem,
 };
 
+#ifdef CONFIG_MODULES
 static ssize_t
 netxen_store_auto_fw_reset(struct module_attribute *mattr,
 		struct module *mod, const char *buf, size_t count)
@@ -2534,6 +2536,7 @@ static struct module_attribute mod_attr_
 	.show = netxen_show_auto_fw_reset,
 	.store = netxen_store_auto_fw_reset,
 };
+#endif
 
 static void
 netxen_create_sysfs_entries(struct netxen_adapter *adapter)
@@ -2739,7 +2742,9 @@ static struct pci_driver netxen_driver =
 
 static int __init netxen_init_module(void)
 {
+#ifdef CONFIG_MODULES
 	struct module *mod = THIS_MODULE;
+#endif
 
 	printk(KERN_INFO "%s\n", netxen_nic_driver_string);
 
@@ -2748,9 +2753,11 @@ static int __init netxen_init_module(voi
 	register_inetaddr_notifier(&netxen_inetaddr_cb);
 #endif
 
+#ifdef CONFIG_MODULES
 	if (sysfs_create_file(&mod->mkobj.kobj, &mod_attr_fw_reset.attr))
 		printk(KERN_ERR "%s: Failed to create auto_fw_reset "
 				"sysfs entry.", netxen_nic_driver_name);
+#endif
 
 	return pci_register_driver(&netxen_driver);
 }
@@ -2759,9 +2766,11 @@ module_init(netxen_init_module);
 
 static void __exit netxen_exit_module(void)
 {
+#ifdef CONFIG_MODULES
 	struct module *mod = THIS_MODULE;
 
 	sysfs_remove_file(&mod->mkobj.kobj, &mod_attr_fw_reset.attr);
+#endif
 
 	pci_unregister_driver(&netxen_driver);
 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox