Netdev List
 help / color / mirror / Atom feed
* [net-next 11/12] ixgbe: further flow director performance optimizations
From: jeffrey.t.kirsher @ 2011-01-07  0:29 UTC (permalink / raw)
  To: davem, davem; +Cc: Alexander Duyck, netdev, gosp, bphilips, Jeff Kirsher
In-Reply-To: <1294360199-9860-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change adds a compressed input type for atr signature hash
computation.  It also drops the use of the set functions when setting up
the ATR input since we can then directly setup the hash input as two dwords
that can be stored and passed as registers.

With these changes the cost of computing the has is low enough that we can
perform a hash computation on each TCP SYN flagged packet allowing us to
drop the number of flow director misses considerably in tests such as
netperf TCP_CRR.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Stephen Ko <stephen.s.ko@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ixgbe/ixgbe.h       |    3 +-
 drivers/net/ixgbe/ixgbe_82599.c |  112 ++++++++++++++++++++++++++++++++++-----
 drivers/net/ixgbe/ixgbe_main.c  |  107 +++++++++++++++++++++++++++----------
 drivers/net/ixgbe/ixgbe_type.h  |   16 ++++++
 4 files changed, 194 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 2666e69..341b3db 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -526,7 +526,8 @@ extern s32 ixgbe_reinit_fdir_tables_82599(struct ixgbe_hw *hw);
 extern s32 ixgbe_init_fdir_signature_82599(struct ixgbe_hw *hw, u32 pballoc);
 extern s32 ixgbe_init_fdir_perfect_82599(struct ixgbe_hw *hw, u32 pballoc);
 extern s32 ixgbe_fdir_add_signature_filter_82599(struct ixgbe_hw *hw,
-                                                 union ixgbe_atr_input *input,
+						 union ixgbe_atr_hash_dword input,
+						 union ixgbe_atr_hash_dword common,
                                                  u8 queue);
 extern s32 ixgbe_fdir_add_perfect_filter_82599(struct ixgbe_hw *hw,
                                       union ixgbe_atr_input *input,
diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c
index 40aa3c2..d41931f 100644
--- a/drivers/net/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ixgbe/ixgbe_82599.c
@@ -1331,6 +1331,96 @@ static u32 ixgbe_atr_compute_hash_82599(union ixgbe_atr_input *atr_input,
 	return hash_result & IXGBE_ATR_HASH_MASK;
 }
 
+/*
+ * These defines allow us to quickly generate all of the necessary instructions
+ * in the function below by simply calling out IXGBE_COMPUTE_SIG_HASH_ITERATION
+ * for values 0 through 15
+ */
+#define IXGBE_ATR_COMMON_HASH_KEY \
+		(IXGBE_ATR_BUCKET_HASH_KEY & IXGBE_ATR_SIGNATURE_HASH_KEY)
+#define IXGBE_COMPUTE_SIG_HASH_ITERATION(_n) \
+do { \
+	u32 n = (_n); \
+	if (IXGBE_ATR_COMMON_HASH_KEY & (0x01 << n)) \
+		common_hash ^= lo_hash_dword >> n; \
+	else if (IXGBE_ATR_BUCKET_HASH_KEY & (0x01 << n)) \
+		bucket_hash ^= lo_hash_dword >> n; \
+	else if (IXGBE_ATR_SIGNATURE_HASH_KEY & (0x01 << n)) \
+		sig_hash ^= lo_hash_dword << (16 - n); \
+	if (IXGBE_ATR_COMMON_HASH_KEY & (0x01 << (n + 16))) \
+		common_hash ^= hi_hash_dword >> n; \
+	else if (IXGBE_ATR_BUCKET_HASH_KEY & (0x01 << (n + 16))) \
+		bucket_hash ^= hi_hash_dword >> n; \
+	else if (IXGBE_ATR_SIGNATURE_HASH_KEY & (0x01 << (n + 16))) \
+		sig_hash ^= hi_hash_dword << (16 - n); \
+} while (0);
+
+/**
+ *  ixgbe_atr_compute_sig_hash_82599 - Compute the signature hash
+ *  @stream: input bitstream to compute the hash on
+ *
+ *  This function is almost identical to the function above but contains
+ *  several optomizations such as unwinding all of the loops, letting the
+ *  compiler work out all of the conditional ifs since the keys are static
+ *  defines, and computing two keys at once since the hashed dword stream
+ *  will be the same for both keys.
+ **/
+static u32 ixgbe_atr_compute_sig_hash_82599(union ixgbe_atr_hash_dword input,
+					    union ixgbe_atr_hash_dword common)
+{
+	u32 hi_hash_dword, lo_hash_dword, flow_vm_vlan;
+	u32 sig_hash = 0, bucket_hash = 0, common_hash = 0;
+
+	/* record the flow_vm_vlan bits as they are a key part to the hash */
+	flow_vm_vlan = ntohl(input.dword);
+
+	/* generate common hash dword */
+	hi_hash_dword = ntohl(common.dword);
+
+	/* low dword is word swapped version of common */
+	lo_hash_dword = (hi_hash_dword >> 16) | (hi_hash_dword << 16);
+
+	/* apply flow ID/VM pool/VLAN ID bits to hash words */
+	hi_hash_dword ^= flow_vm_vlan ^ (flow_vm_vlan >> 16);
+
+	/* Process bits 0 and 16 */
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(0);
+
+	/*
+	 * apply flow ID/VM pool/VLAN ID bits to lo hash dword, we had to
+	 * delay this because bit 0 of the stream should not be processed
+	 * so we do not add the vlan until after bit 0 was processed
+	 */
+	lo_hash_dword ^= flow_vm_vlan ^ (flow_vm_vlan << 16);
+
+	/* Process remaining 30 bit of the key */
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(1);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(2);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(3);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(4);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(5);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(6);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(7);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(8);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(9);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(10);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(11);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(12);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(13);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(14);
+	IXGBE_COMPUTE_SIG_HASH_ITERATION(15);
+
+	/* combine common_hash result with signature and bucket hashes */
+	bucket_hash ^= common_hash;
+	bucket_hash &= IXGBE_ATR_HASH_MASK;
+
+	sig_hash ^= common_hash << 16;
+	sig_hash &= IXGBE_ATR_HASH_MASK << 16;
+
+	/* return completed signature hash */
+	return sig_hash ^ bucket_hash;
+}
+
 /**
  *  ixgbe_atr_set_vlan_id_82599 - Sets the VLAN id in the ATR input stream
  *  @input: input stream to modify
@@ -1539,22 +1629,23 @@ static s32 ixgbe_atr_get_l4type_82599(union ixgbe_atr_input *input,
 /**
  *  ixgbe_atr_add_signature_filter_82599 - Adds a signature hash filter
  *  @hw: pointer to hardware structure
- *  @stream: input bitstream
+ *  @input: unique input dword
+ *  @common: compressed common input dword
  *  @queue: queue index to direct traffic to
  **/
 s32 ixgbe_fdir_add_signature_filter_82599(struct ixgbe_hw *hw,
-                                          union ixgbe_atr_input *input,
+                                          union ixgbe_atr_hash_dword input,
+                                          union ixgbe_atr_hash_dword common,
                                           u8 queue)
 {
 	u64  fdirhashcmd;
 	u32  fdircmd;
-	u32  bucket_hash, sig_hash;
 
 	/*
 	 * Get the flow_type in order to program FDIRCMD properly
 	 * lowest 2 bits are FDIRCMD.L4TYPE, third lowest bit is FDIRCMD.IPV6
 	 */
-	switch (input->formatted.flow_type) {
+	switch (input.formatted.flow_type) {
 	case IXGBE_ATR_FLOW_TYPE_TCPV4:
 	case IXGBE_ATR_FLOW_TYPE_UDPV4:
 	case IXGBE_ATR_FLOW_TYPE_SCTPV4:
@@ -1570,7 +1661,7 @@ s32 ixgbe_fdir_add_signature_filter_82599(struct ixgbe_hw *hw,
 	/* configure FDIRCMD register */
 	fdircmd = IXGBE_FDIRCMD_CMD_ADD_FLOW | IXGBE_FDIRCMD_FILTER_UPDATE |
 	          IXGBE_FDIRCMD_LAST | IXGBE_FDIRCMD_QUEUE_EN;
-	fdircmd |= input->formatted.flow_type << IXGBE_FDIRCMD_FLOW_TYPE_SHIFT;
+	fdircmd |= input.formatted.flow_type << IXGBE_FDIRCMD_FLOW_TYPE_SHIFT;
 	fdircmd |= (u32)queue << IXGBE_FDIRCMD_RX_QUEUE_SHIFT;
 
 	/*
@@ -1578,17 +1669,12 @@ s32 ixgbe_fdir_add_signature_filter_82599(struct ixgbe_hw *hw,
 	 * is for FDIRCMD.  Then do a 64-bit register write from FDIRHASH.
 	 */
 	fdirhashcmd = (u64)fdircmd << 32;
-
-	sig_hash = ixgbe_atr_compute_hash_82599(input,
-	                                        IXGBE_ATR_SIGNATURE_HASH_KEY);
-	fdirhashcmd |= sig_hash << IXGBE_FDIRHASH_SIG_SW_INDEX_SHIFT;
-
-	bucket_hash = ixgbe_atr_compute_hash_82599(input,
-	                                           IXGBE_ATR_BUCKET_HASH_KEY);
-	fdirhashcmd |= bucket_hash;
+	fdirhashcmd |= ixgbe_atr_compute_sig_hash_82599(input, common);
 
 	IXGBE_WRITE_REG64(hw, IXGBE_FDIRHASH, fdirhashcmd);
 
+	hw_dbg(hw, "Tx Queue=%x hash=%x\n", queue, (u32)fdirhashcmd);
+
 	return 0;
 }
 
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 26718ab..490818c 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -6506,37 +6506,92 @@ static void ixgbe_tx_queue(struct ixgbe_ring *tx_ring,
 	writel(i, tx_ring->tail);
 }
 
-static void ixgbe_atr(struct ixgbe_adapter *adapter, struct sk_buff *skb,
-		      u8 queue, u32 tx_flags, __be16 protocol)
-{
-	union ixgbe_atr_input atr_input;
-	struct iphdr *iph = ip_hdr(skb);
-	struct ethhdr *eth = (struct ethhdr *)skb->data;
+static void ixgbe_atr(struct ixgbe_ring *ring, struct sk_buff *skb,
+		      u32 tx_flags, __be16 protocol)
+{
+	struct ixgbe_q_vector *q_vector = ring->q_vector;
+	union ixgbe_atr_hash_dword input = { .dword = 0 };
+	union ixgbe_atr_hash_dword common = { .dword = 0 };
+	union {
+		unsigned char *network;
+		struct iphdr *ipv4;
+		struct ipv6hdr *ipv6;
+	} hdr;
 	struct tcphdr *th;
 	__be16 vlan_id;
 
-	/* Right now, we support IPv4 w/ TCP only */
-	if (protocol != htons(ETH_P_IP) ||
-	    iph->protocol != IPPROTO_TCP)
+	/* if ring doesn't have a interrupt vector, cannot perform ATR */
+	if (!q_vector)
+		return;
+
+	/* do nothing if sampling is disabled */
+	if (!ring->atr_sample_rate)
 		return;
 
-	memset(&atr_input, 0, sizeof(union ixgbe_atr_input));
+	ring->atr_count++;
 
-	vlan_id = htons(tx_flags >> IXGBE_TX_FLAGS_VLAN_SHIFT);
+	/* snag network header to get L4 type and address */
+	hdr.network = skb_network_header(skb);
+
+	/* Currently only IPv4/IPv6 with TCP is supported */
+	if ((protocol != __constant_htons(ETH_P_IPV6) ||
+	     hdr.ipv6->nexthdr != IPPROTO_TCP) &&
+	    (protocol != __constant_htons(ETH_P_IP) ||
+	     hdr.ipv4->protocol != IPPROTO_TCP))
+		return;
 
 	th = tcp_hdr(skb);
 
-	ixgbe_atr_set_vlan_id_82599(&atr_input, vlan_id);
-	ixgbe_atr_set_src_port_82599(&atr_input, th->dest);
-	ixgbe_atr_set_dst_port_82599(&atr_input, th->source);
-	ixgbe_atr_set_flex_byte_82599(&atr_input, eth->h_proto);
-	ixgbe_atr_set_l4type_82599(&atr_input, IXGBE_ATR_FLOW_TYPE_TCPV4);
-	/* src and dst are inverted, think how the receiver sees them */
-	ixgbe_atr_set_src_ipv4_82599(&atr_input, iph->daddr);
-	ixgbe_atr_set_dst_ipv4_82599(&atr_input, iph->saddr);
+	/* skip this packet since the socket is closing */
+	if (th->fin)
+		return;
+
+	/* sample on all syn packets or once every atr sample count */
+	if (!th->syn && (ring->atr_count < ring->atr_sample_rate))
+		return;
+
+	/* reset sample count */
+	ring->atr_count = 0;
+
+	vlan_id = htons(tx_flags >> IXGBE_TX_FLAGS_VLAN_SHIFT);
+
+	/*
+	 * src and dst are inverted, think how the receiver sees them
+	 *
+	 * The input is broken into two sections, a non-compressed section
+	 * containing vm_pool, vlan_id, and flow_type.  The rest of the data
+	 * is XORed together and stored in the compressed dword.
+	 */
+	input.formatted.vlan_id = vlan_id;
+
+	/*
+	 * since src port and flex bytes occupy the same word XOR them together
+	 * and write the value to source port portion of compressed dword
+	 */
+	if (vlan_id)
+		common.port.src ^= th->dest ^ __constant_htons(ETH_P_8021Q);
+	else
+		common.port.src ^= th->dest ^ protocol;
+	common.port.dst ^= th->source;
+
+	if (protocol == __constant_htons(ETH_P_IP)) {
+		input.formatted.flow_type = IXGBE_ATR_FLOW_TYPE_TCPV4;
+		common.ip ^= hdr.ipv4->saddr ^ hdr.ipv4->daddr;
+	} else {
+		input.formatted.flow_type = IXGBE_ATR_FLOW_TYPE_TCPV6;
+		common.ip ^= hdr.ipv6->saddr.s6_addr32[0] ^
+			     hdr.ipv6->saddr.s6_addr32[1] ^
+			     hdr.ipv6->saddr.s6_addr32[2] ^
+			     hdr.ipv6->saddr.s6_addr32[3] ^
+			     hdr.ipv6->daddr.s6_addr32[0] ^
+			     hdr.ipv6->daddr.s6_addr32[1] ^
+			     hdr.ipv6->daddr.s6_addr32[2] ^
+			     hdr.ipv6->daddr.s6_addr32[3];
+	}
 
 	/* This assumes the Rx queue and Tx queue are bound to the same CPU */
-	ixgbe_fdir_add_signature_filter_82599(&adapter->hw, &atr_input, queue);
+	ixgbe_fdir_add_signature_filter_82599(&q_vector->adapter->hw,
+					      input, common, ring->queue_index);
 }
 
 static int __ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, int size)
@@ -6707,16 +6762,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 	count = ixgbe_tx_map(adapter, tx_ring, skb, tx_flags, first, hdr_len);
 	if (count) {
 		/* add the ATR filter if ATR is on */
-		if (tx_ring->atr_sample_rate) {
-			++tx_ring->atr_count;
-			if ((tx_ring->atr_count >= tx_ring->atr_sample_rate) &&
-			     test_bit(__IXGBE_TX_FDIR_INIT_DONE,
-				      &tx_ring->state)) {
-				ixgbe_atr(adapter, skb, tx_ring->queue_index,
-					  tx_flags, protocol);
-				tx_ring->atr_count = 0;
-			}
-		}
+		if (test_bit(__IXGBE_TX_FDIR_INIT_DONE, &tx_ring->state))
+			ixgbe_atr(tx_ring, skb, tx_flags, protocol);
 		txq = netdev_get_tx_queue(netdev, tx_ring->queue_index);
 		txq->tx_bytes += skb->len;
 		txq->tx_packets++;
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index c56a712..0d9392d 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -2198,6 +2198,22 @@ union ixgbe_atr_input {
 	__be32 dword_stream[11];
 };
 
+/* Flow Director compressed ATR hash input struct */
+union ixgbe_atr_hash_dword {
+	struct {
+		u8 vm_pool;
+		u8 flow_type;
+		__be16 vlan_id;
+	} formatted;
+	__be32 ip;
+	struct {
+		__be16 src;
+		__be16 dst;
+	} port;
+	__be16 flex_bytes;
+	__be32 dword;
+};
+
 struct ixgbe_atr_input_masks {
 	__be32 src_ip_mask;
 	__be32 dst_ip_mask;
-- 
1.7.3.4


^ permalink raw reply related

* [net-next 12/12] ixgbe: update ntuple filter configuration
From: jeffrey.t.kirsher @ 2011-01-07  0:29 UTC (permalink / raw)
  To: davem, davem; +Cc: Alexander Duyck, netdev, gosp, bphilips, Jeff Kirsher
In-Reply-To: <1294360199-9860-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change fixes several issues found in ntuple filtering while I was
doing the ATR refactor.

Specifically I updated the masks to work correctly with the latest version
of ethtool, I cleaned up the exception handling and added detailed error
output when a filter is rejected, and corrected several bits that were set
incorrectly in ixgbe_type.h.

The previous version of this patch included a printk that was left over from
me fixing the filter setup.  This patch does not include that printk.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Stephen Ko <stephen.s.ko@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ixgbe/ixgbe.h         |   14 --
 drivers/net/ixgbe/ixgbe_82599.c   |  436 ++++++++++++-------------------------
 drivers/net/ixgbe/ixgbe_ethtool.c |  134 ++++++++----
 drivers/net/ixgbe/ixgbe_main.c    |   21 +-
 drivers/net/ixgbe/ixgbe_type.h    |   16 +-
 5 files changed, 250 insertions(+), 371 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 341b3db..3b8c924 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -533,20 +533,6 @@ extern s32 ixgbe_fdir_add_perfect_filter_82599(struct ixgbe_hw *hw,
                                       union ixgbe_atr_input *input,
                                       struct ixgbe_atr_input_masks *input_masks,
                                       u16 soft_id, u8 queue);
-extern s32 ixgbe_atr_set_vlan_id_82599(union ixgbe_atr_input *input,
-                                       u16 vlan_id);
-extern s32 ixgbe_atr_set_src_ipv4_82599(union ixgbe_atr_input *input,
-                                        u32 src_addr);
-extern s32 ixgbe_atr_set_dst_ipv4_82599(union ixgbe_atr_input *input,
-                                        u32 dst_addr);
-extern s32 ixgbe_atr_set_src_port_82599(union ixgbe_atr_input *input,
-                                        u16 src_port);
-extern s32 ixgbe_atr_set_dst_port_82599(union ixgbe_atr_input *input,
-                                        u16 dst_port);
-extern s32 ixgbe_atr_set_flex_byte_82599(union ixgbe_atr_input *input,
-                                         u16 flex_byte);
-extern s32 ixgbe_atr_set_l4type_82599(union ixgbe_atr_input *input,
-                                      u8 l4type);
 extern void ixgbe_configure_rscctl(struct ixgbe_adapter *adapter,
                                    struct ixgbe_ring *ring);
 extern void ixgbe_clear_rscctl(struct ixgbe_adapter *adapter,
diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c
index d41931f..8d316d9 100644
--- a/drivers/net/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ixgbe/ixgbe_82599.c
@@ -1422,211 +1422,6 @@ static u32 ixgbe_atr_compute_sig_hash_82599(union ixgbe_atr_hash_dword input,
 }
 
 /**
- *  ixgbe_atr_set_vlan_id_82599 - Sets the VLAN id in the ATR input stream
- *  @input: input stream to modify
- *  @vlan: the VLAN id to load
- **/
-s32 ixgbe_atr_set_vlan_id_82599(union ixgbe_atr_input *input, __be16 vlan)
-{
-	input->formatted.vlan_id = vlan;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_set_src_ipv4_82599 - Sets the source IPv4 address
- *  @input: input stream to modify
- *  @src_addr: the IP address to load
- **/
-s32 ixgbe_atr_set_src_ipv4_82599(union ixgbe_atr_input *input, __be32 src_addr)
-{
-	input->formatted.src_ip[0] = src_addr;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_set_dst_ipv4_82599 - Sets the destination IPv4 address
- *  @input: input stream to modify
- *  @dst_addr: the IP address to load
- **/
-s32 ixgbe_atr_set_dst_ipv4_82599(union ixgbe_atr_input *input, __be32 dst_addr)
-{
-	input->formatted.dst_ip[0] = dst_addr;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_set_src_port_82599 - Sets the source port
- *  @input: input stream to modify
- *  @src_port: the source port to load
- **/
-s32 ixgbe_atr_set_src_port_82599(union ixgbe_atr_input *input, __be16 src_port)
-{
-	input->formatted.src_port = src_port;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_set_dst_port_82599 - Sets the destination port
- *  @input: input stream to modify
- *  @dst_port: the destination port to load
- **/
-s32 ixgbe_atr_set_dst_port_82599(union ixgbe_atr_input *input, __be16 dst_port)
-{
-	input->formatted.dst_port = dst_port;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_set_flex_byte_82599 - Sets the flexible bytes
- *  @input: input stream to modify
- *  @flex_bytes: the flexible bytes to load
- **/
-s32 ixgbe_atr_set_flex_byte_82599(union ixgbe_atr_input *input,
-				  __be16 flex_bytes)
-{
-	input->formatted.flex_bytes = flex_bytes;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_set_l4type_82599 - Sets the layer 4 packet type
- *  @input: input stream to modify
- *  @l4type: the layer 4 type value to load
- **/
-s32 ixgbe_atr_set_l4type_82599(union ixgbe_atr_input *input, u8 l4type)
-{
-	input->formatted.flow_type = l4type;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_get_vlan_id_82599 - Gets the VLAN id from the ATR input stream
- *  @input: input stream to search
- *  @vlan: the VLAN id to load
- **/
-static s32 ixgbe_atr_get_vlan_id_82599(union ixgbe_atr_input *input, __be16 *vlan)
-{
-	*vlan = input->formatted.vlan_id;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_get_src_ipv4_82599 - Gets the source IPv4 address
- *  @input: input stream to search
- *  @src_addr: the IP address to load
- **/
-static s32 ixgbe_atr_get_src_ipv4_82599(union ixgbe_atr_input *input,
-                                        __be32 *src_addr)
-{
-	*src_addr = input->formatted.src_ip[0];
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_get_dst_ipv4_82599 - Gets the destination IPv4 address
- *  @input: input stream to search
- *  @dst_addr: the IP address to load
- **/
-static s32 ixgbe_atr_get_dst_ipv4_82599(union ixgbe_atr_input *input,
-                                        __be32 *dst_addr)
-{
-	*dst_addr = input->formatted.dst_ip[0];
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_get_src_ipv6_82599 - Gets the source IPv6 address
- *  @input: input stream to search
- *  @src_addr_1: the first 4 bytes of the IP address to load
- *  @src_addr_2: the second 4 bytes of the IP address to load
- *  @src_addr_3: the third 4 bytes of the IP address to load
- *  @src_addr_4: the fourth 4 bytes of the IP address to load
- **/
-static s32 ixgbe_atr_get_src_ipv6_82599(union ixgbe_atr_input *input,
-                                        __be32 *src_addr_0, __be32 *src_addr_1,
-                                        __be32 *src_addr_2, __be32 *src_addr_3)
-{
-	*src_addr_0 = input->formatted.src_ip[0];
-	*src_addr_1 = input->formatted.src_ip[1];
-	*src_addr_2 = input->formatted.src_ip[2];
-	*src_addr_3 = input->formatted.src_ip[3];
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_get_src_port_82599 - Gets the source port
- *  @input: input stream to modify
- *  @src_port: the source port to load
- *
- *  Even though the input is given in big-endian, the FDIRPORT registers
- *  expect the ports to be programmed in little-endian.  Hence the need to swap
- *  endianness when retrieving the data.  This can be confusing since the
- *  internal hash engine expects it to be big-endian.
- **/
-static s32 ixgbe_atr_get_src_port_82599(union ixgbe_atr_input *input,
-                                        __be16 *src_port)
-{
-	*src_port = input->formatted.src_port;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_get_dst_port_82599 - Gets the destination port
- *  @input: input stream to modify
- *  @dst_port: the destination port to load
- *
- *  Even though the input is given in big-endian, the FDIRPORT registers
- *  expect the ports to be programmed in little-endian.  Hence the need to swap
- *  endianness when retrieving the data.  This can be confusing since the
- *  internal hash engine expects it to be big-endian.
- **/
-static s32 ixgbe_atr_get_dst_port_82599(union ixgbe_atr_input *input,
-                                        __be16 *dst_port)
-{
-	*dst_port = input->formatted.dst_port;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_get_flex_byte_82599 - Gets the flexible bytes
- *  @input: input stream to modify
- *  @flex_bytes: the flexible bytes to load
- **/
-static s32 ixgbe_atr_get_flex_byte_82599(union ixgbe_atr_input *input,
-                                         __be16 *flex_bytes)
-{
-	*flex_bytes = input->formatted.flex_bytes;
-
-	return 0;
-}
-
-/**
- *  ixgbe_atr_get_l4type_82599 - Gets the layer 4 packet type
- *  @input: input stream to modify
- *  @l4type: the layer 4 type value to load
- **/
-static s32 ixgbe_atr_get_l4type_82599(union ixgbe_atr_input *input,
-                                      u8 *l4type)
-{
-	*l4type = input->formatted.flow_type;
-
-	return 0;
-}
-
-/**
  *  ixgbe_atr_add_signature_filter_82599 - Adds a signature hash filter
  *  @hw: pointer to hardware structure
  *  @input: unique input dword
@@ -1679,6 +1474,43 @@ s32 ixgbe_fdir_add_signature_filter_82599(struct ixgbe_hw *hw,
 }
 
 /**
+ *  ixgbe_get_fdirtcpm_82599 - generate a tcp port from atr_input_masks
+ *  @input_mask: mask to be bit swapped
+ *
+ *  The source and destination port masks for flow director are bit swapped
+ *  in that bit 15 effects bit 0, 14 effects 1, 13, 2 etc.  In order to
+ *  generate a correctly swapped value we need to bit swap the mask and that
+ *  is what is accomplished by this function.
+ **/
+static u32 ixgbe_get_fdirtcpm_82599(struct ixgbe_atr_input_masks *input_masks)
+{
+	u32 mask = ntohs(input_masks->dst_port_mask);
+	mask <<= IXGBE_FDIRTCPM_DPORTM_SHIFT;
+	mask |= ntohs(input_masks->src_port_mask);
+	mask = ((mask & 0x55555555) << 1) | ((mask & 0xAAAAAAAA) >> 1);
+	mask = ((mask & 0x33333333) << 2) | ((mask & 0xCCCCCCCC) >> 2);
+	mask = ((mask & 0x0F0F0F0F) << 4) | ((mask & 0xF0F0F0F0) >> 4);
+	return ((mask & 0x00FF00FF) << 8) | ((mask & 0xFF00FF00) >> 8);
+}
+
+/*
+ * These two macros are meant to address the fact that we have registers
+ * that are either all or in part big-endian.  As a result on big-endian
+ * systems we will end up byte swapping the value to little-endian before
+ * it is byte swapped again and written to the hardware in the original
+ * big-endian format.
+ */
+#define IXGBE_STORE_AS_BE32(_value) \
+	(((u32)(_value) >> 24) | (((u32)(_value) & 0x00FF0000) >> 8) | \
+	 (((u32)(_value) & 0x0000FF00) << 8) | ((u32)(_value) << 24))
+
+#define IXGBE_WRITE_REG_BE32(a, reg, value) \
+	IXGBE_WRITE_REG((a), (reg), IXGBE_STORE_AS_BE32(ntohl(value)))
+
+#define IXGBE_STORE_AS_BE16(_value) \
+	(((u16)(_value) >> 8) | ((u16)(_value) << 8))
+
+/**
  *  ixgbe_fdir_add_perfect_filter_82599 - Adds a perfect filter
  *  @hw: pointer to hardware structure
  *  @input: input bitstream
@@ -1694,131 +1526,135 @@ s32 ixgbe_fdir_add_perfect_filter_82599(struct ixgbe_hw *hw,
                                       struct ixgbe_atr_input_masks *input_masks,
                                       u16 soft_id, u8 queue)
 {
-	u32 fdircmd = 0;
 	u32 fdirhash;
-	u32 src_ipv4 = 0, dst_ipv4 = 0;
-	u32 src_ipv6_1, src_ipv6_2, src_ipv6_3, src_ipv6_4;
-	u16 src_port, dst_port, vlan_id, flex_bytes;
-	u16 bucket_hash;
-	u8  l4type;
-	u8  fdirm = 0;
-
-	/* Get our input values */
-	ixgbe_atr_get_l4type_82599(input, &l4type);
+	u32 fdircmd;
+	u32 fdirport, fdirtcpm;
+	u32 fdirvlan;
+	/* start with VLAN, flex bytes, VM pool, and IPv6 destination masked */
+	u32 fdirm = IXGBE_FDIRM_VLANID | IXGBE_FDIRM_VLANP | IXGBE_FDIRM_FLEX |
+		    IXGBE_FDIRM_POOL | IXGBE_FDIRM_DIPv6;
 
 	/*
-	 * Check l4type formatting, and bail out before we touch the hardware
+	 * Check flow_type formatting, and bail out before we touch the hardware
 	 * if there's a configuration issue
 	 */
-	switch (l4type & IXGBE_ATR_L4TYPE_MASK) {
-	case IXGBE_ATR_L4TYPE_TCP:
-		fdircmd |= IXGBE_FDIRCMD_L4TYPE_TCP;
-		break;
-	case IXGBE_ATR_L4TYPE_UDP:
-		fdircmd |= IXGBE_FDIRCMD_L4TYPE_UDP;
-		break;
-	case IXGBE_ATR_L4TYPE_SCTP:
-		fdircmd |= IXGBE_FDIRCMD_L4TYPE_SCTP;
+	switch (input->formatted.flow_type) {
+	case IXGBE_ATR_FLOW_TYPE_IPV4:
+		/* use the L4 protocol mask for raw IPv4/IPv6 traffic */
+		fdirm |= IXGBE_FDIRM_L4P;
+	case IXGBE_ATR_FLOW_TYPE_SCTPV4:
+		if (input_masks->dst_port_mask || input_masks->src_port_mask) {
+			hw_dbg(hw, " Error on src/dst port mask\n");
+			return IXGBE_ERR_CONFIG;
+		}
+	case IXGBE_ATR_FLOW_TYPE_TCPV4:
+	case IXGBE_ATR_FLOW_TYPE_UDPV4:
 		break;
 	default:
-		hw_dbg(hw, "Error on l4type input\n");
+		hw_dbg(hw, " Error on flow type input\n");
 		return IXGBE_ERR_CONFIG;
 	}
 
-	bucket_hash = ixgbe_atr_compute_hash_82599(input,
-	                                           IXGBE_ATR_BUCKET_HASH_KEY);
-
-	/* bucket_hash is only 15 bits */
-	bucket_hash &= IXGBE_ATR_HASH_MASK;
-
-	ixgbe_atr_get_vlan_id_82599(input, &vlan_id);
-	ixgbe_atr_get_src_port_82599(input, &src_port);
-	ixgbe_atr_get_dst_port_82599(input, &dst_port);
-	ixgbe_atr_get_flex_byte_82599(input, &flex_bytes);
-
-	fdirhash = soft_id << IXGBE_FDIRHASH_SIG_SW_INDEX_SHIFT | bucket_hash;
-
-	/* Now figure out if we're IPv4 or IPv6 */
-	if (l4type & IXGBE_ATR_L4TYPE_IPV6_MASK) {
-		/* IPv6 */
-		ixgbe_atr_get_src_ipv6_82599(input, &src_ipv6_1, &src_ipv6_2,
-	                                     &src_ipv6_3, &src_ipv6_4);
-
-		IXGBE_WRITE_REG(hw, IXGBE_FDIRSIPv6(0), src_ipv6_1);
-		IXGBE_WRITE_REG(hw, IXGBE_FDIRSIPv6(1), src_ipv6_2);
-		IXGBE_WRITE_REG(hw, IXGBE_FDIRSIPv6(2), src_ipv6_3);
-		/* The last 4 bytes is the same register as IPv4 */
-		IXGBE_WRITE_REG(hw, IXGBE_FDIRIPSA, src_ipv6_4);
-
-		fdircmd |= IXGBE_FDIRCMD_IPV6;
-		fdircmd |= IXGBE_FDIRCMD_IPv6DMATCH;
-	} else {
-		/* IPv4 */
-		ixgbe_atr_get_src_ipv4_82599(input, &src_ipv4);
-		IXGBE_WRITE_REG(hw, IXGBE_FDIRIPSA, src_ipv4);
-	}
-
-	ixgbe_atr_get_dst_ipv4_82599(input, &dst_ipv4);
-	IXGBE_WRITE_REG(hw, IXGBE_FDIRIPDA, dst_ipv4);
-
-	IXGBE_WRITE_REG(hw, IXGBE_FDIRVLAN, (vlan_id |
-	                            (flex_bytes << IXGBE_FDIRVLAN_FLEX_SHIFT)));
-	IXGBE_WRITE_REG(hw, IXGBE_FDIRPORT, (src_port |
-	              (dst_port << IXGBE_FDIRPORT_DESTINATION_SHIFT)));
-
 	/*
-	 * Program the relevant mask registers.  L4type cannot be
-	 * masked out in this implementation.
+	 * Program the relevant mask registers.  If src/dst_port or src/dst_addr
+	 * are zero, then assume a full mask for that field.  Also assume that
+	 * a VLAN of 0 is unspecified, so mask that out as well.  L4type
+	 * cannot be masked out in this implementation.
 	 *
 	 * This also assumes IPv4 only.  IPv6 masking isn't supported at this
 	 * point in time.
 	 */
-	IXGBE_WRITE_REG(hw, IXGBE_FDIRSIP4M, input_masks->src_ip_mask);
-	IXGBE_WRITE_REG(hw, IXGBE_FDIRDIP4M, input_masks->dst_ip_mask);
-
-	switch (l4type & IXGBE_ATR_L4TYPE_MASK) {
-	case IXGBE_ATR_L4TYPE_TCP:
-		IXGBE_WRITE_REG(hw, IXGBE_FDIRTCPM, input_masks->src_port_mask);
-		IXGBE_WRITE_REG(hw, IXGBE_FDIRTCPM,
-				(IXGBE_READ_REG(hw, IXGBE_FDIRTCPM) |
-				 (input_masks->dst_port_mask << 16)));
+
+	/* Program FDIRM */
+	switch (ntohs(input_masks->vlan_id_mask) & 0xEFFF) {
+	case 0xEFFF:
+		/* Unmask VLAN ID - bit 0 and fall through to unmask prio */
+		fdirm &= ~IXGBE_FDIRM_VLANID;
+	case 0xE000:
+		/* Unmask VLAN prio - bit 1 */
+		fdirm &= ~IXGBE_FDIRM_VLANP;
 		break;
-	case IXGBE_ATR_L4TYPE_UDP:
-		IXGBE_WRITE_REG(hw, IXGBE_FDIRUDPM, input_masks->src_port_mask);
-		IXGBE_WRITE_REG(hw, IXGBE_FDIRUDPM,
-				(IXGBE_READ_REG(hw, IXGBE_FDIRUDPM) |
-				 (input_masks->src_port_mask << 16)));
+	case 0x0FFF:
+		/* Unmask VLAN ID - bit 0 */
+		fdirm &= ~IXGBE_FDIRM_VLANID;
 		break;
-	default:
-		/* this already would have failed above */
+	case 0x0000:
+		/* do nothing, vlans already masked */
 		break;
+	default:
+		hw_dbg(hw, " Error on VLAN mask\n");
+		return IXGBE_ERR_CONFIG;
 	}
 
-	/* Program the last mask register, FDIRM */
-	if (input_masks->vlan_id_mask)
-		/* Mask both VLAN and VLANP - bits 0 and 1 */
-		fdirm |= 0x3;
-
-	if (input_masks->data_mask)
-		/* Flex bytes need masking, so mask the whole thing - bit 4 */
-		fdirm |= 0x10;
+	if (input_masks->flex_mask & 0xFFFF) {
+		if ((input_masks->flex_mask & 0xFFFF) != 0xFFFF) {
+			hw_dbg(hw, " Error on flexible byte mask\n");
+			return IXGBE_ERR_CONFIG;
+		}
+		/* Unmask Flex Bytes - bit 4 */
+		fdirm &= ~IXGBE_FDIRM_FLEX;
+	}
 
 	/* Now mask VM pool and destination IPv6 - bits 5 and 2 */
-	fdirm |= 0x24;
-
 	IXGBE_WRITE_REG(hw, IXGBE_FDIRM, fdirm);
 
-	fdircmd |= IXGBE_FDIRCMD_CMD_ADD_FLOW;
-	fdircmd |= IXGBE_FDIRCMD_FILTER_UPDATE;
-	fdircmd |= IXGBE_FDIRCMD_LAST;
-	fdircmd |= IXGBE_FDIRCMD_QUEUE_EN;
-	fdircmd |= queue << IXGBE_FDIRCMD_RX_QUEUE_SHIFT;
+	/* store the TCP/UDP port masks, bit reversed from port layout */
+	fdirtcpm = ixgbe_get_fdirtcpm_82599(input_masks);
+
+	/* write both the same so that UDP and TCP use the same mask */
+	IXGBE_WRITE_REG(hw, IXGBE_FDIRTCPM, ~fdirtcpm);
+	IXGBE_WRITE_REG(hw, IXGBE_FDIRUDPM, ~fdirtcpm);
+
+	/* store source and destination IP masks (big-enian) */
+	IXGBE_WRITE_REG_BE32(hw, IXGBE_FDIRSIP4M,
+			     ~input_masks->src_ip_mask[0]);
+	IXGBE_WRITE_REG_BE32(hw, IXGBE_FDIRDIP4M,
+			     ~input_masks->dst_ip_mask[0]);
+
+	/* Apply masks to input data */
+	input->formatted.vlan_id &= input_masks->vlan_id_mask;
+	input->formatted.flex_bytes &= input_masks->flex_mask;
+	input->formatted.src_port &= input_masks->src_port_mask;
+	input->formatted.dst_port &= input_masks->dst_port_mask;
+	input->formatted.src_ip[0] &= input_masks->src_ip_mask[0];
+	input->formatted.dst_ip[0] &= input_masks->dst_ip_mask[0];
+
+	/* record vlan (little-endian) and flex_bytes(big-endian) */
+	fdirvlan =
+		IXGBE_STORE_AS_BE16(ntohs(input->formatted.flex_bytes));
+	fdirvlan <<= IXGBE_FDIRVLAN_FLEX_SHIFT;
+	fdirvlan |= ntohs(input->formatted.vlan_id);
+	IXGBE_WRITE_REG(hw, IXGBE_FDIRVLAN, fdirvlan);
+
+	/* record source and destination port (little-endian)*/
+	fdirport = ntohs(input->formatted.dst_port);
+	fdirport <<= IXGBE_FDIRPORT_DESTINATION_SHIFT;
+	fdirport |= ntohs(input->formatted.src_port);
+	IXGBE_WRITE_REG(hw, IXGBE_FDIRPORT, fdirport);
+
+	/* record the first 32 bits of the destination address (big-endian) */
+	IXGBE_WRITE_REG_BE32(hw, IXGBE_FDIRIPDA, input->formatted.dst_ip[0]);
+
+	/* record the source address (big-endian) */
+	IXGBE_WRITE_REG_BE32(hw, IXGBE_FDIRIPSA, input->formatted.src_ip[0]);
+
+	/* configure FDIRCMD register */
+	fdircmd = IXGBE_FDIRCMD_CMD_ADD_FLOW | IXGBE_FDIRCMD_FILTER_UPDATE |
+		  IXGBE_FDIRCMD_LAST | IXGBE_FDIRCMD_QUEUE_EN;
+	fdircmd |= input->formatted.flow_type << IXGBE_FDIRCMD_FLOW_TYPE_SHIFT;
+	fdircmd |= (u32)queue << IXGBE_FDIRCMD_RX_QUEUE_SHIFT;
+
+	/* we only want the bucket hash so drop the upper 16 bits */
+	fdirhash = ixgbe_atr_compute_hash_82599(input,
+						IXGBE_ATR_BUCKET_HASH_KEY);
+	fdirhash |= soft_id << IXGBE_FDIRHASH_SIG_SW_INDEX_SHIFT;
 
 	IXGBE_WRITE_REG(hw, IXGBE_FDIRHASH, fdirhash);
 	IXGBE_WRITE_REG(hw, IXGBE_FDIRCMD, fdircmd);
 
 	return 0;
 }
+
 /**
  *  ixgbe_read_analog_reg8_82599 - Reads 8 bit Omer analog register
  *  @hw: pointer to hardware structure
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index 76e40e2..2002ea8 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -2277,10 +2277,11 @@ static int ixgbe_set_rx_ntuple(struct net_device *dev,
                                struct ethtool_rx_ntuple *cmd)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
-	struct ethtool_rx_ntuple_flow_spec fs = cmd->fs;
+	struct ethtool_rx_ntuple_flow_spec *fs = &cmd->fs;
 	union ixgbe_atr_input input_struct;
 	struct ixgbe_atr_input_masks input_masks;
 	int target_queue;
+	int err;
 
 	if (adapter->hw.mac.type == ixgbe_mac_82598EB)
 		return -EOPNOTSUPP;
@@ -2289,67 +2290,122 @@ static int ixgbe_set_rx_ntuple(struct net_device *dev,
 	 * Don't allow programming if the action is a queue greater than
 	 * the number of online Tx queues.
 	 */
-	if ((fs.action >= adapter->num_tx_queues) ||
-	    (fs.action < ETHTOOL_RXNTUPLE_ACTION_DROP))
+	if ((fs->action >= adapter->num_tx_queues) ||
+	    (fs->action < ETHTOOL_RXNTUPLE_ACTION_DROP))
 		return -EINVAL;
 
 	memset(&input_struct, 0, sizeof(union ixgbe_atr_input));
 	memset(&input_masks, 0, sizeof(struct ixgbe_atr_input_masks));
 
-	input_masks.src_ip_mask = fs.m_u.tcp_ip4_spec.ip4src;
-	input_masks.dst_ip_mask = fs.m_u.tcp_ip4_spec.ip4dst;
-	input_masks.src_port_mask = fs.m_u.tcp_ip4_spec.psrc;
-	input_masks.dst_port_mask = fs.m_u.tcp_ip4_spec.pdst;
-	input_masks.vlan_id_mask = fs.vlan_tag_mask;
-	/* only use the lowest 2 bytes for flex bytes */
-	input_masks.data_mask = (fs.data_mask & 0xffff);
-
-	switch (fs.flow_type) {
+	/* record flow type */
+	switch (fs->flow_type) {
+	case IPV4_FLOW:
+		input_struct.formatted.flow_type = IXGBE_ATR_FLOW_TYPE_IPV4;
+		break;
 	case TCP_V4_FLOW:
-		ixgbe_atr_set_l4type_82599(&input_struct, IXGBE_ATR_L4TYPE_TCP);
+		input_struct.formatted.flow_type = IXGBE_ATR_FLOW_TYPE_TCPV4;
 		break;
 	case UDP_V4_FLOW:
-		ixgbe_atr_set_l4type_82599(&input_struct, IXGBE_ATR_L4TYPE_UDP);
+		input_struct.formatted.flow_type = IXGBE_ATR_FLOW_TYPE_UDPV4;
 		break;
 	case SCTP_V4_FLOW:
-		ixgbe_atr_set_l4type_82599(&input_struct, IXGBE_ATR_L4TYPE_SCTP);
+		input_struct.formatted.flow_type = IXGBE_ATR_FLOW_TYPE_SCTPV4;
 		break;
 	default:
 		return -1;
 	}
 
-	/* Mask bits from the inputs based on user-supplied mask */
-	ixgbe_atr_set_src_ipv4_82599(&input_struct,
-	            (fs.h_u.tcp_ip4_spec.ip4src & ~fs.m_u.tcp_ip4_spec.ip4src));
-	ixgbe_atr_set_dst_ipv4_82599(&input_struct,
-	            (fs.h_u.tcp_ip4_spec.ip4dst & ~fs.m_u.tcp_ip4_spec.ip4dst));
-	/* 82599 expects these to be byte-swapped for perfect filtering */
-	ixgbe_atr_set_src_port_82599(&input_struct,
-	       ((ntohs(fs.h_u.tcp_ip4_spec.psrc)) & ~fs.m_u.tcp_ip4_spec.psrc));
-	ixgbe_atr_set_dst_port_82599(&input_struct,
-	       ((ntohs(fs.h_u.tcp_ip4_spec.pdst)) & ~fs.m_u.tcp_ip4_spec.pdst));
-
-	/* VLAN and Flex bytes are either completely masked or not */
-	if (!fs.vlan_tag_mask)
-		ixgbe_atr_set_vlan_id_82599(&input_struct, fs.vlan_tag);
-
-	if (!input_masks.data_mask)
-		/* make sure we only use the first 2 bytes of user data */
-		ixgbe_atr_set_flex_byte_82599(&input_struct,
-		                              (fs.data & 0xffff));
+	/* copy vlan tag minus the CFI bit */
+	if ((fs->vlan_tag & 0xEFFF) || (~fs->vlan_tag_mask & 0xEFFF)) {
+		input_struct.formatted.vlan_id = htons(fs->vlan_tag & 0xEFFF);
+		if (!fs->vlan_tag_mask) {
+			input_masks.vlan_id_mask = htons(0xEFFF);
+		} else {
+			switch (~fs->vlan_tag_mask & 0xEFFF) {
+			/* all of these are valid vlan-mask values */
+			case 0xEFFF:
+			case 0xE000:
+			case 0x0FFF:
+			case 0x0000:
+				input_masks.vlan_id_mask =
+					htons(~fs->vlan_tag_mask);
+				break;
+			/* exit with error if vlan-mask is invalid */
+			default:
+				e_err(drv, "Partial VLAN ID or "
+				      "priority mask in vlan-mask is not "
+				      "supported by hardware\n");
+				return -1;
+			}
+		}
+	}
+
+	/* make sure we only use the first 2 bytes of user data */
+	if ((fs->data & 0xFFFF) || (~fs->data_mask & 0xFFFF)) {
+		input_struct.formatted.flex_bytes = htons(fs->data & 0xFFFF);
+		if (!(fs->data_mask & 0xFFFF)) {
+			input_masks.flex_mask = 0xFFFF;
+		} else if (~fs->data_mask & 0xFFFF) {
+			e_err(drv, "Partial user-def-mask is not "
+			      "supported by hardware\n");
+			return -1;
+		}
+	}
+
+	/*
+	 * Copy input into formatted structures
+	 *
+	 * These assignments are based on the following logic
+	 * If neither input or mask are set assume value is masked out.
+	 * If input is set, but mask is not mask should default to accept all.
+	 * If input is not set, but mask is set then mask likely results in 0.
+	 * If input is set and mask is set then assign both.
+	 */
+	if (fs->h_u.tcp_ip4_spec.ip4src || ~fs->m_u.tcp_ip4_spec.ip4src) {
+		input_struct.formatted.src_ip[0] = fs->h_u.tcp_ip4_spec.ip4src;
+		if (!fs->m_u.tcp_ip4_spec.ip4src)
+			input_masks.src_ip_mask[0] = 0xFFFFFFFF;
+		else
+			input_masks.src_ip_mask[0] =
+				~fs->m_u.tcp_ip4_spec.ip4src;
+	}
+	if (fs->h_u.tcp_ip4_spec.ip4dst || ~fs->m_u.tcp_ip4_spec.ip4dst) {
+		input_struct.formatted.dst_ip[0] = fs->h_u.tcp_ip4_spec.ip4dst;
+		if (!fs->m_u.tcp_ip4_spec.ip4dst)
+			input_masks.dst_ip_mask[0] = 0xFFFFFFFF;
+		else
+			input_masks.dst_ip_mask[0] =
+				~fs->m_u.tcp_ip4_spec.ip4dst;
+	}
+	if (fs->h_u.tcp_ip4_spec.psrc || ~fs->m_u.tcp_ip4_spec.psrc) {
+		input_struct.formatted.src_port = fs->h_u.tcp_ip4_spec.psrc;
+		if (!fs->m_u.tcp_ip4_spec.psrc)
+			input_masks.src_port_mask = 0xFFFF;
+		else
+			input_masks.src_port_mask = ~fs->m_u.tcp_ip4_spec.psrc;
+	}
+	if (fs->h_u.tcp_ip4_spec.pdst || ~fs->m_u.tcp_ip4_spec.pdst) {
+		input_struct.formatted.dst_port = fs->h_u.tcp_ip4_spec.pdst;
+		if (!fs->m_u.tcp_ip4_spec.pdst)
+			input_masks.dst_port_mask = 0xFFFF;
+		else
+			input_masks.dst_port_mask = ~fs->m_u.tcp_ip4_spec.pdst;
+	}
 
 	/* determine if we need to drop or route the packet */
-	if (fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP)
+	if (fs->action == ETHTOOL_RXNTUPLE_ACTION_DROP)
 		target_queue = MAX_RX_QUEUES - 1;
 	else
-		target_queue = fs.action;
+		target_queue = fs->action;
 
 	spin_lock(&adapter->fdir_perfect_lock);
-	ixgbe_fdir_add_perfect_filter_82599(&adapter->hw, &input_struct,
-	                                    &input_masks, 0, target_queue);
+	err = ixgbe_fdir_add_perfect_filter_82599(&adapter->hw,
+						  &input_struct,
+						  &input_masks, 0,
+						  target_queue);
 	spin_unlock(&adapter->fdir_perfect_lock);
 
-	return 0;
+	return err ? -1 : 0;
 }
 
 static const struct ethtool_ops ixgbe_ethtool_ops = {
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 490818c..a060610 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -4821,6 +4821,12 @@ static int ixgbe_set_interrupt_capability(struct ixgbe_adapter *adapter)
 
 	adapter->flags &= ~IXGBE_FLAG_DCB_ENABLED;
 	adapter->flags &= ~IXGBE_FLAG_RSS_ENABLED;
+	if (adapter->flags & (IXGBE_FLAG_FDIR_HASH_CAPABLE |
+			      IXGBE_FLAG_FDIR_PERFECT_CAPABLE)) {
+		e_err(probe,
+		      "Flow Director is not supported while multiple "
+		      "queues are disabled.  Disabling Flow Director\n");
+	}
 	adapter->flags &= ~IXGBE_FLAG_FDIR_HASH_CAPABLE;
 	adapter->flags &= ~IXGBE_FLAG_FDIR_PERFECT_CAPABLE;
 	adapter->atr_sample_rate = 0;
@@ -5126,16 +5132,11 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
 		adapter->flags2 |= IXGBE_FLAG2_RSC_ENABLED;
 		if (hw->device_id == IXGBE_DEV_ID_82599_T3_LOM)
 			adapter->flags2 |= IXGBE_FLAG2_TEMP_SENSOR_CAPABLE;
-		if (dev->features & NETIF_F_NTUPLE) {
-			/* Flow Director perfect filter enabled */
-			adapter->flags |= IXGBE_FLAG_FDIR_PERFECT_CAPABLE;
-			adapter->atr_sample_rate = 0;
-			spin_lock_init(&adapter->fdir_perfect_lock);
-		} else {
-			/* Flow Director hash filters enabled */
-			adapter->flags |= IXGBE_FLAG_FDIR_HASH_CAPABLE;
-			adapter->atr_sample_rate = 20;
-		}
+		/* n-tuple support exists, always init our spinlock */
+		spin_lock_init(&adapter->fdir_perfect_lock);
+		/* Flow Director hash filters enabled */
+		adapter->flags |= IXGBE_FLAG_FDIR_HASH_CAPABLE;
+		adapter->atr_sample_rate = 20;
 		adapter->ring_feature[RING_F_FDIR].indices =
 							 IXGBE_MAX_FDIR_INDICES;
 		adapter->fdir_pballoc = 0;
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index 0d9392d..fd3358f 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -1947,10 +1947,9 @@ enum ixgbe_fdir_pballoc_type {
 #define IXGBE_FDIRM_VLANID                      0x00000001
 #define IXGBE_FDIRM_VLANP                       0x00000002
 #define IXGBE_FDIRM_POOL                        0x00000004
-#define IXGBE_FDIRM_L3P                         0x00000008
-#define IXGBE_FDIRM_L4P                         0x00000010
-#define IXGBE_FDIRM_FLEX                        0x00000020
-#define IXGBE_FDIRM_DIPv6                       0x00000040
+#define IXGBE_FDIRM_L4P                         0x00000008
+#define IXGBE_FDIRM_FLEX                        0x00000010
+#define IXGBE_FDIRM_DIPv6                       0x00000020
 
 #define IXGBE_FDIRFREE_FREE_MASK                0xFFFF
 #define IXGBE_FDIRFREE_FREE_SHIFT               0
@@ -2215,12 +2214,13 @@ union ixgbe_atr_hash_dword {
 };
 
 struct ixgbe_atr_input_masks {
-	__be32 src_ip_mask;
-	__be32 dst_ip_mask;
+	__be16 rsvd0;
+	__be16 vlan_id_mask;
+	__be32 dst_ip_mask[4];
+	__be32 src_ip_mask[4];
 	__be16 src_port_mask;
 	__be16 dst_port_mask;
-	__be16 vlan_id_mask;
-	__be16 data_mask;
+	__be16 flex_mask;
 };
 
 enum ixgbe_eeprom_type {
-- 
1.7.3.4


^ permalink raw reply related

* Re: [net-next 00/12][pull-request] Intel Wired LAN Driver Updates
From: Jeff Kirsher @ 2011-01-07  0:37 UTC (permalink / raw)
  To: davem@davemloft.net
  Cc: netdev@vger.kernel.org, gospo@redhat.com, bphilips@novell.com
In-Reply-To: <1294360199-9860-1-git-send-email-jeffrey.t.kirsher@intel.com>

[-- Attachment #1: Type: text/plain, Size: 2689 bytes --]

On Thu, 2011-01-06 at 16:29 -0800, Kirsher, Jeffrey T wrote:
> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> 
> The following series contains ixgbe/e1000e cleanups and fixes.  The
> addition of CE4100 support in e1000, and ixgb VLAN conversion to the
> new model.
> 
> The following changes since commit dbbe68bb12b34f3e450da7a73c20e6fa1f85d63a:
> 
>   Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
> 
> are available in the git repository at:
> 
>   master.kernel.org:/pub/scm/linux/kernel/git/jkirsher/net-next-2.6.git master
> 
> Alexander Duyck (3):
>   ixgbe: cleanup flow director hash computation to improve performance
>   ixgbe: further flow director performance optimizations
>   ixgbe: update ntuple filter configuration
> 
> Bruce Allan (6):
>   e1000e: cleanup variables set but not used
>   e1000e: convert calls of ops.[read|write]_reg to e1e_[r|w]phy
>   e1000e: properly bounds-check string functions
>   e1000e: use either_crc_le() rather than re-write it
>   e1000e: power off PHY after reset when interface is down
>   e1000e: add custom set_d[0|3]_lplu_state function pointer for 82574
> 
> Dirk Brandewie (1):
>   e1000: Add support for the CE4100 reference platform
> 
> Emil Tantilov (1):
>   ixgb: convert to new VLAN model
> 
> Yi Zou (1):
>   ixgbe: make sure per Rx queue is disabled before unmapping the
>     receive buffer
> 
>  drivers/net/e1000/e1000_hw.c      |  328 +++++++++++++----
>  drivers/net/e1000/e1000_hw.h      |   59 +++-
>  drivers/net/e1000/e1000_main.c    |   35 ++
>  drivers/net/e1000/e1000_osdep.h   |   19 +-
>  drivers/net/e1000e/82571.c        |   77 ++++-
>  drivers/net/e1000e/e1000.h        |    3 +
>  drivers/net/e1000e/es2lan.c       |    4 +-
>  drivers/net/e1000e/ethtool.c      |   54 ++-
>  drivers/net/e1000e/hw.h           |    1 +
>  drivers/net/e1000e/ich8lan.c      |   77 ++---
>  drivers/net/e1000e/lib.c          |    3 +-
>  drivers/net/e1000e/netdev.c       |   53 ++--
>  drivers/net/e1000e/phy.c          |   40 +--
>  drivers/net/ixgb/ixgb.h           |    2 +-
>  drivers/net/ixgb/ixgb_ethtool.c   |   35 ++
>  drivers/net/ixgb/ixgb_main.c      |   54 +--
>  drivers/net/ixgbe/ixgbe.h         |   21 +-
>  drivers/net/ixgbe/ixgbe_82599.c   |  749 +++++++++++++++----------------------
>  drivers/net/ixgbe/ixgbe_ethtool.c |  142 +++++---
>  drivers/net/ixgbe/ixgbe_main.c    |  169 ++++++---
>  drivers/net/ixgbe/ixgbe_type.h    |   91 +++--
>  21 files changed, 1182 insertions(+), 834 deletions(-)
> 

I apologize, I fat fingered Andy Gospodarek's email address.  I have
corrected it in this response.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* linux-next: manual merge of the security-testing tree with the net tree
From: Stephen Rothwell @ 2011-01-07  0:44 UTC (permalink / raw)
  To: James Morris
  Cc: linux-next, linux-kernel, David Miller, netdev, Casey Schaufler

Hi James,

Today's linux-next merge of the security-testing tree got a conflict in
security/smack/smack_lsm.c between commit
3610cda53f247e176bcbb7a7cca64bc53b12acdb ("af_unix: Avoid socket->sk NULL
OOPS in stream connect security hooks") from the net tree and commit
b4e0d5f0791bd6dd12a1c1edea0340969c7c1f90 ("Smack: UDS revision") from the
security-testing tree.

I fixed it up (I think - see below) and can carry the fix as necessary.
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

diff --cc security/smack/smack_lsm.c
index ccb71a0,05dc4da..0000000
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@@ -2415,17 -2534,21 +2534,21 @@@ static int smack_setprocattr(struct tas
   * Return 0 if a subject with the smack of sock could access
   * an object with the smack of other, otherwise an error code
   */
 -static int smack_unix_stream_connect(struct socket *sock,
 -				     struct socket *other, struct sock *newsk)
 +static int smack_unix_stream_connect(struct sock *sock,
 +				     struct sock *other, struct sock *newsk)
  {
- 	struct inode *sp = SOCK_INODE(sock->sk_socket);
- 	struct inode *op = SOCK_INODE(other->sk_socket);
 -	struct socket_smack *ssp = sock->sk->sk_security;
 -	struct socket_smack *osp = other->sk->sk_security;
++	struct socket_smack *ssp = sock->sk_security;
++	struct socket_smack *osp = other->sk_security;
  	struct smk_audit_info ad;
+ 	int rc = 0;
  
  	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
 -	smk_ad_setfield_u_net_sk(&ad, other->sk);
 +	smk_ad_setfield_u_net_sk(&ad, other);
- 	return smk_access(smk_of_inode(sp), smk_of_inode(op),
- 				 MAY_READWRITE, &ad);
+ 
+ 	if (!capable(CAP_MAC_OVERRIDE))
+ 		rc = smk_access(ssp->smk_out, osp->smk_in, MAY_WRITE, &ad);
+ 
+ 	return rc;
  }
  
  /**

^ permalink raw reply

* Re: [PATCH v2] net: Allow ethtool to set interface in loopback mode.
From: Mahesh Bandewar @ 2011-01-07  0:47 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Jeff Garzik, Stephen Hemminger, David Miller, Laurent Chavey,
	Tom Herbert, netdev
In-Reply-To: <1294352011.11825.50.camel@bwh-desktop>

On Thu, Jan 6, 2011 at 2:13 PM, Ben Hutchings <bhutchings@solarflare.com> wrote:
> On Wed, 2011-01-05 at 11:22 -0500, Jeff Garzik wrote:
>> On 01/04/2011 08:21 PM, Ben Hutchings wrote:
>> > On Tue, 2011-01-04 at 16:36 -0800, Stephen Hemminger wrote:
>> >> On Tue,  4 Jan 2011 16:30:01 -0800
>> >> Mahesh Bandewar<maheshb@google.com>  wrote:
>> >>
>> >>> This patch enables ethtool to set the loopback mode on a given interface.
>> >>> By configuring the interface in loopback mode in conjunction with a policy
>> >>> route / rule, a userland application can stress the egress / ingress path
>> >>> exposing the flows of the change in progress and potentially help developer(s)
>> >>> understand the impact of those changes without even sending a packet out
>> >>> on the network.
>> >>>
>> >>> Following set of commands illustrates one such example -
>> >>>   a) ip -4 addr add 192.168.1.1/24 dev eth1
>> >>>   b) ip -4 rule add from all iif eth1 lookup 250
>> >>>   c) ip -4 route add local 0/0 dev lo proto kernel scope host table 250
>> >>>   d) arp -Ds 192.168.1.100 eth1
>> >>>   e) arp -Ds 192.168.1.200 eth1
>> >>>   f) sysctl -w net.ipv4.ip_nonlocal_bind=1
>> >>>   g) sysctl -w net.ipv4.conf.all.accept_local=1
>> >>>   # Assuming that the machine has 8 cores
>> >>>   h) taskset 000f netserver -L 192.168.1.200
>> >>>   i) taskset 00f0 netperf -t TCP_CRR -L 192.168.1.100 -H 192.168.1.200 -l 30
>> >>>
>> >>> Signed-off-by: Mahesh Bandewar<maheshb@google.com>
>> >>> Reviewed-by: Ben Hutchings<bhutchings@solarflare.com>
>> >>
>> >> Since this is a boolean it SHOULD go into ethtool_flags rather than
>> >> being a high level operation.
>> >
>> > It could do, but I though ETHTOOL_{G,S}FLAGS were intended for
>> > controlling offload features.
>>
>> It doesn't have to be.  As Stephen guessed, [GS]FLAGS are basically
>> common flags -- as differentiated from private,
>> driver-specific/hardware-specific flags.
>
> Well, that would allow the patch to be simplified quite a bit. :-)

Ben, Are you suggesting to use ETH_FLAG_LOOPBACK instead of
ETHTOOL_{G|S}LOOPBACK flags?

Thanks,
--mahesh..

>
> Ben.
>
> From: Ben Hutchings <bhutchings@solarflare.com>
> Subject: [PATCH net-2.6] ethtool: Define ETH_FLAG_LOOPBACK
> Date: Thu, 6 Jan 2011 22:10:55 +0000
>
> Mahesh Bandewar <maheshb@google.com> requested this, writing:
>
> By configuring the interface in loopback mode in conjunction with a policy
> route / rule, a userland application can stress the egress / ingress path
> exposing the flows of the change in progress and potentially help developer(s)
> understand the impact of those changes without even sending a packet out
> on the network.
>
> Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
>
> --- a/include/linux/ethtool.h
> +++ b/include/linux/ethtool.h
> @@ -309,6 +309,7 @@ struct ethtool_perm_addr {
>  * flag differs from the read-only value.
>  */
>  enum ethtool_flags {
> +       ETH_FLAG_LOOPBACK       = (1 << 2),     /* Host-side loopback enabled */
>        ETH_FLAG_TXVLAN         = (1 << 7),     /* TX VLAN offload enabled */
>        ETH_FLAG_RXVLAN         = (1 << 8),     /* RX VLAN offload enabled */
>        ETH_FLAG_LRO            = (1 << 15),    /* LRO is enabled */
> ---
>
> --
> Ben Hutchings, Senior Software Engineer, Solarflare Communications
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
>
>

^ permalink raw reply

* Re: [net-next 03/12] e1000e: properly bounds-check string functions
From: Ben Hutchings @ 2011-01-07  0:48 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: davem, Bruce Allan, netdev, gosp, bphilips
In-Reply-To: <1294360199-9860-4-git-send-email-jeffrey.t.kirsher@intel.com>

On Thu, 2011-01-06 at 16:29 -0800, jeffrey.t.kirsher@intel.com wrote:
> From: Bruce Allan <bruce.w.allan@intel.com>
> 
> Use string functions with bounds checking rather than their non-bounds
> checking counterparts, and do not hard code these boundaries.
[...]
> --- a/drivers/net/e1000e/netdev.c
> +++ b/drivers/net/e1000e/netdev.c
[...]
> @@ -5968,7 +5968,7 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
>  	if (!(adapter->flags & FLAG_HAS_AMT))
>  		e1000_get_hw_control(adapter);
>  
> -	strcpy(netdev->name, "eth%d");
> +	strncpy(netdev->name, "eth%d", sizeof(netdev->name) - 1);
>  	err = register_netdev(netdev);
>  	if (err)
>  		goto err_register;
[...]

This statement is actually redundant - alloc_etherdev() sets the name
for you.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: 2.6.37 vlans on bnx2 not functional, panic with tcpdump
From: Michael Chan @ 2011-01-07  0:46 UTC (permalink / raw)
  To: Iain Paton; +Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <1294357941.21580.2.camel@HP1>


On Thu, 2011-01-06 at 15:52 -0800, Michael Chan wrote:
> On Thu, 2011-01-06 at 13:32 -0800, Iain Paton wrote:
> > Hi,
> > 
> > vlans don't appear to be functional on my HP DL380G6 with onboard bnx2
> > adapter using vanilla 2.6.37 kernel. No tagged vlan traffic 
> > is arriving at the vlan interface.
> 
> VLANs on net-next-2.6 kernel works for me on bnx2 devices.  I'll try
> 2.6.37 next.

May be you have management firmware running on your devices that can
change the behavior.  Can you provide ethtool -i eth0 on both bnx2
devices on your system?



^ permalink raw reply

* Re: [net-next 12/12] ixgbe: update ntuple filter configuration
From: Ben Hutchings @ 2011-01-07  1:02 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: davem, Alexander Duyck, netdev, gosp, bphilips
In-Reply-To: <1294360199-9860-13-git-send-email-jeffrey.t.kirsher@intel.com>

On Thu, 2011-01-06 at 16:29 -0800, jeffrey.t.kirsher@intel.com wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This change fixes several issues found in ntuple filtering while I was
> doing the ATR refactor.
> 
> Specifically I updated the masks to work correctly with the latest version
> of ethtool,
[...]

Did the previous code not correctly handle a zero value with a non-zero
mask for some fields?  If so, I can revert that change to ethtool.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: linux-next: manual merge of the security-testing tree with the net tree
From: Casey Schaufler @ 2011-01-07  1:05 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: James Morris, linux-next, linux-kernel, David Miller, netdev,
	Casey Schaufler
In-Reply-To: <20110107114400.e4d1d33b.sfr@canb.auug.org.au>

On 1/6/2011 4:44 PM, Stephen Rothwell wrote:
> Hi James,
>
> Today's linux-next merge of the security-testing tree got a conflict in
> security/smack/smack_lsm.c between commit
> 3610cda53f247e176bcbb7a7cca64bc53b12acdb ("af_unix: Avoid socket->sk NULL
> OOPS in stream connect security hooks") from the net tree and commit
> b4e0d5f0791bd6dd12a1c1edea0340969c7c1f90 ("Smack: UDS revision") from the
> security-testing tree.
>
> I fixed it up (I think - see below) and can carry the fix as necessary.

The change looks like it addresses the change in interface. Thank you.

^ permalink raw reply

* Re: [PATCH 2.6.36] vlan: Avoid hwaccel vlan packets when vid not used
From: Eric Dumazet @ 2011-01-07  1:20 UTC (permalink / raw)
  To: Jesse Gross
  Cc: Matt Carlson, Michael Leun, Michael Chan, David Miller,
	Ben Greear, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <1294356887.2704.13.camel@edumazet-laptop>

Le vendredi 07 janvier 2011 à 00:34 +0100, Eric Dumazet a écrit :
> Le jeudi 06 janvier 2011 à 16:01 -0500, Jesse Gross a écrit :
> 
> > Hmm, I thought that it might be some interaction with a corner case in
> > the networking core but now it seems less likely.  There weren't too
> > many vlan changes between the working and non-working states.  Plus,
> > since the rx counter isn't increasing, the packets probably aren't
> > making it anywhere.
> > 
> > I see that tg3 increases the drop counter in one place, which also
> > happens to be checking for vlan errors (at tg3.c:4753).  That seems
> > suspicious - maybe the NIC is only partially configured for vlan
> > offloading.  If we can confirm that is where the drop counter is being
> > incremented and what the error code is maybe it would shed some light.
> > 
> 
> Hmm... I am pretty sure the drop counter is the dev rx_dropped (core
> network handled, not tg3 one) incremented at the end of
> __netif_receive_skb() : We found no suitable handler for packets.
> 
> atomic_long_inc(&skb->dev->rx_dropped);
> 
> But thats a guess, I'll have to check
> 

wrong guess. Its really the tg3 which drops frames

increasing rx_missed_errors  (get_stat64(&hw_stats->rx_discards)

ip -s -s link show dev eth2
5: eth2: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq
master bond0 state UP qlen 1000
    link/ether 00:1e:0b:92:78:50 brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast   
    11627      167      0       0       0       2      
    RX errors: length  crc     frame   fifo    missed
               0        0       0       0       2713   
    TX: bytes  packets  errors  dropped carrier collsns 
    2274       31       0       0       0       0      
    TX errors: aborted fifo    window  heartbeat
               0        0       0       0      



It would be nice Broadcom guys could help a bit ?

^ permalink raw reply

* Re: Flow Control and Port Mirroring Revisited
From: Simon Horman @ 2011-01-07  1:23 UTC (permalink / raw)
  To: Jesse Gross
  Cc: Eric Dumazet, Rusty Russell, virtualization, dev, virtualization,
	netdev, kvm, Michael S. Tsirkin
In-Reply-To: <AANLkTinJK-nbkP5_ee2cuS8RA7jTB4-bcWmAf4bjSouP@mail.gmail.com>

On Thu, Jan 06, 2011 at 05:38:01PM -0500, Jesse Gross wrote:

[ snip ]
> 
> I know that everyone likes a nice netperf result but I agree with
> Michael that this probably isn't the right question to be asking.  I
> don't think that socket buffers are a real solution to the flow
> control problem: they happen to provide that functionality but it's
> more of a side effect than anything.  It's just that the amount of
> memory consumed by packets in the queue(s) doesn't really have any
> implicit meaning for flow control (think multiple physical adapters,
> all with the same speed instead of a virtual device and a physical
> device with wildly different speeds).  The analog in the physical
> world that you're looking for would be Ethernet flow control.
> Obviously, if the question is limiting CPU or memory consumption then
> that's a different story.

Point taken. I will see if I can control CPU (and thus memory) consumption
using cgroups and/or tc.

> This patch also double counts memory, since the full size of the
> packet will be accounted for by each clone, even though they share the
> actual packet data.  Probably not too significant here but it might be
> when flooding/mirroring to many interfaces.  This is at least fixable
> (the Xen-style accounting through page tracking deals with it, though
> it has its own problems).

Agreed on all counts.



^ permalink raw reply

* About disabling congestion control
From: Syed Obaid Amin @ 2011-01-07  1:25 UTC (permalink / raw)
  To: netdev

Hey all,

I am currently working on a socket option to disable the tcp
congestion control. I think the simplest approach to do this is to
ignore cwnd before sending out a packet.

After going through tcp output engine it seems that tcp_cwnd_test is
the method that decides that how many segments can be sent out on a
wire. For testing it out, I changed this method so that if no-cc
option is ON, just return a big constant value. But, it didn't work
and I am unable to see a burst of pkts. It looks like that I am
missing something here.

Any suggestions that what is the right place to look for disabling the
congestion control ?

Thanks much!

Obaid

^ permalink raw reply

* Re: [PATCH v2] net: Allow ethtool to set interface in loopback mode.
From: Ben Hutchings @ 2011-01-07  1:30 UTC (permalink / raw)
  To: Mahesh Bandewar
  Cc: Jeff Garzik, Stephen Hemminger, David Miller, Laurent Chavey,
	Tom Herbert, netdev
In-Reply-To: <AANLkTi=3ewzgz=z-WmqT=vBvci3-H6HE3CCVk4ZGuFED@mail.gmail.com>

On Thu, 2011-01-06 at 16:47 -0800, Mahesh Bandewar wrote:
> On Thu, Jan 6, 2011 at 2:13 PM, Ben Hutchings <bhutchings@solarflare.com> wrote:
> > On Wed, 2011-01-05 at 11:22 -0500, Jeff Garzik wrote:
> >> On 01/04/2011 08:21 PM, Ben Hutchings wrote:
> >> > On Tue, 2011-01-04 at 16:36 -0800, Stephen Hemminger wrote:
> >> >> On Tue,  4 Jan 2011 16:30:01 -0800
> >> >> Mahesh Bandewar<maheshb@google.com>  wrote:
> >> >>
> >> >>> This patch enables ethtool to set the loopback mode on a given interface.
> >> >>> By configuring the interface in loopback mode in conjunction with a policy
> >> >>> route / rule, a userland application can stress the egress / ingress path
> >> >>> exposing the flows of the change in progress and potentially help developer(s)
> >> >>> understand the impact of those changes without even sending a packet out
> >> >>> on the network.
> >> >>>
> >> >>> Following set of commands illustrates one such example -
> >> >>>   a) ip -4 addr add 192.168.1.1/24 dev eth1
> >> >>>   b) ip -4 rule add from all iif eth1 lookup 250
> >> >>>   c) ip -4 route add local 0/0 dev lo proto kernel scope host table 250
> >> >>>   d) arp -Ds 192.168.1.100 eth1
> >> >>>   e) arp -Ds 192.168.1.200 eth1
> >> >>>   f) sysctl -w net.ipv4.ip_nonlocal_bind=1
> >> >>>   g) sysctl -w net.ipv4.conf.all.accept_local=1
> >> >>>   # Assuming that the machine has 8 cores
> >> >>>   h) taskset 000f netserver -L 192.168.1.200
> >> >>>   i) taskset 00f0 netperf -t TCP_CRR -L 192.168.1.100 -H 192.168.1.200 -l 30
> >> >>>
> >> >>> Signed-off-by: Mahesh Bandewar<maheshb@google.com>
> >> >>> Reviewed-by: Ben Hutchings<bhutchings@solarflare.com>
> >> >>
> >> >> Since this is a boolean it SHOULD go into ethtool_flags rather than
> >> >> being a high level operation.
> >> >
> >> > It could do, but I though ETHTOOL_{G,S}FLAGS were intended for
> >> > controlling offload features.
> >>
> >> It doesn't have to be.  As Stephen guessed, [GS]FLAGS are basically
> >> common flags -- as differentiated from private,
> >> driver-specific/hardware-specific flags.
> >
> > Well, that would allow the patch to be simplified quite a bit. :-)
> 
> Ben, Are you suggesting to use ETH_FLAG_LOOPBACK instead of
> ETHTOOL_{G|S}LOOPBACK flags?
[...]

Exactly.

An example implementation (untested):

--- a/drivers/net/sfc/ethtool.c
+++ b/drivers/net/sfc/ethtool.c
@@ -548,11 +548,24 @@ static u32 efx_ethtool_get_rx_csum(struct net_device *net_dev)
 	return efx->rx_checksum_enabled;
 }
 
+static u32 efx_ethtool_get_flags(struct net_device *net_dev)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+	u32 flags;
+
+	flags = ethtool_op_get_flags(net_dev);
+	if (efx->loopback_mode != LOOPBACK_NONE)
+		flags |= ETH_FLAG_LOOPBACK;
+	return flags;
+}
+
 static int efx_ethtool_set_flags(struct net_device *net_dev, u32 data)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
-	u32 supported = (efx->type->offload_features &
-			 (ETH_FLAG_RXHASH | ETH_FLAG_NTUPLE));
+	u32 supported = (ETH_FLAG_LOOPBACK |
+			 (efx->type->offload_features &
+			  (ETH_FLAG_RXHASH | ETH_FLAG_NTUPLE)));
+	enum efx_loopback_mode loopback;
 	int rc;
 
 	rc = ethtool_op_set_flags(net_dev, data, supported);
@@ -562,7 +575,15 @@ static int efx_ethtool_set_flags(struct net_device *net_dev, u32 data)
 	if (!(data & ETH_FLAG_NTUPLE))
 		efx_filter_clear_rx(efx, EFX_FILTER_PRI_MANUAL);
 
-	return 0;
+	loopback = (data & ETH_FLAG_LOOPBACK) ? LOOPBACK_DATA : LOOPBACK_NONE;
+	mutex_lock(&efx->mac_lock);
+	if (efx->loopback_mode != loopback) {
+		efx->loopback_mode = loopback;
+		rc = __efx_reconfigure_port(efx);
+	}
+	mutex_unlock(&efx->mac_lock);
+
+	return rc;
 }
 
 static void efx_ethtool_self_test(struct net_device *net_dev,
@@ -1057,7 +1078,7 @@ const struct ethtool_ops efx_ethtool_ops = {
 	.get_tso		= ethtool_op_get_tso,
 	/* Need to enable/disable TSO-IPv6 too */
 	.set_tso		= efx_ethtool_set_tso,
-	.get_flags		= ethtool_op_get_flags,
+	.get_flags		= efx_ethtool_get_flags,
 	.set_flags		= efx_ethtool_set_flags,
 	.get_sset_count		= efx_ethtool_get_sset_count,
 	.self_test		= efx_ethtool_self_test,
---

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: genetlink misinterprets NEW as GET
From: Pablo Neira Ayuso @ 2011-01-07  1:31 UTC (permalink / raw)
  To: Ben Pfaff
  Cc: Jan Engelhardt, Netfilter Developer Mailing List,
	Linux Networking Developer Mailing List
In-Reply-To: <878vyyvtci.fsf@benpfaff.org>

On 06/01/11 18:23, Ben Pfaff wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> writes:
> 
>> On 04/01/11 03:14, Jan Engelhardt wrote:
>>> 	/* Modifiers to GET request */
>>> 	#define NLM_F_ROOT      0x100
>>> 	#define NLM_F_MATCH     0x200
>>> 	#define NLM_F_ATOMIC    0x400
>>> 	#define NLM_F_DUMP      (NLM_F_ROOT|NLM_F_MATCH)
> [...]
>>> [N.B.: I am also wondering whether
>>> 	(nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP
>>> may have been desired, because NLM_F_DUMP is composed of two bits.]
>>
>> Someone may include NLM_F_ATOMIC to a dump operation, in that case the
>> checking that you propose is not valid.
> 
> Are you saying that NLM_F_MATCH and NLM_F_ATOMIC are mutually
> exclusive, and that NLM_F_ROOT|NLM_F_ATOMIC would also signal a
> dump operation?  Otherwise the test that Jan proposes looks valid
> to me.

Indeed, Jan's test is fine to fix this. Please, send a patch to Davem asap.

^ permalink raw reply

* Re: About disabling congestion control
From: Stephen Hemminger @ 2011-01-07  1:37 UTC (permalink / raw)
  To: Syed Obaid Amin; +Cc: netdev
In-Reply-To: <AANLkTin53drsv18TZ0CPZM7==6G=rK6vScUb=SCQ4-xr@mail.gmail.com>

On Thu, 6 Jan 2011 20:25:18 -0500
Syed Obaid Amin <obaidasyed@gmail.com> wrote:

> Hey all,
> 
> I am currently working on a socket option to disable the tcp
> congestion control. I think the simplest approach to do this is to
> ignore cwnd before sending out a packet.
> 
> After going through tcp output engine it seems that tcp_cwnd_test is
> the method that decides that how many segments can be sent out on a
> wire. For testing it out, I changed this method so that if no-cc
> option is ON, just return a big constant value. But, it didn't work
> and I am unable to see a burst of pkts. It looks like that I am
> missing something here.
> 
> Any suggestions that what is the right place to look for disabling the
> congestion control ?
> 
> Thanks much!
> 
> Obaid

I assume this is just a local hack experiment; not something you
want to actually submit for other users to use...

The easiest/safest way to do this would be to build/define a new TCP congestion
control type that does nothing.


^ permalink raw reply

* Re: Bad TCP timestamps on non-PC platforms
From: Alex Dubov @ 2011-01-07  1:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, David Miller
In-Reply-To: <1294301356.2723.73.camel@edumazet-laptop>

Sorry for the awful synopsis of my problem. I never cease to amaze myself
at how bad those usually turn up. :-)

What I really meant to write is:

I have a dev board running 2.6.37-rc7. Normal kernel config, nothing fancy.
Remote machines are just usual linux boxes in constant operation (I tried
several of those).

UDP/DHCP works correctly all the time, so ethernet side is probably ok.

When tcp_timestamps are enabled, SYN packets from dev board just get
ignored by the remote side. I see them arrive in wireshark, but nothing
else happens.

When I disable tcp_timestamps on the dev board everything works.

The problem is reproducible every single time.

The only difference is the "Options" block of the SYN packets.
If timestamps are not really to blame, then it probably window scale
parameters. That's what I see on a usual dropped packet:

Options: (20 bytes)
        Maximum segment size: 1460 bytes
        SACK permitted
        Timestamps: TSval 4294893842, TSecr 0
        NOP
        Window scale: 5 (multiply by 32)



      

^ permalink raw reply

* Re: Bad TCP timestamps on non-PC platforms
From: Eric Dumazet @ 2011-01-07  2:11 UTC (permalink / raw)
  To: Alex Dubov; +Cc: netdev, David Miller
In-Reply-To: <117536.54377.qm@web37607.mail.mud.yahoo.com>

Le jeudi 06 janvier 2011 à 17:55 -0800, Alex Dubov a écrit :
> Sorry for the awful synopsis of my problem. I never cease to amaze myself
> at how bad those usually turn up. :-)
> 
> What I really meant to write is:
> 
> I have a dev board running 2.6.37-rc7. Normal kernel config, nothing fancy.
> Remote machines are just usual linux boxes in constant operation (I tried
> several of those).
> 
> UDP/DHCP works correctly all the time, so ethernet side is probably ok.
> 
> When tcp_timestamps are enabled, SYN packets from dev board just get
> ignored by the remote side. I see them arrive in wireshark, but nothing
> else happens.
> 
> When I disable tcp_timestamps on the dev board everything works.
> 
> The problem is reproducible every single time.
> 
> The only difference is the "Options" block of the SYN packets.
> If timestamps are not really to blame, then it probably window scale
> parameters. That's what I see on a usual dropped packet:
> 
> Options: (20 bytes)
>         Maximum segment size: 1460 bytes
>         SACK permitted
>         Timestamps: TSval 4294893842, TSecr 0
>         NOP
>         Window scale: 5 (multiply by 32)
> 
> 
> 
>       


You dont give new informations ;)

I asked if you could give information on the other side : The bug is to
drop this legal packet.

uname -a
sysctl -a | grep tcp



^ permalink raw reply

* Re: [PATCH 2.6.36] vlan: Avoid hwaccel vlan packets when vid not used
From: Matt Carlson @ 2011-01-07  2:29 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jesse Gross, Matthew Carlson, Michael Leun, Michael Chan,
	David Miller, Ben Greear, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org
In-Reply-To: <1294363201.2704.19.camel@edumazet-laptop>

On Thu, Jan 06, 2011 at 05:20:01PM -0800, Eric Dumazet wrote:
> Le vendredi 07 janvier 2011 ?? 00:34 +0100, Eric Dumazet a ??crit :
> > Le jeudi 06 janvier 2011 ?? 16:01 -0500, Jesse Gross a ??crit :
> > 
> > > Hmm, I thought that it might be some interaction with a corner case in
> > > the networking core but now it seems less likely.  There weren't too
> > > many vlan changes between the working and non-working states.  Plus,
> > > since the rx counter isn't increasing, the packets probably aren't
> > > making it anywhere.
> > > 
> > > I see that tg3 increases the drop counter in one place, which also
> > > happens to be checking for vlan errors (at tg3.c:4753).  That seems
> > > suspicious - maybe the NIC is only partially configured for vlan
> > > offloading.  If we can confirm that is where the drop counter is being
> > > incremented and what the error code is maybe it would shed some light.
> > > 
> > 
> > Hmm... I am pretty sure the drop counter is the dev rx_dropped (core
> > network handled, not tg3 one) incremented at the end of
> > __netif_receive_skb() : We found no suitable handler for packets.
> > 
> > atomic_long_inc(&skb->dev->rx_dropped);
> > 
> > But thats a guess, I'll have to check
> > 
> 
> wrong guess. Its really the tg3 which drops frames
> 
> increasing rx_missed_errors  (get_stat64(&hw_stats->rx_discards)
> 
> ip -s -s link show dev eth2
> 5: eth2: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq
> master bond0 state UP qlen 1000
>     link/ether 00:1e:0b:92:78:50 brd ff:ff:ff:ff:ff:ff
>     RX: bytes  packets  errors  dropped overrun mcast   
>     11627      167      0       0       0       2      
>     RX errors: length  crc     frame   fifo    missed
>                0        0       0       0       2713   
>     TX: bytes  packets  errors  dropped carrier collsns 
>     2274       31       0       0       0       0      
>     TX errors: aborted fifo    window  heartbeat
>                0        0       0       0      
> 
> 
> 
> It would be nice Broadcom guys could help a bit ?

Hi Eric.  Sorry for the delay.  I was under the impression that your
problems were software related and that you just needed a revised
version of these VLAN patches I was sending to Michael.  Is this not
true?

Having a hardware stat increment suggests this is a new problem.
Maybe I missed it, but I didn't see what hardware you are working
with and whether or not management firmware was enabled.  Could you tell
me that info?

^ permalink raw reply

* Re: [PATCH 2.6.36] vlan: Avoid hwaccel vlan packets when vid not used
From: Eric Dumazet @ 2011-01-07  2:41 UTC (permalink / raw)
  To: Matt Carlson
  Cc: Jesse Gross, Michael Leun, Michael Chan, David Miller, Ben Greear,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20110107022912.GA17757@mcarlson.broadcom.com>

Le jeudi 06 janvier 2011 à 18:29 -0800, Matt Carlson a écrit :

> Hi Eric.  Sorry for the delay.  I was under the impression that your
> problems were software related and that you just needed a revised
> version of these VLAN patches I was sending to Michael.  Is this not
> true?
> 
> Having a hardware stat increment suggests this is a new problem.
> Maybe I missed it, but I didn't see what hardware you are working
> with and whether or not management firmware was enabled.  Could you tell
> me that info?
> 

Hi Matt

I started a bisection, because I couldnt sleep tonight anyway :(

14:04.0 Ethernet controller: Broadcom Corporation NetXtreme BCM5715S
Gigabit Ethernet (rev a3)
	Subsystem: Hewlett-Packard Company NC326m PCIe Dual Port Adapter
	Flags: bus master, 66MHz, medium devsel, latency 64, IRQ 43
	Memory at fdff0000 (64-bit, non-prefetchable) [size=64K]
	Memory at fdfe0000 (64-bit, non-prefetchable) [size=64K]
	[virtual] Expansion ROM at fdbe0000 [disabled] [size=128K]
	Capabilities: [40] PCI-X non-bridge device
	Capabilities: [48] Power Management version 2
	Capabilities: [50] Vital Product Data
	Capabilities: [58] MSI: Enable+ Count=1/8 Maskable- 64bit+
	Kernel driver in use: tg3
	Kernel modules: tg3

^ permalink raw reply

* Re: [PATCH 2.6.36] vlan: Avoid hwaccel vlan packets when vid not used
From: Eric Dumazet @ 2011-01-07  2:43 UTC (permalink / raw)
  To: Matt Carlson
  Cc: Jesse Gross, Michael Leun, Michael Chan, David Miller, Ben Greear,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <1294368071.2704.49.camel@edumazet-laptop>

Le vendredi 07 janvier 2011 à 03:41 +0100, Eric Dumazet a écrit :
> Le jeudi 06 janvier 2011 à 18:29 -0800, Matt Carlson a écrit :
> 
> > Hi Eric.  Sorry for the delay.  I was under the impression that your
> > problems were software related and that you just needed a revised
> > version of these VLAN patches I was sending to Michael.  Is this not
> > true?
> > 
> > Having a hardware stat increment suggests this is a new problem.
> > Maybe I missed it, but I didn't see what hardware you are working
> > with and whether or not management firmware was enabled.  Could you tell
> > me that info?
> > 
> 
> Hi Matt
> 
> I started a bisection, because I couldnt sleep tonight anyway :(
> 
> 14:04.0 Ethernet controller: Broadcom Corporation NetXtreme BCM5715S
> Gigabit Ethernet (rev a3)
> 	Subsystem: Hewlett-Packard Company NC326m PCIe Dual Port Adapter
> 	Flags: bus master, 66MHz, medium devsel, latency 64, IRQ 43
> 	Memory at fdff0000 (64-bit, non-prefetchable) [size=64K]
> 	Memory at fdfe0000 (64-bit, non-prefetchable) [size=64K]
> 	[virtual] Expansion ROM at fdbe0000 [disabled] [size=128K]
> 	Capabilities: [40] PCI-X non-bridge device
> 	Capabilities: [48] Power Management version 2
> 	Capabilities: [50] Vital Product Data
> 	Capabilities: [58] MSI: Enable+ Count=1/8 Maskable- 64bit+
> 	Kernel driver in use: tg3
> 	Kernel modules: tg3
> 
> 

$ ethtool -i eth2
driver: tg3
version: 3.115
firmware-version: 5715s-v3.28
bus-info: 0000:14:04.0
$ dmesg | grep ASF
[    6.220577] tg3 0000:14:04.0: eth2: RXcsums[1] LinkChgREG[0] MIirq[0]
ASF[0] TSOcap[1]
[    6.228586] tg3 0000:14:04.1: eth3: RXcsums[1] LinkChgREG[0] MIirq[0]
ASF[0] TSOcap[1]

^ permalink raw reply

* Re: [Bugme-new] [Bug 25062] New: Bonding packet deduplication doesn't work properly anymore
From: Jay Vosburgh @ 2011-01-07  2:47 UTC (permalink / raw)
  To: Andrew Morton
  Cc: netdev, bugzilla-daemon, bugme-new, bugme-daemon, kevin.lapagna
In-Reply-To: <20110104133936.60d389e2.akpm@linux-foundation.org>

Andrew Morton <akpm@linux-foundation.org> wrote:

>On Fri, 17 Dec 2010 11:45:18 GMT
>bugzilla-daemon@bugzilla.kernel.org wrote:
>
>> https://bugzilla.kernel.org/show_bug.cgi?id=25062
>> 
>>            Summary: Bonding packet deduplication doesn't work properly
>>                     anymore
>>            Product: Networking
>>            Version: 2.5
>>     Kernel Version: > 2.6.33
>>           Platform: All
>>         OS/Version: Linux
>>               Tree: Mainline
>>             Status: NEW
>>           Severity: high
>>           Priority: P1
>>          Component: Other
>>         AssignedTo: acme@ghostprotocols.net
>>         ReportedBy: kevin.lapagna@bigtag.ch
>>         Regression: No
>> 
>> 
>> Here's the setup:
>> 
>> switch: ordinary cisco switch
>> eth0: NIC with kernel module tg3
>> eth1: NIC with kernel module e1000e
>> bond0: bond with slaves eth0,eth1 in mode 1 (or 5)
>> bond0.100: vlan device created with vconfig
>> bridge100: bridge created with brctl
>> tap1: tap device created with tunctl
>> vguest: qemu-kvm vguest whit emulated e1000 NIC
>> 
>> 
>> |________________|-- eth0 \                                               |________________|
>> | switch |          -- bond0 -- bond0.100 -- bridge100 -- tap1 -- | vguest |
>> |________|-- eth1 /                                               |________|
>> 
>> When the vguest emits an ethernet broadcast (DHCP-request), it's forwarded all
>> the way up to the switch, through eth0. The switch forwards the broadcast -
>> also to eth1. The packet travels then all the way back to bridge100. So the
>> last status known for bridge100, regarding the mac address of the vgeust is,
>> that it is behind bond0.110 (instead of tap1). If a DHCP-server responds to the
>> request, the packet travels to bridge100, which has now a faulty
>> MAC-address-table and the packet will be rejected and never reaches tap1 and
>> therefor not the vguest.
>> 
>> I witnessed this wrong behavior in kernel 2.6.37-rc5 (debian package), 2.6.36.2
>> and 2.6.35.9 (self compiled -  vanilla). The setup has worked with kernels <=
>> 2.6.33.7. I've never tried 2.6.34.
>> 
>> I assume the setup above is a common way for the separation of virtual guests
>> on a network level. So this could become a major issue for a lot of people when
>> upgrading their kernels.

	Just a note that I have reproduced what I believe is the same
problem (I didn't use tap, and assigned an IP to the bridge).  I used
arping to generate ethernet broadcasts.  I see the problem on 2.6.36.2,
but not on today's net-next-2.6.

	I'll see if I can dig up the root cause tomorrow.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: [PATCH 2.6.36] vlan: Avoid hwaccel vlan packets when vid not used
From: Matt Carlson @ 2011-01-07  2:59 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Matthew Carlson, Jesse Gross, Michael Leun, Michael Chan,
	David Miller, Ben Greear, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org
In-Reply-To: <1294368202.2704.50.camel@edumazet-laptop>

On Thu, Jan 06, 2011 at 06:43:22PM -0800, Eric Dumazet wrote:
> Le vendredi 07 janvier 2011 ?? 03:41 +0100, Eric Dumazet a ??crit :
> > Le jeudi 06 janvier 2011 ?? 18:29 -0800, Matt Carlson a ??crit :
> > 
> > > Hi Eric.  Sorry for the delay.  I was under the impression that your
> > > problems were software related and that you just needed a revised
> > > version of these VLAN patches I was sending to Michael.  Is this not
> > > true?
> > > 
> > > Having a hardware stat increment suggests this is a new problem.
> > > Maybe I missed it, but I didn't see what hardware you are working
> > > with and whether or not management firmware was enabled.  Could you tell
> > > me that info?
> > > 
> > 
> > Hi Matt
> > 
> > I started a bisection, because I couldnt sleep tonight anyway :(
> > 
> > 14:04.0 Ethernet controller: Broadcom Corporation NetXtreme BCM5715S
> > Gigabit Ethernet (rev a3)
> > 	Subsystem: Hewlett-Packard Company NC326m PCIe Dual Port Adapter
> > 	Flags: bus master, 66MHz, medium devsel, latency 64, IRQ 43
> > 	Memory at fdff0000 (64-bit, non-prefetchable) [size=64K]
> > 	Memory at fdfe0000 (64-bit, non-prefetchable) [size=64K]
> > 	[virtual] Expansion ROM at fdbe0000 [disabled] [size=128K]
> > 	Capabilities: [40] PCI-X non-bridge device
> > 	Capabilities: [48] Power Management version 2
> > 	Capabilities: [50] Vital Product Data
> > 	Capabilities: [58] MSI: Enable+ Count=1/8 Maskable- 64bit+
> > 	Kernel driver in use: tg3
> > 	Kernel modules: tg3
> > 
> > 
> 
> $ ethtool -i eth2
> driver: tg3
> version: 3.115
> firmware-version: 5715s-v3.28
> bus-info: 0000:14:04.0
> $ dmesg | grep ASF
> [    6.220577] tg3 0000:14:04.0: eth2: RXcsums[1] LinkChgREG[0] MIirq[0]
> ASF[0] TSOcap[1]
> [    6.228586] tg3 0000:14:04.1: eth3: RXcsums[1] LinkChgREG[0] MIirq[0]
> ASF[0] TSOcap[1]

Thanks.  So management firmware is disabled.  This should be
straightforward case.

I'm wondering if I'm misunderstanding something though.  You said earlier
that VLAN tagging doesn't work unless you applied my patch.  Is this no
longer true?

^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: Paul Mackerras @ 2011-01-07  3:01 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, Harvey Harrison, linux-ppp, netdev
In-Reply-To: <1294357056-25889-1-git-send-email-xiaosuo@gmail.com>

On Fri, Jan 07, 2011 at 07:37:36AM +0800, Changli Gao wrote:

> Signed-off-by: Changli Gao <xiaosuo@gmail.com>

This patch description is inadequate.  It should tell us why you are
making this change.  Does it result in smaller and/or faster code, and
if so by how much on what sort of machine?  Do you think it makes the
code clearer?  (I don't.)  Or is there some other motivation for this?

Paul.

^ permalink raw reply

* Re: [PATCH 2.6.36] vlan: Avoid hwaccel vlan packets when vid not used
From: Eric Dumazet @ 2011-01-07  3:04 UTC (permalink / raw)
  To: Matt Carlson
  Cc: Jesse Gross, Michael Leun, Michael Chan, David Miller, Ben Greear,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20110107025930.GA17808@mcarlson.broadcom.com>

Le jeudi 06 janvier 2011 à 18:59 -0800, Matt Carlson a écrit :
> On Thu, Jan 06, 2011 at 06:43:22PM -0800, Eric Dumazet wrote:
> > Le vendredi 07 janvier 2011 ?? 03:41 +0100, Eric Dumazet a ??crit :
> > > Le jeudi 06 janvier 2011 ?? 18:29 -0800, Matt Carlson a ??crit :
> > > 
> > > > Hi Eric.  Sorry for the delay.  I was under the impression that your
> > > > problems were software related and that you just needed a revised
> > > > version of these VLAN patches I was sending to Michael.  Is this not
> > > > true?
> > > > 
> > > > Having a hardware stat increment suggests this is a new problem.
> > > > Maybe I missed it, but I didn't see what hardware you are working
> > > > with and whether or not management firmware was enabled.  Could you tell
> > > > me that info?
> > > > 
> > > 
> > > Hi Matt
> > > 
> > > I started a bisection, because I couldnt sleep tonight anyway :(
> > > 
> > > 14:04.0 Ethernet controller: Broadcom Corporation NetXtreme BCM5715S
> > > Gigabit Ethernet (rev a3)
> > > 	Subsystem: Hewlett-Packard Company NC326m PCIe Dual Port Adapter
> > > 	Flags: bus master, 66MHz, medium devsel, latency 64, IRQ 43
> > > 	Memory at fdff0000 (64-bit, non-prefetchable) [size=64K]
> > > 	Memory at fdfe0000 (64-bit, non-prefetchable) [size=64K]
> > > 	[virtual] Expansion ROM at fdbe0000 [disabled] [size=128K]
> > > 	Capabilities: [40] PCI-X non-bridge device
> > > 	Capabilities: [48] Power Management version 2
> > > 	Capabilities: [50] Vital Product Data
> > > 	Capabilities: [58] MSI: Enable+ Count=1/8 Maskable- 64bit+
> > > 	Kernel driver in use: tg3
> > > 	Kernel modules: tg3
> > > 
> > > 
> > 
> > $ ethtool -i eth2
> > driver: tg3
> > version: 3.115
> > firmware-version: 5715s-v3.28
> > bus-info: 0000:14:04.0
> > $ dmesg | grep ASF
> > [    6.220577] tg3 0000:14:04.0: eth2: RXcsums[1] LinkChgREG[0] MIirq[0]
> > ASF[0] TSOcap[1]
> > [    6.228586] tg3 0000:14:04.1: eth3: RXcsums[1] LinkChgREG[0] MIirq[0]
> > ASF[0] TSOcap[1]
> 
> Thanks.  So management firmware is disabled.  This should be
> straightforward case.
> 
> I'm wondering if I'm misunderstanding something though.  You said earlier
> that VLAN tagging doesn't work unless you applied my patch.  Is this no
> longer true?
> 

I dont apply your patch because Jesse said it was not a good patch ;)

Maybe I missed something and it must be applied ? Problem is : current
Linus tree now includes net-next-2.6 and vlan doesnt work. You should
resubmit it perhaps ?

^ permalink raw reply

* [net-next-2.6 PATCH v6 1/2] net: implement mechanism for HW based QOS
From: John Fastabend @ 2011-01-07  3:12 UTC (permalink / raw)
  To: davem, jarkao2
  Cc: hadi, eric.dumazet, shemminger, tgraf, bhutchings, nhorman,
	netdev

This patch provides a mechanism for lower layer devices to
steer traffic using skb->priority to tx queues. This allows
for hardware based QOS schemes to use the default qdisc without
incurring the penalties related to global state and the qdisc
lock. While reliably receiving skbs on the correct tx ring
to avoid head of line blocking resulting from shuffling in
the LLD. Finally, all the goodness from txq caching and xps/rps
can still be leveraged.

Many drivers and hardware exist with the ability to implement
QOS schemes in the hardware but currently these drivers tend
to rely on firmware to reroute specific traffic, a driver
specific select_queue or the queue_mapping action in the
qdisc.

By using select_queue for this drivers need to be updated for
each and every traffic type and we lose the goodness of much
of the upstream work. Firmware solutions are inherently
inflexible. And finally if admins are expected to build a
qdisc and filter rules to steer traffic this requires knowledge
of how the hardware is currently configured. The number of tx
queues and the queue offsets may change depending on resources.
Also this approach incurs all the overhead of a qdisc with filters.

With the mechanism in this patch users can set skb priority using
expected methods ie setsockopt() or the stack can set the priority
directly. Then the skb will be steered to the correct tx queues
aligned with hardware QOS traffic classes. In the normal case with
a single traffic class and all queues in this class everything
works as is until the LLD enables multiple tcs.

To steer the skb we mask out the lower 4 bits of the priority
and allow the hardware to configure upto 15 distinct classes
of traffic. This is expected to be sufficient for most applications
at any rate it is more then the 8021Q spec designates and is
equal to the number of prio bands currently implemented in
the default qdisc.

This in conjunction with a userspace application such as
lldpad can be used to implement 8021Q transmission selection
algorithms one of these algorithms being the extended transmission
selection algorithm currently being used for DCB.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/netdevice.h |   65 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c            |   52 +++++++++++++++++++++++++++++++++++-
 2 files changed, 116 insertions(+), 1 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0f6b1c9..12fff42 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -646,6 +646,14 @@ struct xps_dev_maps {
     (nr_cpu_ids * sizeof(struct xps_map *)))
 #endif /* CONFIG_XPS */
 
+#define TC_MAX_QUEUE	16
+#define TC_BITMASK	15
+/* HW offloaded queuing disciplines txq count and offset maps */
+struct netdev_tc_txq {
+	u16 count;
+	u16 offset;
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -756,6 +764,7 @@ struct xps_dev_maps {
  * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
  *			  struct nlattr *port[]);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
+ * void (*ndo_setup_tc)(struct net_device *dev, u8 tc)
  */
 #define HAVE_NET_DEVICE_OPS
 struct net_device_ops {
@@ -814,6 +823,8 @@ struct net_device_ops {
 						   struct nlattr *port[]);
 	int			(*ndo_get_vf_port)(struct net_device *dev,
 						   int vf, struct sk_buff *skb);
+	int			(*ndo_setup_tc)(struct net_device *dev, u8 tc,
+						unsigned int txq);
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	int			(*ndo_fcoe_enable)(struct net_device *dev);
 	int			(*ndo_fcoe_disable)(struct net_device *dev);
@@ -1146,6 +1157,9 @@ struct net_device {
 	/* Data Center Bridging netlink ops */
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
+	u8 num_tc;
+	struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
+	u8 prio_tc_map[TC_BITMASK + 1];
 
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	/* max exchange id for FCoE LRO by ddp */
@@ -1162,6 +1176,57 @@ struct net_device {
 #define	NETDEV_ALIGN		32
 
 static inline
+int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
+{
+	return dev->prio_tc_map[prio & TC_BITMASK];
+}
+
+static inline
+int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
+{
+	if (tc >= dev->num_tc)
+		return -EINVAL;
+
+	dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK;
+	return 0;
+}
+
+static inline
+void netdev_reset_tc(struct net_device *dev)
+{
+	dev->num_tc = 0;
+	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+
+static inline
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+	if (tc >= dev->num_tc)
+		return -EINVAL;
+
+	dev->tc_to_txq[tc].count = count;
+	dev->tc_to_txq[tc].offset = offset;
+	return 0;
+}
+
+static inline
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+	if (num_tc > TC_MAX_QUEUE)
+		return -EINVAL;
+
+	dev->num_tc = num_tc;
+	return 0;
+}
+
+static inline
+int netdev_get_num_tc(struct net_device *dev)
+{
+	return dev->num_tc;
+}
+
+static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 					 unsigned int index)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index a215269..12a2c2a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1593,6 +1593,45 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 	rcu_read_unlock();
 }
 
+/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this if the net_device supports ndo_setup_tc
+ * call the ops routine with the new queue number. If the ops is not
+ * available verify the tc mapping remains valid and if not NULL the
+ * mapping. With no priorities mapping to this offset/count pair it
+ * will no longer be used. In the worst case TC0 is invalid nothing
+ * can be done so disable priority mappings.
+ */
+void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (ops->ndo_setup_tc) {
+		ops->ndo_setup_tc(dev, dev->num_tc, txq);
+	} else {
+		int i;
+		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+		/* If TC0 is invalidated disable TC mapping */
+		if (tc->offset + tc->count > txq) {
+			dev->num_tc = 0;
+			return;
+		}
+
+		/* Invalidated prio to tc mappings set to TC0 */
+		for (i = 1; i < TC_BITMASK + 1; i++) {
+			int q = netdev_get_prio_tc_map(dev, i);
+			tc = &dev->tc_to_txq[q];
+
+			if (tc->offset + tc->count > txq)
+				netdev_set_prio_tc_map(dev, i, 0);
+		}
+	}
+}
+
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -1614,6 +1653,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 
 		if (txq < dev->real_num_tx_queues)
 			qdisc_reset_all_tx_gt(dev, txq);
+
+		if (dev->num_tc)
+			netif_setup_tc(dev, txq);
 	}
 
 	dev->real_num_tx_queues = txq;
@@ -2165,6 +2207,8 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 		  unsigned int num_tx_queues)
 {
 	u32 hash;
+	u16 qoffset = 0;
+	u16 qcount = num_tx_queues;
 
 	if (skb_rx_queue_recorded(skb)) {
 		hash = skb_get_rx_queue(skb);
@@ -2173,13 +2217,19 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 		return hash;
 	}
 
+	if (dev->num_tc) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+		qoffset = dev->tc_to_txq[tc].offset;
+		qcount = dev->tc_to_txq[tc].count;
+	}
+
 	if (skb->sk && skb->sk->sk_hash)
 		hash = skb->sk->sk_hash;
 	else
 		hash = (__force u16) skb->protocol ^ skb->rxhash;
 	hash = jhash_1word(hash, hashrnd);
 
-	return (u16) (((u64) hash * num_tx_queues) >> 32);
+	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
 }
 EXPORT_SYMBOL(__skb_tx_hash);
 


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox