* [RFC] bonding: better transmit hash
@ 2010-02-03 19:13 Stephen Hemminger
From: Stephen Hemminger @ 2010-02-03 19:13 UTC
  To: Jay Vosburgh, David Miller, Jiri Pirko; +Cc: bonding-devel, netdev

This is a prototype of improved bonding link hashing. It adds several
things:
   * support IPv6 addresses for L3/L4
   * support other protocols besides TCP/UDP
   * use the whole MAC address (not just the last byte)
   * use jhash for better mixing
   * use skb header field accessors to handle VLANs etc. properly

It is no longer a pure XOR; does that matter?


--- a/drivers/net/bonding/bond_main.c	2010-02-03 10:42:50.998328499 -0800
+++ b/drivers/net/bonding/bond_main.c	2010-02-03 11:08:35.034851960 -0800
@@ -3587,17 +3587,28 @@ void bond_unregister_arp(struct bonding 
  * Hash for the output device based upon layer 2 and layer 3 data. If
  * the packet is not IP mimic bond_xmit_hash_policy_l2()
  */
-static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
+static int bond_xmit_hash_policy_l23(const struct sk_buff *skb, int count)
 {
-	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);
+	u32 h;
 
-	if (skb->protocol == htons(ETH_P_IP)) {
-		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
-			(data->h_dest[5] ^ data->h_source[5])) % count;
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+	{
+		const struct iphdr *iph = ip_hdr(skb);
+		h = iph->daddr ^ iph->saddr ^ iph->protocol;
+		break;
+	}
+	case htons(ETH_P_IPV6):
+	{
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+		h = iph->saddr.s6_addr32[3] ^ iph->daddr.s6_addr32[3];
+		break;
+	}
+	default:
+		h = skb->protocol;
 	}
 
-	return (data->h_dest[5] ^ data->h_source[5]) % count;
+	return jhash(eth_hdr(skb), 2*ETH_ALEN, h) % count;
 }
 
 /*
@@ -3605,35 +3616,55 @@ static int bond_xmit_hash_policy_l23(str
  * the packet is a frag or not TCP or UDP, just use layer 3 data.  If it is
  * altogether not IP, mimic bond_xmit_hash_policy_l2()
  */
-static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
+static int bond_xmit_hash_policy_l34(const struct sk_buff *skb, int count)
 {
-	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);
-	__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
-	int layer4_xor = 0;
+	u32 h;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+	{
+		const struct iphdr *iph = ip_hdr(skb);
+		h = iph->saddr ^ iph->daddr;
 
-	if (skb->protocol == htons(ETH_P_IP)) {
-		if (!(iph->frag_off & htons(IP_MF|IP_OFFSET)) &&
+		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
 		    (iph->protocol == IPPROTO_TCP ||
-		     iph->protocol == IPPROTO_UDP)) {
-			layer4_xor = ntohs((*layer4hdr ^ *(layer4hdr + 1)));
-		}
-		return (layer4_xor ^
-			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
+		     iph->protocol == IPPROTO_UDP ||
+		     iph->protocol == IPPROTO_UDPLITE ||
+		     iph->protocol == IPPROTO_SCTP ||
+		     iph->protocol == IPPROTO_DCCP ||
+		     iph->protocol == IPPROTO_ESP))
+			h ^= *(((u32*)iph) + iph->ihl);
 
+		break;
+	}
+	case htons(ETH_P_IPV6):
+	{
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+		h = iph->daddr.s6_addr32[3] ^
+		        iph->saddr.s6_addr32[3] ^ iph->nexthdr;
+		if (iph->nexthdr == IPPROTO_TCP ||
+		    iph->nexthdr == IPPROTO_UDP ||
+		    iph->nexthdr == IPPROTO_UDPLITE ||
+		    iph->nexthdr == IPPROTO_SCTP ||
+		    iph->nexthdr == IPPROTO_DCCP ||
+		    iph->nexthdr == IPPROTO_ESP)
+			h ^= *(u32*)&iph[1];
+		break;
+	}
+	default:
+		h = ntohs(skb->protocol);
 	}
 
-	return (data->h_dest[5] ^ data->h_source[5]) % count;
+	return jhash(eth_hdr(skb), 2*ETH_ALEN, h) % count;
 }
 
 /*
  * Hash for the output device based upon layer 2 data
  */
-static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
+static int bond_xmit_hash_policy_l2(const struct sk_buff *skb, int count)
 {
-	struct ethhdr *data = (struct ethhdr *)skb->data;
-
-	return (data->h_dest[5] ^ data->h_source[5]) % count;
+	return jhash(eth_hdr(skb), 2*ETH_ALEN,
+		     ntohs(skb->protocol)) % count;
 }
 
 /*-------------------------- Device entry points ----------------------------*/
--- a/drivers/net/bonding/bonding.h	2010-02-03 11:07:43.694540137 -0800
+++ b/drivers/net/bonding/bonding.h	2010-02-03 11:07:59.294853950 -0800
@@ -204,7 +204,7 @@ struct bonding {
 #endif /* CONFIG_PROC_FS */
 	struct   list_head bond_list;
 	struct   dev_mc_list *mc_list;
-	int      (*xmit_hash_policy)(struct sk_buff *, int);
+	int      (*xmit_hash_policy)(const struct sk_buff *, int);
 	__be32   master_ip;
 	u16      flags;
 	u16      rr_tx_counter;


* Re: [RFC] bonding: better transmit hash
From: Jay Vosburgh @ 2010-02-03 20:09 UTC
  To: Stephen Hemminger; +Cc: David Miller, Jiri Pirko, bonding-devel, netdev

Stephen Hemminger <shemminger@vyatta.com> wrote:

>This is a prototype of improved bonding link hashing. It adds several
>things:
>   * support IPv6 addresses for L3/L4
>   * support other protocols besides TCP/UDP
>   * use the whole MAC address (not just the last byte)
>   * use jhash for better mixing
>   * use skb header field accessors to handle VLANs etc. properly
>
>It is no longer a pure XOR; does that matter?

	Maybe.  

	The layer3+4 algorithm in particular was set up to purposely
mimic the algorithm on specific switches.  I'm not sure if there are
users who have dependencies on any of the algorithms being a precise
match for a switch's algorithm (in order to have both directions of a
given flow end up on the same slave).

	Will the jhash preserve 802.3ad packet ordering requirements for
the layer2 and layer2+3 options (this is the property that all packets
for a given flow will be sent over the same slave)?

	How does the computational expense for the jhash compare to the
existing XOR stuff?
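
	Even a quick userspace comparison would be interesting, e.g.
something like the sketch below (just a sketch: it assumes jhash() has
been copied locally from include/linux/jhash.h, and none of the numbers
it prints come from this patch):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* assumed: local copy of the kernel's jhash() from include/linux/jhash.h */
uint32_t jhash(const void *key, uint32_t length, uint32_t initval);

int main(void)
{
	const unsigned char hdr[12] = {		/* dest MAC + source MAC */
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55,
		0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb
	};
	volatile uint32_t sink = 0;
	long i, n = 50 * 1000 * 1000;
	clock_t t;

	/* current style: last byte of each MAC (the constant xor may be
	 * partly hoisted by the optimiser, so treat this as a lower bound) */
	t = clock();
	for (i = 0; i < n; i++)
		sink ^= (uint32_t)(hdr[5] ^ hdr[11]);
	printf("xor:   %.1f ns/op\n",
	       (double)(clock() - t) / CLOCKS_PER_SEC / n * 1e9);

	/* proposed: jhash over both MAC addresses */
	t = clock();
	for (i = 0; i < n; i++)
		sink ^= jhash(hdr, sizeof(hdr), (uint32_t)i);
	printf("jhash: %.1f ns/op\n",
	       (double)(clock() - t) / CLOCKS_PER_SEC / n * 1e9);

	(void)sink;
	return 0;
}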

	I also noted one line, below, that removes whitespace.

	-J


>--- a/drivers/net/bonding/bond_main.c	2010-02-03 10:42:50.998328499 -0800
>+++ b/drivers/net/bonding/bond_main.c	2010-02-03 11:08:35.034851960 -0800
>@@ -3587,17 +3587,28 @@ void bond_unregister_arp(struct bonding 
>  * Hash for the output device based upon layer 2 and layer 3 data. If
>  * the packet is not IP mimic bond_xmit_hash_policy_l2()
>  */
>-static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
>+static int bond_xmit_hash_policy_l23(const struct sk_buff *skb, int count)
> {
>-	struct ethhdr *data = (struct ethhdr *)skb->data;
>-	struct iphdr *iph = ip_hdr(skb);
>+	u32 h;
>
>-	if (skb->protocol == htons(ETH_P_IP)) {
>-		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
>-			(data->h_dest[5] ^ data->h_source[5])) % count;
>+	switch (skb->protocol) {
>+	case htons(ETH_P_IP):
>+	{
>+		const struct iphdr *iph = ip_hdr(skb);
>+		h = iph->daddr ^ iph->saddr ^ iph->protocol;
>+		break;
>+	}
>+	case htons(ETH_P_IPV6):
>+	{
>+		const struct ipv6hdr *iph = ipv6_hdr(skb);
>+		h = iph->saddr.s6_addr32[3] ^ iph->daddr.s6_addr32[3];
>+		break;
>+	}
>+	default:
>+		h = skb->protocol;
> 	}
>
>-	return (data->h_dest[5] ^ data->h_source[5]) % count;
>+	return jhash(eth_hdr(skb), 2*ETH_ALEN, h) % count;
> }
>
> /*
>@@ -3605,35 +3616,55 @@ static int bond_xmit_hash_policy_l23(str
>  * the packet is a frag or not TCP or UDP, just use layer 3 data.  If it is
>  * altogether not IP, mimic bond_xmit_hash_policy_l2()
>  */
>-static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
>+static int bond_xmit_hash_policy_l34(const struct sk_buff *skb, int count)
> {
>-	struct ethhdr *data = (struct ethhdr *)skb->data;
>-	struct iphdr *iph = ip_hdr(skb);
>-	__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
>-	int layer4_xor = 0;
>+	u32 h;
>+
>+	switch (skb->protocol) {
>+	case htons(ETH_P_IP):
>+	{
>+		const struct iphdr *iph = ip_hdr(skb);
>+		h = iph->saddr ^ iph->daddr;
>
>-	if (skb->protocol == htons(ETH_P_IP)) {
>-		if (!(iph->frag_off & htons(IP_MF|IP_OFFSET)) &&
>+		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&

	Why squeeze out the spaces here?

> 		    (iph->protocol == IPPROTO_TCP ||
>-		     iph->protocol == IPPROTO_UDP)) {
>-			layer4_xor = ntohs((*layer4hdr ^ *(layer4hdr + 1)));
>-		}
>-		return (layer4_xor ^
>-			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
>+		     iph->protocol == IPPROTO_UDP ||
>+		     iph->protocol == IPPROTO_UDPLITE ||
>+		     iph->protocol == IPPROTO_SCTP ||
>+		     iph->protocol == IPPROTO_DCCP ||
>+		     iph->protocol == IPPROTO_ESP))
>+			h ^= *(((u32*)iph) + iph->ihl);
>
>+		break;
>+	}
>+	case htons(ETH_P_IPV6):
>+	{
>+		const struct ipv6hdr *iph = ipv6_hdr(skb);
>+		h = iph->daddr.s6_addr32[3] ^
>+		        iph->saddr.s6_addr32[3] ^ iph->nexthdr;
>+		if (iph->nexthdr == IPPROTO_TCP ||
>+		    iph->nexthdr == IPPROTO_UDP ||
>+		    iph->nexthdr == IPPROTO_UDPLITE ||
>+		    iph->nexthdr == IPPROTO_SCTP ||
>+		    iph->nexthdr == IPPROTO_DCCP ||
>+		    iph->nexthdr == IPPROTO_ESP)
>+			h ^= *(u32*)&iph[1];
>+		break;
>+	}
>+	default:
>+		h = ntohs(skb->protocol);
> 	}
>
>-	return (data->h_dest[5] ^ data->h_source[5]) % count;
>+	return jhash(eth_hdr(skb), 2*ETH_ALEN, h) % count;
> }
>
> /*
>  * Hash for the output device based upon layer 2 data
>  */
>-static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
>+static int bond_xmit_hash_policy_l2(const struct sk_buff *skb, int count)
> {
>-	struct ethhdr *data = (struct ethhdr *)skb->data;
>-
>-	return (data->h_dest[5] ^ data->h_source[5]) % count;
>+	return jhash(eth_hdr(skb), 2*ETH_ALEN,
>+		     ntohs(skb->protocol)) % count;
> }
>
> /*-------------------------- Device entry points ----------------------------*/
>--- a/drivers/net/bonding/bonding.h	2010-02-03 11:07:43.694540137 -0800
>+++ b/drivers/net/bonding/bonding.h	2010-02-03 11:07:59.294853950 -0800
>@@ -204,7 +204,7 @@ struct bonding {
> #endif /* CONFIG_PROC_FS */
> 	struct   list_head bond_list;
> 	struct   dev_mc_list *mc_list;
>-	int      (*xmit_hash_policy)(struct sk_buff *, int);
>+	int      (*xmit_hash_policy)(const struct sk_buff *, int);
> 	__be32   master_ip;
> 	u16      flags;
> 	u16      rr_tx_counter;


* Re: [RFC] bonding: better transmit hash
From: Jasper Spaans @ 2010-02-04  9:26 UTC
  To: Stephen Hemminger
  Cc: Jay Vosburgh, David Miller, Jiri Pirko,
	bonding-devel@lists.sourceforge.net, netdev@vger.kernel.org

On 03/02/10 19:13, Stephen Hemminger wrote:
> This is a prototype of improved bonding link hashing. It adds several
> things:
>    * support IPv6 addresses for L3/L4
>    * support other protocols besides TCP/UDP
>    * use the whole MAC address (not just the last byte)
>    * use jhash for better mixing
>    * use skb header field accessors to handle VLANs etc. properly
>
> It is no longer a pure XOR; does that matter?
>   
The goal is to distribute packets when load is high - so a bit of
efficiency would be nice. Looking at jhash, it seems to use ~60
operations per round of hashing instead of just one xor.
Besides, jhash (the generic function) hashes your data in chunks of
12 bytes and finishes off with another round of hashing. You might want
to use the specialized jhash_3words function in those cases, which only
does one round.
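
Something along these lines, e.g. for the IPv4 part of the layer3+4
policy (untested sketch only; which fields to feed in is my assumption,
not something taken from your patch):

/*
 * Untested sketch: one jhash_3words() round for the IPv4 layer3+4
 * case.  Needs <linux/jhash.h>; TCP/UDP only here for brevity.
 */
static int l34_ipv4_hash_sketch(const struct sk_buff *skb, int count)
{
	const struct iphdr *iph = ip_hdr(skb);
	u32 ports = 0;

	if (!(iph->frag_off & htons(IP_MF | IP_OFFSET)) &&
	    (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP))
		ports = *((const u32 *)iph + iph->ihl);	/* src+dst port words */

	return jhash_3words((__force u32)iph->saddr,
			    (__force u32)iph->daddr,
			    ports, iph->protocol) % count;
}
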
> --- a/drivers/net/bonding/bond_main.c	2010-02-03 10:42:50.998328499 -0800
> +++ b/drivers/net/bonding/bond_main.c	2010-02-03 11:08:35.034851960 -0800
> @@ -3587,17 +3587,28 @@ void bond_unregister_arp(struct bonding 
>   * Hash for the output device based upon layer 2 and layer 3 data. If
>   * the packet is not IP mimic bond_xmit_hash_policy_l2()
>   */
> -static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
> +static int bond_xmit_hash_policy_l23(const struct sk_buff *skb, int count)
>  {
> -	struct ethhdr *data = (struct ethhdr *)skb->data;
> -	struct iphdr *iph = ip_hdr(skb);
> +	u32 h;
>  
> -	if (skb->protocol == htons(ETH_P_IP)) {
> -		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
> -			(data->h_dest[5] ^ data->h_source[5])) % count;
> +	switch (skb->protocol) {
> +	case htons(ETH_P_IP):
> +	{
> +		const struct iphdr *iph = ip_hdr(skb);
> +		h = iph->daddr ^ iph->saddr ^ iph->protocol;
> +		break;
> +	}
> +	case htons(ETH_P_IPV6):
> +	{
> +		const struct ipv6hdr *iph = ipv6_hdr(skb);
> +		h = iph->saddr.s6_addr32[3] ^ iph->daddr.s6_addr32[3];
> +		break;
> +	}
> +	default:
> +		h = skb->protocol;
>  	}
>  
> -	return (data->h_dest[5] ^ data->h_source[5]) % count;
> +	return jhash(eth_hdr(skb), 2*ETH_ALEN, h) % count;
>  }
>   
This breaks stuff - in the layer 3 case, you're suddenly putting layer 2
data into the hash, and in the layer 2 case, you're putting the protocol
into the hash. Especially the first case has the potential to break our
setup, in which related traffic might end up at different interfaces.

>  /*
> @@ -3605,35 +3616,55 @@ static int bond_xmit_hash_policy_l23(str
>   * the packet is a frag or not TCP or UDP, just use layer 3 data.  If it is
>   * altogether not IP, mimic bond_xmit_hash_policy_l2()
>   */
> -static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
> +static int bond_xmit_hash_policy_l34(const struct sk_buff *skb, int count)
>  {
> -	struct ethhdr *data = (struct ethhdr *)skb->data;
> -	struct iphdr *iph = ip_hdr(skb);
> -	__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
> -	int layer4_xor = 0;
> +	u32 h;
> +
> +	switch (skb->protocol) {
> +	case htons(ETH_P_IP):
> +	{
> +		const struct iphdr *iph = ip_hdr(skb);
> +		h = iph->saddr ^ iph->daddr;
>  
> -	if (skb->protocol == htons(ETH_P_IP)) {
> -		if (!(iph->frag_off & htons(IP_MF|IP_OFFSET)) &&
> +		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
>  		    (iph->protocol == IPPROTO_TCP ||
> -		     iph->protocol == IPPROTO_UDP)) {
> -			layer4_xor = ntohs((*layer4hdr ^ *(layer4hdr + 1)));
> -		}
> -		return (layer4_xor ^
> -			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
> +		     iph->protocol == IPPROTO_UDP ||
> +		     iph->protocol == IPPROTO_UDPLITE ||
> +		     iph->protocol == IPPROTO_SCTP ||
> +		     iph->protocol == IPPROTO_DCCP ||
> +		     iph->protocol == IPPROTO_ESP))
> +			h ^= *(((u32*)iph) + iph->ihl);
>  
> +		break;
> +	}
> +	case htons(ETH_P_IPV6):
> +	{
> +		const struct ipv6hdr *iph = ipv6_hdr(skb);
> +		h = iph->daddr.s6_addr32[3] ^
> +		        iph->saddr.s6_addr32[3] ^ iph->nexthdr;
> +		if (iph->nexthdr == IPPROTO_TCP ||
> +		    iph->nexthdr == IPPROTO_UDP ||
> +		    iph->nexthdr == IPPROTO_UDPLITE ||
> +		    iph->nexthdr == IPPROTO_SCTP ||
> +		    iph->nexthdr == IPPROTO_DCCP ||
> +		    iph->nexthdr == IPPROTO_ESP)
> +			h ^= *(u32*)&iph[1];
> +		break;
> +	}
> +	default:
> +		h = ntohs(skb->protocol);
>  	}
>  
> -	return (data->h_dest[5] ^ data->h_source[5]) % count;
> +	return jhash(eth_hdr(skb), 2*ETH_ALEN, h) % count;
>  }
>   
I like the support for IPv6 and more protocols - but again, why use
skb->protocol as the initializer? Also, why mix iph->nexthdr into h in
the IPv6 case?

>  
>  /*
>   * Hash for the output device based upon layer 2 data
>   */
> -static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
> +static int bond_xmit_hash_policy_l2(const struct sk_buff *skb, int count)
>  {
> -	struct ethhdr *data = (struct ethhdr *)skb->data;
> -
> -	return (data->h_dest[5] ^ data->h_source[5]) % count;
> +	return jhash(eth_hdr(skb), 2*ETH_ALEN,
> +		     ntohs(skb->protocol)) % count;
>  } 
>   
This one also needlessly incorporates the protocol into the hash.

On a meta-level, do you have any measurements showing biased output for
real traffic? My experience is that the normal xor code works just fine,
except for one specific setup in which the traffic itself is rather
biased (tons of IPsec between just two hosts + less normal traffic
between the rest of the network). In that case, it will also be
difficult to distribute this load, even with the above changes.
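
In case it helps: a counting sketch like the one below, fed with address
pairs pulled from a capture, is roughly how I'd look for bias. It is
only a sketch and assumes a local copy of jhash_2words() from
include/linux/jhash.h.

#include <stdio.h>
#include <inttypes.h>

/* assumed: local copy of jhash_2words() from include/linux/jhash.h */
uint32_t jhash_2words(uint32_t a, uint32_t b, uint32_t initval);

#define SLAVES 4

int main(void)
{
	unsigned long xor_hist[SLAVES] = { 0 }, jhash_hist[SLAVES] = { 0 };
	uint32_t s, d;
	int i;

	/* stdin: one "saddr daddr" pair per line, hex, host byte order */
	while (scanf("%" SCNx32 " %" SCNx32, &s, &d) == 2) {
		xor_hist[((s ^ d) & 0xffff) % SLAVES]++;	/* old fold */
		jhash_hist[jhash_2words(s, d, 0) % SLAVES]++;	/* jhash */
	}

	for (i = 0; i < SLAVES; i++)
		printf("slave %d: xor %lu  jhash %lu\n",
		       i, xor_hist[i], jhash_hist[i]);
	return 0;
}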

Jasper

-- 
Ir. Jasper Spaans
Fox-IT Experts in IT Security!
T: +31 (0) 15 284 79 99
KvK Haaglanden 27301624



