Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH] bridge: add a missing ntohs()
From: Herbert Xu @ 2010-04-20 13:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1271769605.7895.14.camel@edumazet-laptop>

On Tue, Apr 20, 2010 at 03:20:05PM +0200, Eric Dumazet wrote:
> Found by code review and sparse endianness warning, I am not sure this
> patch is valid.
> 
> Thanks
> 
> [PATCH] bridge: add a missing ntohs()
> 
> grec_nsrcs is in network order, we should convert to host horder in
> br_multicast_igmp3_report()
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>

Thanks Eric!
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH] bridge: add a missing ntohs()
From: Eric Dumazet @ 2010-04-20 13:20 UTC (permalink / raw)
  To: Herbert Xu, David Miller; +Cc: netdev

Found by code review and sparse endianness warning, I am not sure this
patch is valid.

Thanks

[PATCH] bridge: add a missing ntohs()

grec_nsrcs is in network order, we should convert to host horder in
br_multicast_igmp3_report()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f29ada8..386c153 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -727,7 +727,7 @@ static int br_multicast_igmp3_report(struct net_bridge *br,
 		group = grec->grec_mca;
 		type = grec->grec_type;
 
-		len += grec->grec_nsrcs * 4;
+		len += ntohs(grec->grec_nsrcs) * 4;
 		if (!pskb_may_pull(skb, len))
 			return -EINVAL;
 




^ permalink raw reply related

* [PATCH net-next-2.6] net: Fix various endianness glitches
From: Eric Dumazet @ 2010-04-20 13:18 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Sparse can help us find endianness bugs, but we need to make some
cleanups to be able to more easily spot real bugs.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
net/bridge/br_multicast.c |    2 +-
net/bridge/br_private.h   |   15 ++++++++-------
net/ethernet/eth.c        |    2 +-
net/ipv4/af_inet.c        |    8 ++++----
net/ipv4/ipmr.c           |   10 +++++-----
net/ipv4/route.c          |   29 ++++++++++++++---------------
net/ipv4/tcp.c            |   15 ++++++++-------
net/ipv4/tcp_ipv4.c       |    4 ++--
net/ipv4/tcp_output.c     |    4 ++--
net/ipv4/udp.c            |    8 ++++----
net/ipv6/addrconf.c       |    3 ++-
net/ipv6/ip6_fib.c        |    3 ++-
net/ipv6/tcp_ipv6.c       |    4 ++--
net/ipv6/udp.c            |    4 ++--
net/sched/sch_sfq.c       |   10 +++++-----
net/sunrpc/xprt.c         |    2 +-
net/xfrm/xfrm_hash.h      |    3 ++-
17 files changed, 65 insertions(+), 61 deletions(-) 

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 3fe86ff..61e1d10 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -29,7 +29,7 @@
 
 static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb, __be32 ip)
 {
-	return jhash_1word(mdb->secret, (u32)ip) & (mdb->max - 1);
+	return jhash_1word(mdb->secret, (__force u32)ip) & (mdb->max - 1);
 }
 
 static struct net_bridge_mdb_entry *__br_mdb_ip_get(
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 791d4ab..63181e4 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -130,19 +130,20 @@ struct net_bridge_port
 #endif
 };
 
+struct br_cpu_netstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
 struct net_bridge
 {
 	spinlock_t			lock;
 	struct list_head		port_list;
 	struct net_device		*dev;
 
-	struct br_cpu_netstats __percpu {
-		unsigned long	rx_packets;
-		unsigned long	rx_bytes;
-		unsigned long	tx_packets;
-		unsigned long	tx_bytes;
-	} *stats;
-
+	struct br_cpu_netstats __percpu *stats;
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 	unsigned long			feature_mask;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 205a1c1..3584696 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -136,7 +136,7 @@ int eth_rebuild_header(struct sk_buff *skb)
 	default:
 		printk(KERN_DEBUG
 		       "%s: unable to resolve type %X addresses.\n",
-		       dev->name, (int)eth->h_proto);
+		       dev->name, (__force int)eth->h_proto);
 
 		memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
 		break;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c5376c7..cb0de78 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1323,8 +1323,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
 		goto out_unlock;
 
-	id = ntohl(*(u32 *)&iph->id);
-	flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+	id = ntohl(*(__be32 *)&iph->id);
+	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
 	id >>= 16;
 
 	for (p = *head; p; p = p->next) {
@@ -1337,8 +1337,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 
 		if ((iph->protocol ^ iph2->protocol) |
 		    (iph->tos ^ iph2->tos) |
-		    (iph->saddr ^ iph2->saddr) |
-		    (iph->daddr ^ iph2->daddr)) {
+		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
 			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
 		}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 7d8a2bc..a2df501 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1772,10 +1772,10 @@ int ip_mr_input(struct sk_buff *skb)
 
 		vif = ipmr_find_vif(mrt, skb->dev);
 		if (vif >= 0) {
-			int err = ipmr_cache_unresolved(mrt, vif, skb);
+			int err2 = ipmr_cache_unresolved(mrt, vif, skb);
 			read_unlock(&mrt_lock);
 
-			return err;
+			return err2;
 		}
 		read_unlock(&mrt_lock);
 		kfree_skb(skb);
@@ -2227,9 +2227,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 		const struct ipmr_mfc_iter *it = seq->private;
 		const struct mr_table *mrt = it->mrt;
 
-		seq_printf(seq, "%08lX %08lX %-3hd",
-			   (unsigned long) mfc->mfc_mcastgrp,
-			   (unsigned long) mfc->mfc_origin,
+		seq_printf(seq, "%08X %08X %-3hd",
+			   (__force u32) mfc->mfc_mcastgrp,
+			   (__force u32) mfc->mfc_origin,
 			   mfc->mfc_parent);
 
 		if (it->cache != &mrt->mfc_unres_queue) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index cb562fd..a947428 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -258,10 +258,9 @@ static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 	(__raw_get_cpu_var(rt_cache_stat).field++)
 
 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
-		int genid)
+				   int genid)
 {
-	return jhash_3words((__force u32)(__be32)(daddr),
-			    (__force u32)(__be32)(saddr),
+	return jhash_3words((__force u32)daddr, (__force u32)saddr,
 			    idx, genid)
 		& rt_hash_mask;
 }
@@ -378,12 +377,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 		struct rtable *r = v;
 		int len;
 
-		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
-			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
+		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
+			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 			r->u.dst.dev ? r->u.dst.dev->name : "*",
-			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
+			(__force u32)r->rt_dst,
+			(__force u32)r->rt_gateway,
 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
-			r->u.dst.__use, 0, (unsigned long)r->rt_src,
+			r->u.dst.__use, 0, (__force u32)r->rt_src,
 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 			dst_metric(&r->u.dst, RTAX_WINDOW),
@@ -685,18 +685,17 @@ static inline bool rt_caching(const struct net *net)
 static inline bool compare_hash_inputs(const struct flowi *fl1,
 					const struct flowi *fl2)
 {
-	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
-		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
+	return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
+		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
 		(fl1->iif ^ fl2->iif)) == 0);
 }
 
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
-	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
-		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
+	return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
+		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
 		(fl1->mark ^ fl2->mark) |
-		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
-		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
+		(*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
 		(fl1->oif ^ fl2->oif) |
 		(fl1->iif ^ fl2->iif)) == 0;
 }
@@ -2319,8 +2318,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rcu_read_lock();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
-		if (((rth->fl.fl4_dst ^ daddr) |
-		     (rth->fl.fl4_src ^ saddr) |
+		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
+		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
 		     (rth->fl.iif ^ iif) |
 		     rth->fl.oif |
 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0f8caf6..d3ad2fa 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2721,7 +2721,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	struct tcphdr *th2;
 	unsigned int len;
 	unsigned int thlen;
-	unsigned int flags;
+	__be32 flags;
 	unsigned int mss = 1;
 	unsigned int hlen;
 	unsigned int off;
@@ -2771,10 +2771,10 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 found:
 	flush = NAPI_GRO_CB(p)->flush;
-	flush |= flags & TCP_FLAG_CWR;
-	flush |= (flags ^ tcp_flag_word(th2)) &
-		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
-	flush |= th->ack_seq ^ th2->ack_seq;
+	flush |= (__force int)(flags & TCP_FLAG_CWR);
+	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
 	for (i = sizeof(*th); i < thlen; i += 4)
 		flush |= *(u32 *)((u8 *)th + i) ^
 			 *(u32 *)((u8 *)th2 + i);
@@ -2795,8 +2795,9 @@ found:
 
 out_check_final:
 	flush = len < mss;
-	flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
-			  TCP_FLAG_SYN | TCP_FLAG_FIN);
+	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+					TCP_FLAG_RST | TCP_FLAG_SYN |
+					TCP_FLAG_FIN));
 
 	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
 		pp = head;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ad08392..4d6717d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1286,8 +1286,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 			goto drop_and_release;
 
 		/* Secret recipe starts with IP addresses */
-		*mess++ ^= daddr;
-		*mess++ ^= saddr;
+		*mess++ ^= (__force u32)daddr;
+		*mess++ ^= (__force u32)saddr;
 
 		/* plus variable length Initiator Cookie */
 		c = (u8 *)mess;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2b7d71f..429ad92 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -861,7 +861,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 			th->urg_ptr = htons(tp->snd_up - tcb->seq);
 			th->urg = 1;
 		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
-			th->urg_ptr = 0xFFFF;
+			th->urg_ptr = htons(0xFFFF);
 			th->urg = 1;
 		}
 	}
@@ -2485,7 +2485,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 			*tail-- ^= TCP_SKB_CB(skb)->seq + 1;
 
 			/* recommended */
-			*tail-- ^= ((th->dest << 16) | th->source);
+			*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
 			*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
 
 			sha_transform((__u32 *)&xvp->cookie_bakery[0],
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 666b963..1e18f9c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -307,13 +307,13 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
 static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
 				       unsigned int port)
 {
-	return jhash_1word(saddr, net_hash_mix(net)) ^ port;
+	return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
 }
 
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
 {
 	unsigned int hash2_nulladdr =
-		udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum);
+		udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
 	unsigned int hash2_partial =
 		udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
 
@@ -466,14 +466,14 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 					  daddr, hnum, dif,
 					  hslot2, slot2);
 		if (!result) {
-			hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum);
+			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
 			slot2 = hash2 & udptable->mask;
 			hslot2 = &udptable->hash2[slot2];
 			if (hslot->count < hslot2->count)
 				goto begin;
 
 			result = udp4_lib_lookup2(net, saddr, sport,
-						  INADDR_ANY, hnum, dif,
+						  htonl(INADDR_ANY), hnum, dif,
 						  hslot2, slot2);
 		}
 		rcu_read_unlock();
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7cba884..34d2d64 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -588,7 +588,8 @@ static u32 ipv6_addr_hash(const struct in6_addr *addr)
 	 * We perform the hash function over the last 64 bits of the address
 	 * This will include the IEEE address token on links that support it.
 	 */
-	return jhash_2words(addr->s6_addr32[2],  addr->s6_addr32[3], 0)
+	return jhash_2words((__force u32)addr->s6_addr32[2],
+			    (__force u32)addr->s6_addr32[3], 0)
 		& (IN6_ADDR_HSIZE - 1);
 }
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index dc6e0b8..92a122b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -144,7 +144,8 @@ static __inline__ __be32 addr_bit_set(void *token, int fn_bit)
 	 *	htonl(1 << ((~fn_bit)&0x1F))
 	 * See include/asm-generic/bitops/le.h.
 	 */
-	return (1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5];
+	return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
+	       addr[fn_bit >> 5];
 }
 
 static __inline__ struct fib6_node * node_alloc(void)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index bd5ef7b..a92b4a5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1234,12 +1234,12 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 			goto drop_and_free;
 
 		/* Secret recipe starts with IP addresses */
-		d = &ipv6_hdr(skb)->daddr.s6_addr32[0];
+		d = (__force u32 *)&ipv6_hdr(skb)->daddr.s6_addr32[0];
 		*mess++ ^= *d++;
 		*mess++ ^= *d++;
 		*mess++ ^= *d++;
 		*mess++ ^= *d++;
-		d = &ipv6_hdr(skb)->saddr.s6_addr32[0];
+		d = (__force u32 *)&ipv6_hdr(skb)->saddr.s6_addr32[0];
 		*mess++ ^= *d++;
 		*mess++ ^= *d++;
 		*mess++ ^= *d++;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9082485..92bf903 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -91,9 +91,9 @@ static unsigned int udp6_portaddr_hash(struct net *net,
 	if (ipv6_addr_any(addr6))
 		hash = jhash_1word(0, mix);
 	else if (ipv6_addr_v4mapped(addr6))
-		hash = jhash_1word(addr6->s6_addr32[3], mix);
+		hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
 	else
-		hash = jhash2(addr6->s6_addr32, 4, mix);
+		hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
 
 	return hash ^ port;
 }
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index c5a9ac5..c657628 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -123,8 +123,8 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 	case htons(ETH_P_IP):
 	{
 		const struct iphdr *iph = ip_hdr(skb);
-		h = iph->daddr;
-		h2 = iph->saddr ^ iph->protocol;
+		h = (__force u32)iph->daddr;
+		h2 = (__force u32)iph->saddr ^ iph->protocol;
 		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
 		    (iph->protocol == IPPROTO_TCP ||
 		     iph->protocol == IPPROTO_UDP ||
@@ -138,8 +138,8 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 	case htons(ETH_P_IPV6):
 	{
 		struct ipv6hdr *iph = ipv6_hdr(skb);
-		h = iph->daddr.s6_addr32[3];
-		h2 = iph->saddr.s6_addr32[3] ^ iph->nexthdr;
+		h = (__force u32)iph->daddr.s6_addr32[3];
+		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
 		if (iph->nexthdr == IPPROTO_TCP ||
 		    iph->nexthdr == IPPROTO_UDP ||
 		    iph->nexthdr == IPPROTO_UDPLITE ||
@@ -150,7 +150,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 		break;
 	}
 	default:
-		h = (unsigned long)skb_dst(skb) ^ skb->protocol;
+		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
 		h2 = (unsigned long)skb->sk;
 	}
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 42f09ad..699ade6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -974,7 +974,7 @@ void xprt_reserve(struct rpc_task *task)
 
 static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
 {
-	return xprt->xid++;
+	return (__force __be32)xprt->xid++;
 }
 
 static inline void xprt_init_xid(struct rpc_xprt *xprt)
diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
index e5195c9..1396572 100644
--- a/net/xfrm/xfrm_hash.h
+++ b/net/xfrm/xfrm_hash.h
@@ -16,7 +16,8 @@ static inline unsigned int __xfrm6_addr_hash(xfrm_address_t *addr)
 
 static inline unsigned int __xfrm4_daddr_saddr_hash(xfrm_address_t *daddr, xfrm_address_t *saddr)
 {
-	return ntohl(daddr->a4 + saddr->a4);
+	u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4;
+	return ntohl((__force __be32)sum);
 }
 
 static inline unsigned int __xfrm6_daddr_saddr_hash(xfrm_address_t *daddr, xfrm_address_t *saddr)




^ permalink raw reply related

* Re: [PATCH net-next-2.6] rps: consistent rxhash
From: Eric Dumazet @ 2010-04-20 13:16 UTC (permalink / raw)
  To: Franco Fichtner; +Cc: Changli Gao, David Miller, therbert, netdev
In-Reply-To: <4BCDA2B5.4060609@lastsummer.de>

Le mardi 20 avril 2010 à 14:48 +0200, Franco Fichtner a écrit :

> 
> I thought about this for some time...
> 
> Do we really need the port numbers here at all? A simple
> addr1^addr2 can provide a good enough pointer for
> distribution amongst CPUs.
> 
> The real connection tracking is better done locally at the
> corresponding CPU. That way a potential cache miss can be
> avoided and the still needed hash calculation for
> connection tracking will be offloaded.
> 

Yes, doing the port test/swap is useful in the loopback case 
(addr1 == addr2).

This is probably a bit convoluted, but David (and me) found this
funny ;)



^ permalink raw reply

* Re: rps perfomance WAS(Re: rps: question
From: Eric Dumazet @ 2010-04-20 13:13 UTC (permalink / raw)
  To: hadi; +Cc: Changli Gao, Rick Jones, David Miller, therbert, netdev, robert,
	andi
In-Reply-To: <1271764941.3735.94.camel@bigi>

Le mardi 20 avril 2010 à 08:02 -0400, jamal a écrit : 
> folks,
> 
> Thanks to everybody (Eric stands out) for your patience. 
> I ended mostly validating whats already been said. I have a lot of data
> and can describe in details how i tested etc but it would require
> patience in reading, so i will spare you;-> If you are interested let me
> know and i will be happy to share.
> 
> Summary is: 
> -rps good, gives higher throughput for apps
> -rps not so good, latency worse but gets better with higher input rate
> or increasing number of flows (which translates to higher pps)
> -rps works well with newer hardware that has better cache structures.
> [Gives great results on my test machine a Nehalem single processor, 4
> cores each with two SMT threads that has a shared L2 between threads and
> a shared L3 between cores]. 
> Your selection of what the demux cpu is and where the target cpus are is
> an influencing factor in the latency results. If you have a system with
> multiple sockets, you should get better numbers if you stay within the
> same socket relative to going across sockets.
> -rps does a better job at helping schedule apps on same cpu thus
> localizing the app. The throughput results with rps are very consistent
> and better whereas in non-rps case, variance is _high_.
> 
> My next step is to do some forwarding tests - probably next week. I am
> concerned here because i expect the cache misses to be higher than the
> app scenario (netdev structure and attributes could be touched by many
> cpus)
> 

Hi Jamal

I think your tests are very interesting, maybe could you publish them
somehow ? (I forgot to thank you about the previous report and nice
graph)

perf reports would be good too to help to spot hot points.




^ permalink raw reply

* Re: [PATCH net-next-2.6] rps: consistent rxhash
From: Franco Fichtner @ 2010-04-20 12:48 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Changli Gao, David Miller, therbert, netdev
In-Reply-To: <1271750198.3845.216.camel@edumazet-laptop>

Eric Dumazet wrote:
> In case we compute a software skb->rxhash, we can generate a consistent
> hash : Its value will be the same in both flow directions.
>
> This helps some workloads, like conntracking, since the same state needs
> to be accessed in both directions.
>
> tbench + RFS + this patch gives better results than tbench with default
> kernel configuration (no RPS, no RFS)
>
> Also fixed some sparse warnings.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---

I thought about this for some time...

Do we really need the port numbers here at all? A simple
addr1^addr2 can provide a good enough pointer for
distribution amongst CPUs.

The real connection tracking is better done locally at the
corresponding CPU. That way a potential cache miss can be
avoided and the still needed hash calculation for
connection tracking will be offloaded.


Franco


^ permalink raw reply

* [PATCH] net: ipv6 bind to device issue
From: Jiri Olsa @ 2010-04-20 12:46 UTC (permalink / raw)
  To: davem, kuznet, pekkas, jmorris, yoshfuji, kaber, eric.dumazet
  Cc: netdev, Jiri Olsa

hi,

The issue raises when having 2 NICs both assigned the same
IPv6 global address.

If a sender binds to a particular NIC (SO_BINDTODEVICE),
the outgoing traffic is being sent via the first found.
The bonded device is thus not taken into an account during the
routing.


>From the ip6_route_output function:

If the binding address is multicast, linklocal or loopback,
the RT6_LOOKUP_F_IFACE bit is set, but not for global address.

So binding global address will neglect SO_BINDTODEVICE-binded device,
because the fib6_rule_lookup function path won't check for the
flowi::oif field and take first route that fits.

Following patch should handle the issue.

wbr,
jirka


Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c2438e8..7bf7717 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -815,7 +815,7 @@ struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
 {
 	int flags = 0;
 
-	if (rt6_need_strict(&fl->fl6_dst))
+	if (rt6_need_strict(&fl->fl6_dst) || fl->oif)
 		flags |= RT6_LOOKUP_F_IFACE;
 
 	if (!ipv6_addr_any(&fl->fl6_src))

^ permalink raw reply related

* Re: rps perfomance WAS(Re: rps: question
From: jamal @ 2010-04-20 12:02 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, Rick Jones, David Miller, therbert, netdev, robert,
	andi
In-Reply-To: <1271590476.16881.4925.camel@edumazet-laptop>

folks,

Thanks to everybody (Eric stands out) for your patience. 
I ended mostly validating whats already been said. I have a lot of data
and can describe in details how i tested etc but it would require
patience in reading, so i will spare you;-> If you are interested let me
know and i will be happy to share.

Summary is: 
-rps good, gives higher throughput for apps
-rps not so good, latency worse but gets better with higher input rate
or increasing number of flows (which translates to higher pps)
-rps works well with newer hardware that has better cache structures.
[Gives great results on my test machine a Nehalem single processor, 4
cores each with two SMT threads that has a shared L2 between threads and
a shared L3 between cores]. 
Your selection of what the demux cpu is and where the target cpus are is
an influencing factor in the latency results. If you have a system with
multiple sockets, you should get better numbers if you stay within the
same socket relative to going across sockets.
-rps does a better job at helping schedule apps on same cpu thus
localizing the app. The throughput results with rps are very consistent
and better whereas in non-rps case, variance is _high_.

My next step is to do some forwarding tests - probably next week. I am
concerned here because i expect the cache misses to be higher than the
app scenario (netdev structure and attributes could be touched by many
cpus)

cheers,
jamal


^ permalink raw reply

* Re: A possible bug in reqsk_queue_hash_req()
From: Eric Dumazet @ 2010-04-20 11:06 UTC (permalink / raw)
  To: Li Yu; +Cc: netdev, linux-kernel
In-Reply-To: <i2x9b948ee41004200335vc229a59cvc0a08c35c949d7dd@mail.gmail.com>

Le mardi 20 avril 2010 à 18:35 +0800, Li Yu a écrit :
> Hi,
> 
>      I found out a possible bug in reqsk_queue_hash_req(), it seem
> that we should move "req->dl_next = lopt->syn_table[hash];" statement
> into follow write lock protected scope.
> 
>      As I browsed source code, this function only can be call at rx
> code path which is protected a spin lock over struct sock , but its
> caller (  inet_csk_reqsk_queue_hash_add() ) is a GPL exported symbol,
> so I think that we'd best move this statement into below write lock
> protected scope.
> 
>      Below is the patch to play this change, please do not apply it on
> source code, it's just for show.
> 
>     Thanks.
> 
> Yu
> 
> --- include/net/request_sock.h  2010-04-09 15:27:14.000000000 +0800
> +++ include/net/request_sock.h        2010-04-20 18:11:32.000000000 +0800
> @@ -247,9 +247,9 @@ static inline void reqsk_queue_hash_req(
>         req->expires = jiffies + timeout;
>         req->retrans = 0;
>         req->sk = NULL;
> -       req->dl_next = lopt->syn_table[hash];
> 
>         write_lock(&queue->syn_wait_lock);
> +       req->dl_next = lopt->syn_table[hash];
>         lopt->syn_table[hash] = req;
>         write_unlock(&queue->syn_wait_lock);
>  }

I believe its not really necessary, because we are the only possible
writer at this stage.

The write_lock() ... write_unlock() is there only to enforce a
synchronisation with readers.

All callers of this reqsk_queue_hash_req() must have the socket locked




^ permalink raw reply

* A possible bug in reqsk_queue_hash_req()
From: Li Yu @ 2010-04-20 10:35 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

Hi,

     I found out a possible bug in reqsk_queue_hash_req(), it seem
that we should move "req->dl_next = lopt->syn_table[hash];" statement
into follow write lock protected scope.

     As I browsed source code, this function only can be call at rx
code path which is protected a spin lock over struct sock , but its
caller (  inet_csk_reqsk_queue_hash_add() ) is a GPL exported symbol,
so I think that we'd best move this statement into below write lock
protected scope.

     Below is the patch to play this change, please do not apply it on
source code, it's just for show.

    Thanks.

Yu

--- include/net/request_sock.h  2010-04-09 15:27:14.000000000 +0800
+++ include/net/request_sock.h        2010-04-20 18:11:32.000000000 +0800
@@ -247,9 +247,9 @@ static inline void reqsk_queue_hash_req(
        req->expires = jiffies + timeout;
        req->retrans = 0;
        req->sk = NULL;
-       req->dl_next = lopt->syn_table[hash];

        write_lock(&queue->syn_wait_lock);
+       req->dl_next = lopt->syn_table[hash];
        lopt->syn_table[hash] = req;
        write_unlock(&queue->syn_wait_lock);
 }

^ permalink raw reply

* [PATCH] NET: Fix an RCU warning in dev_pick_tx()
From: David Howells @ 2010-04-20 10:25 UTC (permalink / raw)
  To: netdev; +Cc: dhowells

Fix the following RCU warning in dev_pick_tx():

===================================================
[ INFO: suspicious rcu_dereference_check() usage. ]
---------------------------------------------------
net/core/dev.c:1993 invoked rcu_dereference_check() without protection!

other info that might help us debug this:


rcu_scheduler_active = 1, debug_locks = 0
2 locks held by swapper/0:
 #0:  (&idev->mc_ifc_timer){+.-...}, at: [<ffffffff81039e65>] run_timer_softirq+0x17b/0x278
 #1:  (rcu_read_lock_bh){.+....}, at: [<ffffffff812ea3eb>] dev_queue_xmit+0x14e/0x4dc

stack backtrace:
Pid: 0, comm: swapper Not tainted 2.6.34-rc5-cachefs #4
Call Trace:
 <IRQ>  [<ffffffff810516c4>] lockdep_rcu_dereference+0xaa/0xb2
 [<ffffffff812ea4f6>] dev_queue_xmit+0x259/0x4dc
 [<ffffffff812ea3eb>] ? dev_queue_xmit+0x14e/0x4dc
 [<ffffffff81052324>] ? trace_hardirqs_on+0xd/0xf
 [<ffffffff81035362>] ? local_bh_enable_ip+0xbc/0xc1
 [<ffffffff812f0954>] neigh_resolve_output+0x24b/0x27c
 [<ffffffff8134f673>] ip6_output_finish+0x7c/0xb4
 [<ffffffff81350c34>] ip6_output2+0x256/0x261
 [<ffffffff81052324>] ? trace_hardirqs_on+0xd/0xf
 [<ffffffff813517fb>] ip6_output+0xbbc/0xbcb
 [<ffffffff8135bc5d>] ? fib6_force_start_gc+0x2b/0x2d
 [<ffffffff81368acb>] mld_sendpack+0x273/0x39d
 [<ffffffff81368858>] ? mld_sendpack+0x0/0x39d
 [<ffffffff81052099>] ? mark_held_locks+0x52/0x70
 [<ffffffff813692fc>] mld_ifc_timer_expire+0x24f/0x288
 [<ffffffff81039ed6>] run_timer_softirq+0x1ec/0x278
 [<ffffffff81039e65>] ? run_timer_softirq+0x17b/0x278
 [<ffffffff813690ad>] ? mld_ifc_timer_expire+0x0/0x288
 [<ffffffff81035531>] ? __do_softirq+0x69/0x140
 [<ffffffff8103556a>] __do_softirq+0xa2/0x140
 [<ffffffff81002e0c>] call_softirq+0x1c/0x28
 [<ffffffff81004b54>] do_softirq+0x38/0x80
 [<ffffffff81034f06>] irq_exit+0x45/0x47
 [<ffffffff810177c3>] smp_apic_timer_interrupt+0x88/0x96
 [<ffffffff810028d3>] apic_timer_interrupt+0x13/0x20
 <EOI>  [<ffffffff810488dd>] ? __atomic_notifier_call_chain+0x0/0x86
 [<ffffffff810096bf>] ? mwait_idle+0x6e/0x78
 [<ffffffff810096b6>] ? mwait_idle+0x65/0x78
 [<ffffffff810011cb>] cpu_idle+0x4d/0x83
 [<ffffffff81380b05>] rest_init+0xb9/0xc0
 [<ffffffff81380a4c>] ? rest_init+0x0/0xc0
 [<ffffffff8168dcf0>] start_kernel+0x392/0x39d
 [<ffffffff8168d2a3>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff8168d38b>] x86_64_start_kernel+0xe4/0xeb

An rcu_dereference() should be an rcu_dereference_bh().

Signed-off-by: David Howells <dhowells@redhat.com>
---

 net/core/dev.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 92584bf..f769098 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1990,7 +1990,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 				queue_index = skb_tx_hash(dev, skb);
 
 			if (sk) {
-				struct dst_entry *dst = rcu_dereference(sk->sk_dst_cache);
+				struct dst_entry *dst = rcu_dereference_bh(sk->sk_dst_cache);
 
 				if (dst && skb_dst(skb) == dst)
 					sk_tx_queue_set(sk, queue_index);


^ permalink raw reply related

* [PATCH] rps: optimize rps_get_cpu()
From: Changli Gao @ 2010-04-20  9:55 UTC (permalink / raw)
  To: David S. Miller; +Cc: Tom Herbert, Eric Dumazet, netdev, Changli Gao

optimize rps_get_cpu().

don't initialize ports when we can get the ports. one memory access for ports
than two.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 net/core/dev.c |   24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index b31d5d6..a281727 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2225,7 +2225,11 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	int cpu = -1;
 	u8 ip_proto;
 	u16 tcpu;
-	u32 addr1, addr2, ports, ihl;
+	u32 addr1, addr2, ihl;
+	union {
+		u32 v32;
+		u16 v16[2];
+	} ports;
 
 	if (skb_rx_queue_recorded(skb)) {
 		u16 index = skb_get_rx_queue(skb);
@@ -2271,7 +2275,6 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	default:
 		goto done;
 	}
-	ports = 0;
 	switch (ip_proto) {
 	case IPPROTO_TCP:
 	case IPPROTO_UDP:
@@ -2281,25 +2284,20 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	case IPPROTO_SCTP:
 	case IPPROTO_UDPLITE:
 		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
-			__be16 *hports = (__be16 *) (skb->data + (ihl * 4));
-			u32 sport, dport;
-
-			sport = (__force u16) hports[0];
-			dport = (__force u16) hports[1];
-			if (dport < sport)
-				swap(sport, dport);
-			ports = (sport << 16) + dport;
+			ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
+			if (ports.v16[0] < ports.v16[1])
+				swap(ports.v16[0], ports.v16[1]);
+			break;
 		}
-		break;
-
 	default:
+		ports.v32 = 0;
 		break;
 	}
 
 	/* get a consistent hash (same value on both flow directions) */
 	if (addr2 < addr1)
 		swap(addr1, addr2);
-	skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
 	if (!skb->rxhash)
 		skb->rxhash = 1;
 

^ permalink raw reply related

* Re: [PATCH] rps: send IPIs ASAP
From: Changli Gao @ 2010-04-20  9:17 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tom Herbert, David S. Miller, netdev
In-Reply-To: <1271742351.3845.106.camel@edumazet-laptop>

On Tue, Apr 20, 2010 at 1:45 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le lundi 19 avril 2010 à 22:15 -0700, Tom Herbert a écrit :
>> On Mon, Apr 19, 2010 at 9:08 PM, Changli Gao <xiaosuo@gmail.com> wrote:
>> > rps: send IPIs ASAP
>> >
>> > In order to reduce latency, we'd better send IPIs ASAP to schedule the
>> > corresponding NAPIs.
>> >
>> A design point of RPS is that we generate at most one IPI per CPU per
>> device interrupt, which at least offers some predictable coalescing.
>> With your changes, we would get at most one IPI per packet-- that
>> could represent a lot more of them.  Did you test this to see what the
>> impact is in this regard?
>>
>
> I agree with you Tom. Coalescing IPI is probably better.
>
> If the receiver CPU got a single packet in its RX handling, latency will
> be the same anyway.

I did the "ping -f" test again, and found that the differences of RTT
I got before were noises. It seems your "shortcut net_rps_action()"
patch eliminates the differences.

>
> If the receiver CPU got many packets, chance is high we are in a stress
> situation, and coalescing is a win in this case.
>
> I am currently testing a patch to call net_rps_action() at the beginning
> of process_backlog() (if we have a non null ipi_rps_list pointer)
>
> Will post a patch with bench results
>

It sounds like a better idea.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: emphasize rtnl lock required in call_netdevice_notifiers
From: David Miller @ 2010-04-20  8:45 UTC (permalink / raw)
  To: jpirko; +Cc: netdev, eric.dumazet
In-Reply-To: <20100420083729.GA2810@psychotron.lab.eng.brq.redhat.com>

From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 20 Apr 2010 10:37:30 +0200

> Since netdev_chain is guarded by rtnl_lock, ASSERT_RTNL should be present here
> to make sure that all callers of call_netdevice_notifiers does the locking
> properly.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

That seems right, applied, thanks Jiri!

^ permalink raw reply

* [PATCH net-next-2.6] net: emphasize rtnl lock required in call_netdevice_notifiers
From: Jiri Pirko @ 2010-04-20  8:37 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet

Since netdev_chain is guarded by rtnl_lock, ASSERT_RTNL should be present here
to make sure that all callers of call_netdevice_notifiers does the locking
properly.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/net/core/dev.c b/net/core/dev.c
index 05a2b29..a7f13a5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1435,6 +1435,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
 
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 {
+	ASSERT_RTNL();
 	return raw_notifier_call_chain(&netdev_chain, val, dev);
 }
 

^ permalink raw reply related

* Re: [PATCH] gianfar: Wait for both RX and TX to stop
From: David Miller @ 2010-04-20  8:18 UTC (permalink / raw)
  To: galak; +Cc: netdev, afleming
In-Reply-To: <68CA249E-C6E9-43EA-A132-C48DB9E2384D@kernel.crashing.org>

From: Kumar Gala <galak@kernel.crashing.org>
Date: Mon, 19 Apr 2010 23:44:49 -0500

> 
> On Apr 18, 2010, at 6:13 PM, Andy Fleming wrote:
> 
>> When gracefully stopping the controller, the driver was continuing if
>> *either* RX or TX had stopped.  We need to wait for both, or the
>> controller could get into an invalid state.
>> 
>> Signed-off-by: Andy Fleming <afleming@freescale.com>
>> ---
>> drivers/net/gianfar.c |    5 +++--
>> 1 files changed, 3 insertions(+), 2 deletions(-)
> 
> Acked-by: Kumar Gala <galak@kernel.crashing.org>
> 
> (please pick this up for 2.6.34, fixes an annoying bug).

I will do this tomorrow, thanks!

^ permalink raw reply

* Re: [PATCH net-next-2.6] rps: consistent rxhash
From: David Miller @ 2010-04-20  8:18 UTC (permalink / raw)
  To: eric.dumazet; +Cc: xiaosuo, therbert, netdev
In-Reply-To: <1271750198.3845.216.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 20 Apr 2010 09:56:38 +0200

> In case we compute a software skb->rxhash, we can generate a consistent
> hash : Its value will be the same in both flow directions.
> 
> This helps some workloads, like conntracking, since the same state needs
> to be accessed in both directions.
> 
> tbench + RFS + this patch gives better results than tbench with default
> kernel configuration (no RPS, no RFS)
> 
> Also fixed some sparse warnings.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next-2.6] rps: cleanups
From: David Miller @ 2010-04-20  8:18 UTC (permalink / raw)
  To: eric.dumazet; +Cc: xiaosuo, therbert, netdev
In-Reply-To: <1271747834.3845.206.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 20 Apr 2010 09:17:14 +0200

> Le lundi 19 avril 2010 à 13:21 -0700, David Miller a écrit :
> 
>> 
>> It is getting increasingly complicated to follow who enables and
>> disabled local cpu irqs in these code paths.  We could combat
>> this by adding something like "_irq_enable()" to the function
>> names.
> 
> Yes I agree, we need a general cleanup in this file
> 
> Thanks David !
> 
> [PATCH net-next-2.6] rps: cleanups
> 

Applied.

^ permalink raw reply

* [PATCH net-next-2.6] rps: consistent rxhash
From: Eric Dumazet @ 2010-04-20  7:56 UTC (permalink / raw)
  To: Changli Gao, David Miller; +Cc: therbert, netdev
In-Reply-To: <1271743164.3845.128.camel@edumazet-laptop>

In case we compute a software skb->rxhash, we can generate a consistent
hash : Its value will be the same in both flow directions.

This helps some workloads, like conntracking, since the same state needs
to be accessed in both directions.

tbench + RFS + this patch gives better results than tbench with default
kernel configuration (no RPS, no RFS)

Also fixed some sparse warnings.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/dev.c |   25 ++++++++++++++++++-------
 1 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 05a2b29..cb150ec 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1974,7 +1974,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 	if (skb->sk && skb->sk->sk_hash)
 		hash = skb->sk->sk_hash;
 	else
-		hash = skb->protocol;
+		hash = (__force u16) skb->protocol;
 
 	hash = jhash_1word(hash, hashrnd);
 
@@ -2253,8 +2253,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 
 		ip = (struct iphdr *) skb->data;
 		ip_proto = ip->protocol;
-		addr1 = ip->saddr;
-		addr2 = ip->daddr;
+		addr1 = (__force u32) ip->saddr;
+		addr2 = (__force u32) ip->daddr;
 		ihl = ip->ihl;
 		break;
 	case __constant_htons(ETH_P_IPV6):
@@ -2263,8 +2263,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 
 		ip6 = (struct ipv6hdr *) skb->data;
 		ip_proto = ip6->nexthdr;
-		addr1 = ip6->saddr.s6_addr32[3];
-		addr2 = ip6->daddr.s6_addr32[3];
+		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
+		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
 		ihl = (40 >> 2);
 		break;
 	default:
@@ -2279,14 +2279,25 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	case IPPROTO_AH:
 	case IPPROTO_SCTP:
 	case IPPROTO_UDPLITE:
-		if (pskb_may_pull(skb, (ihl * 4) + 4))
-			ports = *((u32 *) (skb->data + (ihl * 4)));
+		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
+			__be16 *hports = (__be16 *) (skb->data + (ihl * 4));
+			u32 sport, dport;
+
+			sport = (__force u16) hports[0];
+			dport = (__force u16) hports[1];
+			if (dport < sport)
+				swap(sport, dport);
+			ports = (sport << 16) + dport;
+		}
 		break;
 
 	default:
 		break;
 	}
 
+	/* get a consistent hash (same value on both flow directions) */
+	if (addr2 < addr1)
+		swap(addr1, addr2);
 	skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
 	if (!skb->rxhash)
 		skb->rxhash = 1;



^ permalink raw reply related

* [PATCH net-next-2.6] rps: cleanups
From: Eric Dumazet @ 2010-04-20  7:17 UTC (permalink / raw)
  To: David Miller; +Cc: xiaosuo, therbert, netdev
In-Reply-To: <20100419.132158.143863746.davem@davemloft.net>

Le lundi 19 avril 2010 à 13:21 -0700, David Miller a écrit :

> 
> It is getting increasingly complicated to follow who enables and
> disabled local cpu irqs in these code paths.  We could combat
> this by adding something like "_irq_enable()" to the function
> names.

Yes I agree, we need a general cleanup in this file

Thanks David !

[PATCH net-next-2.6] rps: cleanups

struct softnet_data holds many queues, so consistent use "sd" name
instead of "queue" is better.

Adds a rps_ipi_queued() helper to cleanup enqueue_to_backlog()

Adds a _and_irq_disable suffix to net_rps_action() name, as David
suggested.

incr_input_queue_head() becomes input_queue_head_incr()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |    4 
 net/core/dev.c            |  149 +++++++++++++++++++-----------------
 2 files changed, 82 insertions(+), 71 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 83ab3da..3c5ed5f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1401,10 +1401,10 @@ struct softnet_data {
 	struct napi_struct	backlog;
 };
 
-static inline void incr_input_queue_head(struct softnet_data *queue)
+static inline void input_queue_head_incr(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	queue->input_queue_head++;
+	sd->input_queue_head++;
 #endif
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 05a2b29..70df048 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -208,17 +208,17 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 }
 
-static inline void rps_lock(struct softnet_data *queue)
+static inline void rps_lock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	spin_lock(&queue->input_pkt_queue.lock);
+	spin_lock(&sd->input_pkt_queue.lock);
 #endif
 }
 
-static inline void rps_unlock(struct softnet_data *queue)
+static inline void rps_unlock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	spin_unlock(&queue->input_pkt_queue.lock);
+	spin_unlock(&sd->input_pkt_queue.lock);
 #endif
 }
 
@@ -2346,63 +2346,74 @@ done:
 }
 
 /* Called from hardirq (IPI) context */
-static void trigger_softirq(void *data)
+static void rps_trigger_softirq(void *data)
 {
-	struct softnet_data *queue = data;
-	__napi_schedule(&queue->backlog);
+	struct softnet_data *sd = data;
+
+	__napi_schedule(&sd->backlog);
 	__get_cpu_var(netdev_rx_stat).received_rps++;
 }
+
 #endif /* CONFIG_RPS */
 
 /*
+ * Check if this softnet_data structure is another cpu one
+ * If yes, queue it to our IPI list and return 1
+ * If no, return 0
+ */
+static int rps_ipi_queued(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
+
+	if (sd != mysd) {
+		sd->rps_ipi_next = mysd->rps_ipi_list;
+		mysd->rps_ipi_list = sd;
+
+		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+		return 1;
+	}
+#endif /* CONFIG_RPS */
+	return 0;
+}
+
+/*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
  */
 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 			      unsigned int *qtail)
 {
-	struct softnet_data *queue;
+	struct softnet_data *sd;
 	unsigned long flags;
 
-	queue = &per_cpu(softnet_data, cpu);
+	sd = &per_cpu(softnet_data, cpu);
 
 	local_irq_save(flags);
 	__get_cpu_var(netdev_rx_stat).total++;
 
-	rps_lock(queue);
-	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (queue->input_pkt_queue.qlen) {
+	rps_lock(sd);
+	if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
+		if (sd->input_pkt_queue.qlen) {
 enqueue:
-			__skb_queue_tail(&queue->input_pkt_queue, skb);
+			__skb_queue_tail(&sd->input_pkt_queue, skb);
 #ifdef CONFIG_RPS
-			*qtail = queue->input_queue_head +
-			    queue->input_pkt_queue.qlen;
+			*qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
 #endif
-			rps_unlock(queue);
+			rps_unlock(sd);
 			local_irq_restore(flags);
 			return NET_RX_SUCCESS;
 		}
 
 		/* Schedule NAPI for backlog device */
-		if (napi_schedule_prep(&queue->backlog)) {
-#ifdef CONFIG_RPS
-			if (cpu != smp_processor_id()) {
-				struct softnet_data *myqueue;
-
-				myqueue = &__get_cpu_var(softnet_data);
-				queue->rps_ipi_next = myqueue->rps_ipi_list;
-				myqueue->rps_ipi_list = queue;
-
-				__raise_softirq_irqoff(NET_RX_SOFTIRQ);
-				goto enqueue;
-			}
-#endif
-			__napi_schedule(&queue->backlog);
+		if (napi_schedule_prep(&sd->backlog)) {
+			if (!rps_ipi_queued(sd))
+				__napi_schedule(&sd->backlog);
 		}
 		goto enqueue;
 	}
 
-	rps_unlock(queue);
+	rps_unlock(sd);
 
 	__get_cpu_var(netdev_rx_stat).dropped++;
 	local_irq_restore(flags);
@@ -2903,17 +2914,17 @@ EXPORT_SYMBOL(netif_receive_skb);
 static void flush_backlog(void *arg)
 {
 	struct net_device *dev = arg;
-	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
 	struct sk_buff *skb, *tmp;
 
-	rps_lock(queue);
-	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
+	rps_lock(sd);
+	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
 		if (skb->dev == dev) {
-			__skb_unlink(skb, &queue->input_pkt_queue);
+			__skb_unlink(skb, &sd->input_pkt_queue);
 			kfree_skb(skb);
-			incr_input_queue_head(queue);
+			input_queue_head_incr(sd);
 		}
-	rps_unlock(queue);
+	rps_unlock(sd);
 }
 
 static int napi_gro_complete(struct sk_buff *skb)
@@ -3219,23 +3230,23 @@ EXPORT_SYMBOL(napi_gro_frags);
 static int process_backlog(struct napi_struct *napi, int quota)
 {
 	int work = 0;
-	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
 
 	napi->weight = weight_p;
 	do {
 		struct sk_buff *skb;
 
 		local_irq_disable();
-		rps_lock(queue);
-		skb = __skb_dequeue(&queue->input_pkt_queue);
+		rps_lock(sd);
+		skb = __skb_dequeue(&sd->input_pkt_queue);
 		if (!skb) {
 			__napi_complete(napi);
-			rps_unlock(queue);
+			rps_unlock(sd);
 			local_irq_enable();
 			break;
 		}
-		incr_input_queue_head(queue);
-		rps_unlock(queue);
+		input_queue_head_incr(sd);
+		rps_unlock(sd);
 		local_irq_enable();
 
 		__netif_receive_skb(skb);
@@ -3331,24 +3342,25 @@ EXPORT_SYMBOL(netif_napi_del);
  * net_rps_action sends any pending IPI's for rps.
  * Note: called with local irq disabled, but exits with local irq enabled.
  */
-static void net_rps_action(void)
+static void net_rps_action_and_irq_disable(void)
 {
 #ifdef CONFIG_RPS
-	struct softnet_data *locqueue = &__get_cpu_var(softnet_data);
-	struct softnet_data *remqueue = locqueue->rps_ipi_list;
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	struct softnet_data *remsd = sd->rps_ipi_list;
 
-	if (remqueue) {
-		locqueue->rps_ipi_list = NULL;
+	if (remsd) {
+		sd->rps_ipi_list = NULL;
 
 		local_irq_enable();
 
 		/* Send pending IPI's to kick RPS processing on remote cpus. */
-		while (remqueue) {
-			struct softnet_data *next = remqueue->rps_ipi_next;
-			if (cpu_online(remqueue->cpu))
-				__smp_call_function_single(remqueue->cpu,
-							   &remqueue->csd, 0);
-			remqueue = next;
+		while (remsd) {
+			struct softnet_data *next = remsd->rps_ipi_next;
+
+			if (cpu_online(remsd->cpu))
+				__smp_call_function_single(remsd->cpu,
+							   &remsd->csd, 0);
+			remsd = next;
 		}
 	} else
 #endif
@@ -3423,7 +3435,7 @@ static void net_rx_action(struct softirq_action *h)
 		netpoll_poll_unlock(have);
 	}
 out:
-	net_rps_action();
+	net_rps_action_and_irq_disable();
 
 #ifdef CONFIG_NET_DMA
 	/*
@@ -5595,7 +5607,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
-		incr_input_queue_head(oldsd);
+		input_queue_head_incr(oldsd);
 	}
 
 	return NOTIFY_OK;
@@ -5812,24 +5824,23 @@ static int __init net_dev_init(void)
 	 */
 
 	for_each_possible_cpu(i) {
-		struct softnet_data *queue;
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
 
-		queue = &per_cpu(softnet_data, i);
-		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->completion_queue = NULL;
-		INIT_LIST_HEAD(&queue->poll_list);
+		skb_queue_head_init(&sd->input_pkt_queue);
+		sd->completion_queue = NULL;
+		INIT_LIST_HEAD(&sd->poll_list);
 
 #ifdef CONFIG_RPS
-		queue->csd.func = trigger_softirq;
-		queue->csd.info = queue;
-		queue->csd.flags = 0;
-		queue->cpu = i;
+		sd->csd.func = rps_trigger_softirq;
+		sd->csd.info = sd;
+		sd->csd.flags = 0;
+		sd->cpu = i;
 #endif
 
-		queue->backlog.poll = process_backlog;
-		queue->backlog.weight = weight_p;
-		queue->backlog.gro_list = NULL;
-		queue->backlog.gro_count = 0;
+		sd->backlog.poll = process_backlog;
+		sd->backlog.weight = weight_p;
+		sd->backlog.gro_list = NULL;
+		sd->backlog.gro_count = 0;
 	}
 
 	dev_boot_phase = 0;



^ permalink raw reply related

* Re: [PATCH v5] rfs: Receive Flow Steering
From: Eric Dumazet @ 2010-04-20  5:59 UTC (permalink / raw)
  To: Changli Gao; +Cc: David Miller, therbert, netdev
In-Reply-To: <y2u412e6f7f1004191638mee9206dfoab7482bbff83e38d@mail.gmail.com>

Le mardi 20 avril 2010 à 07:38 +0800, Changli Gao a écrit :

> Does this problem has relationship with your patch? No. If the rxhash
> isn't provided by hardware, we can get more throughput from you patch,
> and on the other side, we don't lose anything but potential more hash
> collision.
> 

I am not sure what you call hash collision. There is no hash chain here.

This 32bit hash is a jhash one, and we only need 1 to 12 bits in it, I
am pretty sure its OK.




^ permalink raw reply

* Re: [PATCH net-next-2.6] rps: shortcut net_rps_action()
From: Eric Dumazet @ 2010-04-20  5:55 UTC (permalink / raw)
  To: Changli Gao; +Cc: David Miller, Tom Herbert, netdev
In-Reply-To: <p2j412e6f7f1004191732p41094214yf82d2755f9081d1d@mail.gmail.com>

Le mardi 20 avril 2010 à 08:32 +0800, Changli Gao a écrit :

> Oh, I read the code again and got the answer. After the IPI is sent,
> this softnet will be queued by the other CPUs. We prefetch the pointer
> rps_ipi_next to avoid this race condition.
> 

Speaking of prefetch business,

I partly tested following patch, I will submit it if it happens to be a
clear win.

diff --git a/net/core/dev.c b/net/core/dev.c
index 05a2b29..fe6fc9f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2349,7 +2349,9 @@ done:
 static void trigger_softirq(void *data)
 {
 	struct softnet_data *queue = data;
+
 	__napi_schedule(&queue->backlog);
+	prefetch(queue->input_pkt_queue.next);
 	__get_cpu_var(netdev_rx_stat).received_rps++;
 }
 #endif /* CONFIG_RPS */



^ permalink raw reply related

* Re: [PATCH] rps: send IPIs ASAP
From: Eric Dumazet @ 2010-04-20  5:45 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Changli Gao, David S. Miller, netdev
In-Reply-To: <g2y65634d661004192215oc06d9c43xb2a7c7d946149cf5@mail.gmail.com>

Le lundi 19 avril 2010 à 22:15 -0700, Tom Herbert a écrit :
> On Mon, Apr 19, 2010 at 9:08 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> > rps: send IPIs ASAP
> >
> > In order to reduce latency, we'd better send IPIs ASAP to schedule the
> > corresponding NAPIs.
> >
> A design point of RPS is that we generate at most one IPI per CPU per
> device interrupt, which at least offers some predictable coalescing.
> With your changes, we would get at most one IPI per packet-- that
> could represent a lot more of them.  Did you test this to see what the
> impact is in this regard?
> 

I agree with you Tom. Coalescing IPI is probably better.

If the receiver CPU got a single packet in its RX handling, latency will
be the same anyway.

If the receiver CPU got many packets, chance is high we are in a stress
situation, and coalescing is a win in this case.

I am currently testing a patch to call net_rps_action() at the beginning
of process_backlog() (if we have a non null ipi_rps_list pointer)

Will post a patch with bench results




^ permalink raw reply

* Re: [PATCH] rps: send IPIs ASAP
From: David Miller @ 2010-04-20  5:39 UTC (permalink / raw)
  To: therbert; +Cc: xiaosuo, eric.dumazet, netdev
In-Reply-To: <g2y65634d661004192215oc06d9c43xb2a7c7d946149cf5@mail.gmail.com>

From: Tom Herbert <therbert@google.com>
Date: Mon, 19 Apr 2010 22:15:58 -0700

> Did you test this to see what the impact is in this regard?

Changli it would help immensely if you posted performance
test results along with your changes.

I can see quite obviously that this change will completely undo the
intentional batching of IPIs done by RPS.  IPIs are expensive and we
should batch things as much as possible here.

^ permalink raw reply

* Re: [PATCH] rps: send IPIs ASAP
From: Tom Herbert @ 2010-04-20  5:15 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, Eric Dumazet, netdev
In-Reply-To: <1271736519-2991-1-git-send-email-xiaosuo@gmail.com>

On Mon, Apr 19, 2010 at 9:08 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> rps: send IPIs ASAP
>
> In order to reduce latency, we'd better send IPIs ASAP to schedule the
> corresponding NAPIs.
>
A design point of RPS is that we generate at most one IPI per CPU per
device interrupt, which at least offers some predictable coalescing.
With your changes, we would get at most one IPI per packet-- that
could represent a lot more of them.  Did you test this to see what the
impact is in this regard?


> For NAPI drivers, we send IPIs immediately, and for the others, we defer them
> to NET_RX_SOFTIRQ. In this patch, we move net_rps_action() to the beginning of
> net_rx_action() to emulate a softirq with the higher priority than
> NET_RX_SOFTIRQ.
>
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
>  net/core/dev.c |   23 ++++++++++++++++++++---
>  1 file changed, 20 insertions(+), 3 deletions(-)
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 05a2b29..d8fca21 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2363,6 +2363,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
>  {
>        struct softnet_data *queue;
>        unsigned long flags;
> +#ifdef CONFIG_RPS
> +       bool ni = !in_irq();
> +       bool ipi = false;
> +#endif
>
>        queue = &per_cpu(softnet_data, cpu);
>
> @@ -2380,6 +2384,10 @@ enqueue:
>  #endif
>                        rps_unlock(queue);
>                        local_irq_restore(flags);
> +#ifdef CONFIG_RPS
> +                       if (ipi)
> +                               __smp_call_function_single(cpu, &queue->csd, 0);
> +#endif
>                        return NET_RX_SUCCESS;
>                }
>
> @@ -2389,6 +2397,11 @@ enqueue:
>                        if (cpu != smp_processor_id()) {
>                                struct softnet_data *myqueue;
>
> +                               if (ni) {
> +                                       ipi = true;
> +                                       goto enqueue;
> +                               }
> +
>                                myqueue = &__get_cpu_var(softnet_data);
>                                queue->rps_ipi_next = myqueue->rps_ipi_list;
>                                myqueue->rps_ipi_list = queue;
> @@ -3337,6 +3350,7 @@ static void net_rps_action(void)
>        struct softnet_data *locqueue = &__get_cpu_var(softnet_data);
>        struct softnet_data *remqueue = locqueue->rps_ipi_list;
>
> +       local_irq_disable();
>        if (remqueue) {
>                locqueue->rps_ipi_list = NULL;
>
> @@ -3350,9 +3364,10 @@ static void net_rps_action(void)
>                                                           &remqueue->csd, 0);
>                        remqueue = next;
>                }
> -       } else
> -#endif
> +       } else {
>                local_irq_enable();
> +       }
> +#endif
>  }
>
>  static void net_rx_action(struct softirq_action *h)
> @@ -3362,6 +3377,8 @@ static void net_rx_action(struct softirq_action *h)
>        int budget = netdev_budget;
>        void *have;
>
> +       net_rps_action();
> +
>        local_irq_disable();
>
>        while (!list_empty(list)) {
> @@ -3423,7 +3440,7 @@ static void net_rx_action(struct softirq_action *h)
>                netpoll_poll_unlock(have);
>        }
>  out:
> -       net_rps_action();
> +       local_irq_enable();
>
>  #ifdef CONFIG_NET_DMA
>        /*
>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox