[PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit
@ 2014-07-23  6:37 Alex Gartrell
  2014-07-23  6:37 ` [PATCH ipvs 2/2] ipvs: only perform slow checksum on NF_INET_LOCAL_OUT Alex Gartrell
  2014-07-23  8:25 ` [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit Julian Anastasov
  0 siblings, 2 replies; 10+ messages in thread
From: Alex Gartrell @ 2014-07-23  6:37 UTC (permalink / raw)
  To: horms; +Cc: ja, lvs-devel, agartrell, kernel-team

The combination of NF_INET_LOCAL_OUT and hardware checksum offload results
in picking the packet out of the send path before it gets proper checksums.
Our NICs (most NICs?) cannot do the checksumming in ipip packets, so these
faulty tcp/udp headers arrive at their destination and are discarded.

Instead, checksum the tcp/udp packets in the tunnel transmit path.  Right
now we do this for all packets, but a subsequent patch will limit this to
the NF_INET_LOCAL_OUT hook.

Signed-off-by: Alex Gartrell <agartrell@fb.com>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 49 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6f70bdd..e9b5e6e 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -37,6 +37,7 @@
 #include <net/icmp.h>                   /* for icmp_send */
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
+#include <net/ip6_checksum.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
 #include <linux/icmpv6.h>
@@ -824,6 +825,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int ret, local;
+	unsigned int l4_len = 0;
 
 	EnterFunction(10);
 
@@ -862,6 +864,29 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ip_hdr(skb);
 	}
 
+	{
+		/* ipip breaks layer 4 checksumming on many (all?) NICs, so
+		 * we must do it ourselves instead of relying upon checksum
+		 * offload */
+		l4_len = ntohs(old_iph->tot_len) - (old_iph->ihl * 4);
+		switch (old_iph->protocol) {
+		case IPPROTO_TCP:
+			tcp_hdr(skb)->check = 0;
+			tcp_hdr(skb)->check = tcp_v4_check(
+				l4_len, old_iph->saddr, old_iph->daddr,
+				csum_partial(tcp_hdr(skb), l4_len, 0));
+			break;
+		case IPPROTO_UDP:
+			udp_hdr(skb)->check = 0;
+			udp_hdr(skb)->check = udp_v4_check(
+				l4_len, old_iph->saddr, old_iph->daddr,
+				csum_partial(udp_hdr(skb), l4_len, 0));
+			break;
+		default:
+			break;
+		}
+	}
+
 	skb->transport_header = skb->network_header;
 
 	/* fix old IP header checksum */
@@ -918,6 +943,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct ipv6hdr  *iph;		/* Our new IP header */
 	unsigned int max_headroom;	/* The extra header space needed */
 	int ret, local;
+	unsigned int l4_len = 0;
 
 	EnterFunction(10);
 
@@ -953,6 +979,29 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ipv6_hdr(skb);
 	}
 
+	{
+		/* ipip breaks layer 4 checksumming on many (all?) NICs, so
+		 * we must do it ourselves instead of relying upon checksum
+		 * offload */
+		l4_len = ntohs(old_iph->payload_len);
+		switch (old_iph->nexthdr) {
+		case IPPROTO_TCP:
+			tcp_hdr(skb)->check = 0;
+			tcp_hdr(skb)->check = tcp_v6_check(
+				l4_len, &old_iph->saddr, &old_iph->daddr,
+				csum_partial(tcp_hdr(skb), l4_len, 0));
+			break;
+		case IPPROTO_UDP:
+			udp_hdr(skb)->check = 0;
+			udp_hdr(skb)->check = udp_v6_check(
+				l4_len, &old_iph->saddr, &old_iph->daddr,
+				csum_partial(udp_hdr(skb), l4_len, 0));
+			break;
+		default:
+			break;
+		}
+	}
+
 	skb->transport_header = skb->network_header;
 
 	skb_push(skb, sizeof(struct ipv6hdr));
-- 
1.8.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH ipvs 2/2] ipvs: only perform slow checksum on NF_INET_LOCAL_OUT
  2014-07-23  6:37 [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit Alex Gartrell
@ 2014-07-23  6:37 ` Alex Gartrell
  2014-07-23  8:25 ` [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit Julian Anastasov
  1 sibling, 0 replies; 10+ messages in thread
From: Alex Gartrell @ 2014-07-23  6:37 UTC (permalink / raw)
  To: horms; +Cc: ja, lvs-devel, agartrell, kernel-team

In order to know when we may need to tcp checksum, we need to propagate the
hooknum through packet_xmit as well as through conn_schedule (as it may
call ip_vs_leave, which itself invokes packet_xmit).

Signed-off-by: Alex Gartrell <agartrell@fb.com>
---
 include/net/ip_vs.h                     | 36 ++++++++++++++++++++++-----------
 net/netfilter/ipvs/ip_vs_core.c         | 10 +++++----
 net/netfilter/ipvs/ip_vs_proto_ah_esp.c |  2 +-
 net/netfilter/ipvs/ip_vs_proto_sctp.c   |  5 +++--
 net/netfilter/ipvs/ip_vs_proto_tcp.c    |  5 +++--
 net/netfilter/ipvs/ip_vs_proto_udp.c    |  5 +++--
 net/netfilter/ipvs/ip_vs_xmit.c         | 35 ++++++++++++++++++++------------
 7 files changed, 62 insertions(+), 36 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 624a8a5..a31b435 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -448,7 +448,8 @@ struct ip_vs_protocol {
 	int (*conn_schedule)(int af, struct sk_buff *skb,
 			     struct ip_vs_proto_data *pd,
 			     int *verdict, struct ip_vs_conn **cpp,
-			     struct ip_vs_iphdr *iph);
+			     struct ip_vs_iphdr *iph,
+			     int hooknum);
 
 	struct ip_vs_conn *
 	(*conn_in_get)(int af,
@@ -566,7 +567,8 @@ struct ip_vs_conn {
 	   NF_ACCEPT can be returned when destination is local.
 	 */
 	int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
-			   struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+			   struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+			   int hooknum);
 
 	/* Note: we can group the following members into a structure,
 	   in order to save more space, and the following members are
@@ -1371,7 +1373,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	       struct ip_vs_proto_data *pd, int *ignored,
 	       struct ip_vs_iphdr *iph);
 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
-		struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph);
+		struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph,
+		int hooknum);
 
 void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg);
 
@@ -1439,15 +1442,20 @@ void ip_vs_read_estimator(struct ip_vs_stats_user *dst,
  *	Various IPVS packet transmitters (from ip_vs_xmit.c)
  */
 int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		    struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+		    struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+		    int hooknum);
 int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		      struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+		      struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+		      int hooknum);
 int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		   struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+		   struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+		   int hooknum);
 int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		      struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+		      struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+		      int hooknum);
 int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+		  int hooknum);
 int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		    struct ip_vs_protocol *pp, int offset,
 		    unsigned int hooknum, struct ip_vs_iphdr *iph);
@@ -1455,13 +1463,17 @@ void ip_vs_dest_dst_rcu_free(struct rcu_head *head);
 
 #ifdef CONFIG_IP_VS_IPV6
 int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-			 struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+			 struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+			 int hooknum);
 int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		      struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+		      struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+		      int hooknum);
 int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-			 struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+			 struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+			 int hooknum);
 int ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph,
+		     int hooknum);
 int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		       struct ip_vs_protocol *pp, int offset,
 		       unsigned int hooknum, struct ip_vs_iphdr *iph);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index e683675..613a125 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -507,7 +507,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
  *  no destination is available for a new connection.
  */
 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
-		struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
+		struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph,
+		int hooknum)
 {
 	__be16 _ports[2], *pptr;
 #ifdef CONFIG_SYSCTL
@@ -564,7 +565,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 		ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
 
 		/* transmit the first SYN packet */
-		ret = cp->packet_xmit(skb, cp, pd->pp, iph);
+		ret = cp->packet_xmit(skb, cp, pd->pp, iph, hooknum);
 		/* do not touch skb anymore */
 
 		atomic_inc(&cp->in_pkts);
@@ -1635,6 +1636,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	pd = ip_vs_proto_data_get(net, iph.protocol);
 	if (unlikely(!pd))
 		return NF_ACCEPT;
+
 	pp = pd->pp;
 	/*
 	 * Check if the packet belongs to an existing connection entry
@@ -1656,7 +1658,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 		int v;
 
 		/* Schedule and create new connection entry into &cp */
-		if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
+		if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph, hooknum))
 			return v;
 	}
 
@@ -1692,7 +1694,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	ip_vs_in_stats(cp, skb);
 	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
 	if (cp->packet_xmit)
-		ret = cp->packet_xmit(skb, cp, pp, &iph);
+		ret = cp->packet_xmit(skb, cp, pp, &iph, hooknum);
 		/* do not touch skb anymore */
 	else {
 		IP_VS_DBG_RL("warning: packet_xmit is null");
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 5de3dd3..169eaa3 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -109,7 +109,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
 static int
 ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		     int *verdict, struct ip_vs_conn **cpp,
-		     struct ip_vs_iphdr *iph)
+		     struct ip_vs_iphdr *iph, int hooknum)
 {
 	/*
 	 * AH/ESP is only related traffic. Pass the packet to IP stack.
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 2f7ea75..aec76c9 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -11,7 +11,7 @@
 static int
 sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		   int *verdict, struct ip_vs_conn **cpp,
-		   struct ip_vs_iphdr *iph)
+		   struct ip_vs_iphdr *iph, int hooknum)
 {
 	struct net *net;
 	struct ip_vs_service *svc;
@@ -56,7 +56,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
 		if (!*cpp && ignored <= 0) {
 			if (!ignored)
-				*verdict = ip_vs_leave(svc, skb, pd, iph);
+				*verdict = ip_vs_leave(svc, skb, pd, iph,
+						       hooknum);
 			else
 				*verdict = NF_DROP;
 			rcu_read_unlock();
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index e3a6972..1baf90d 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -34,7 +34,7 @@
 static int
 tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		  int *verdict, struct ip_vs_conn **cpp,
-		  struct ip_vs_iphdr *iph)
+		  struct ip_vs_iphdr *iph, int hooknum)
 {
 	struct net *net;
 	struct ip_vs_service *svc;
@@ -72,7 +72,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
 		if (!*cpp && ignored <= 0) {
 			if (!ignored)
-				*verdict = ip_vs_leave(svc, skb, pd, iph);
+				*verdict = ip_vs_leave(svc, skb, pd, iph,
+						       hooknum);
 			else
 				*verdict = NF_DROP;
 			rcu_read_unlock();
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index b62a3c0..9eeb752 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -31,7 +31,7 @@
 static int
 udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		  int *verdict, struct ip_vs_conn **cpp,
-		  struct ip_vs_iphdr *iph)
+		  struct ip_vs_iphdr *iph, int hooknum)
 {
 	struct net *net;
 	struct ip_vs_service *svc;
@@ -67,7 +67,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
 		if (!*cpp && ignored <= 0) {
 			if (!ignored)
-				*verdict = ip_vs_leave(svc, skb, pd, iph);
+				*verdict = ip_vs_leave(svc, skb, pd, iph,
+						       hooknum);
 			else
 				*verdict = NF_DROP;
 			rcu_read_unlock();
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index e9b5e6e..91bf1d5 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -535,7 +535,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
  */
 int
 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh,
+		int hooknum)
 {
 	/* we do not touch skb and do not need pskb ptr */
 	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
@@ -549,7 +550,8 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
  */
 int
 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh,
+		  int hooknum)
 {
 	struct iphdr  *iph = ip_hdr(skb);
 
@@ -581,7 +583,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 #ifdef CONFIG_IP_VS_IPV6
 int
 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh,
+		     int hooknum)
 {
 	EnterFunction(10);
 
@@ -613,7 +616,8 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
  */
 int
 ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh,
+	       int hooknum)
 {
 	struct rtable *rt;		/* Route to the other host */
 	int local, rc, was_input;
@@ -703,7 +707,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 #ifdef CONFIG_IP_VS_IPV6
 int
 ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh,
+		  int hooknum)
 {
 	struct rt6_info *rt;		/* Route to the other host */
 	int local, rc;
@@ -813,7 +818,8 @@ tx_error:
  */
 int
 ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh,
+		  int hooknum)
 {
 	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
 	struct rtable *rt;			/* Route to the other host */
@@ -864,7 +870,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ip_hdr(skb);
 	}
 
-	{
+	if (hooknum == NF_INET_LOCAL_OUT) {
 		/* ipip breaks layer 4 checksumming on many (all?) NICs, so
 		 * we must do it ourselves instead of relying upon checksum
 		 * offload */
@@ -934,7 +940,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 #ifdef CONFIG_IP_VS_IPV6
 int
 ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh,
+		     int hooknum)
 {
 	struct rt6_info *rt;		/* Route to the other host */
 	struct in6_addr saddr;		/* Source for tunnel */
@@ -979,7 +986,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ipv6_hdr(skb);
 	}
 
-	{
+	if (hooknum == NF_INET_LOCAL_OUT) {
 		/* ipip breaks layer 4 checksumming on many (all?) NICs, so
 		 * we must do it ourselves instead of relying upon checksum
 		 * offload */
@@ -1051,7 +1058,8 @@ tx_error:
  */
 int
 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh,
+	      int hooknum)
 {
 	int local;
 
@@ -1090,7 +1098,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 #ifdef CONFIG_IP_VS_IPV6
 int
 ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh,
+		 int hooknum)
 {
 	int local;
 
@@ -1147,7 +1156,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	   translate address/port back */
 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
 		if (cp->packet_xmit)
-			rc = cp->packet_xmit(skb, cp, pp, iph);
+			rc = cp->packet_xmit(skb, cp, pp, iph, hooknum);
 		else
 			rc = NF_ACCEPT;
 		/* do not touch skb anymore */
@@ -1239,7 +1248,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	   translate address/port back */
 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
 		if (cp->packet_xmit)
-			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
+			rc = cp->packet_xmit(skb, cp, pp, ipvsh, hooknum);
 		else
 			rc = NF_ACCEPT;
 		/* do not touch skb anymore */
-- 
1.8.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit
  2014-07-23  6:37 [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit Alex Gartrell
  2014-07-23  6:37 ` [PATCH ipvs 2/2] ipvs: only perform slow checksum on NF_INET_LOCAL_OUT Alex Gartrell
@ 2014-07-23  8:25 ` Julian Anastasov
  2014-07-23 19:16   ` Alex Gartrell
  1 sibling, 1 reply; 10+ messages in thread
From: Julian Anastasov @ 2014-07-23  8:25 UTC (permalink / raw)
  To: Alex Gartrell; +Cc: horms, lvs-devel, kernel-team


	Hello,

On Tue, 22 Jul 2014, Alex Gartrell wrote:

> The combination of NF_INET_LOCAL_OUT and hardware checksum offload results
> in picking the packet out of the send path before it gets proper checksums.
> Our NICs (most NICs?) cannot do the checksumming in ipip packets, so these
> faulty tcp/udp headers arrive at their destination and are discarded.
> 
> Instead, checksum the tcp/udp packets in the tunnel transmit path.  Right
> now we do this for all packets, but a subsequent patch will limit this to
> the NF_INET_LOCAL_OUT hook.

	After the support for hardware-offloaded encapsulation
went in we may lack some support, like playing with skb->encapsulation, 
etc. I'm not sure calculating the checksums in all
cases (ip_summed) is the right thing to do.

	For DR and TUN we tried to avoid checking and
calculating checksum because we do not mangle the payload.
But CHECKSUM_PARTIAL in ip_summed is a special case.
We call skb_forward_csum() from ip_vs_tunnel_xmit_prepare
but may be it is not enough. Such interesting case is
local TCP client (CHECKSUM_PARTIAL) forwarded via TUN.

	ipip.c and ip_tunnel_core.c are good source of
information. I think, we can check what is needed
to complete the partial checksum while adding IPIP header.
For example, calling skb_checksum_help() may be
enough for the CHECKSUM_PARTIAL case. See
iptunnel_handle_offloads() for reference.

	Can you give more information, eg. what
value you see in ip_summed, leading to these problems?
And whether packet to IPVS comes from local stack or
from remote client. Is TCP/UDP checksum offload running?

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit
  2014-07-23  8:25 ` [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit Julian Anastasov
@ 2014-07-23 19:16   ` Alex Gartrell
  2014-07-23 22:54     ` Julian Anastasov
  0 siblings, 1 reply; 10+ messages in thread
From: Alex Gartrell @ 2014-07-23 19:16 UTC (permalink / raw)
  To: Julian Anastasov
  Cc: horms@verge.net.au, lvs-devel@vger.kernel.org, kernel-team

Thank you for the review!

> 
> 	Hello,
> 
> On Tue, 22 Jul 2014, Alex Gartrell wrote:
> 
> > The combination of NF_INET_LOCAL_OUT and hardware checksum offload
> > results in picking the packet out of the send path before it gets
> > proper checksums. Our NICs (most NICs?) cannot do the checksumming
> > in ipip packets, so these faulty tcp/udp headers arrive at their
> > destination and are discarded.
> > 
> > Instead, checksum the tcp/udp packets in the tunnel transmit path.
> > Right now we do this for all packets, but a subsequent patch will
> > limit this to the NF_INET_LOCAL_OUT hook.
> 
> 	After the support for hardware-offloaded encapsulation
> went in we may lack some support, like playing with
> skb->encapsulation, etc. I'm not sure calculating the checksums in all
> cases (ip_summed) is the right thing to do.
> 
> 	For DR and TUN we tried to avoid checking and
> calculating checksum because we do not mangle the payload.
> But CHECKSUM_PARTIAL in ip_summed is a special case.
> We call skb_forward_csum() from ip_vs_tunnel_xmit_prepare
> but may be it is not enough. Such interesting case is
> local TCP client (CHECKSUM_PARTIAL) forwarded via TUN.

Yes, it appears that skb_checksum_help was the magic function I was hoping
for.  I'll put up a new patch that invokes this function, as it seems to
fix the problem by itself (and by checking the ip_summed field, it .

> 	Can you give more information, eg. what
> value you see in ip_summed, leading to these problems?
> And whether packet to IPVS comes from local stack or
> from remote client. Is TCP/UDP checksum offload running?
> 

Three tests:
* From a remote server: CHECKSUM_UNNECESSARY
* From the local server with hardware checksum offload enabled:
  CHECKSUM_PARTIAL
* From the local server with hardware checksum offload disabled:
  CHECKSUM_PARTIAL

I was mildly surprised by the last one.  In general I think that ip_summed
is a better way to figure out whether or not we should checksum though.



In any event, given that the skb_checksum_helper function is way more
likely to "just work," I'll put out a patch that adds this invocation
instead.


Thanks,
Alex

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit
  2014-07-23 19:16   ` Alex Gartrell
@ 2014-07-23 22:54     ` Julian Anastasov
  2014-07-24  6:22       ` Julian Anastasov
  0 siblings, 1 reply; 10+ messages in thread
From: Julian Anastasov @ 2014-07-23 22:54 UTC (permalink / raw)
  To: Alex Gartrell; +Cc: horms@verge.net.au, lvs-devel@vger.kernel.org, kernel-team


	Hello,

On Wed, 23 Jul 2014, Alex Gartrell wrote:

> Three tests:
> * From a remote server: CHECKSUM_UNNECESSARY
> * From the local server with hardware checksum offload enabled:
>   CHECKSUM_PARTIAL
> * From the local server with hardware checksum offload disabled:
>   CHECKSUM_PARTIAL
> 
> I was mildly surprised by the last one.  In general I think that ip_summed
> is a better way to figure out whether or not we should checksum though.
> 
> 
> 
> In any event, given that the skb_checksum_helper function is way more
> likely to "just work," I'll put out a patch that adds this invocation
> instead.

	There is skb_checksum_help() call in
dev_hard_start_xmit(), it is used when TCP/UDP offload is
not supported. After checking the code I see only the
CHECKSUM_PARTIAL + Enabled TCP/UDP CSum as a problem because
the drivers use the skb->encapsulation flag to know where the
TCP/UDP header resides. As tunnels prepend tunnel header
they should set skb->encapsulation=1 together with
calling skb_reset_inner_headers() before the header is
inserted.

	Following is a patch for the net tree that needs
testing because I don't have setup to fully test it.
I hope you can try it on your setup for the 3 tests.
It should have effect only on your 2nd test. Do you
have problems with tests 1 and 3?

[PATCH net] ipvs: properly declare tunnel encapsulation

The tunneling method should properly use tunnel encapsulation.
Fixes problem with CHECKSUM_PARTIAL packets when TCP/UDP csum
offload is supported and skb->encapsulation is not set to 1.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 73ba1cc..46b43ec 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -38,6 +38,7 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/ip_tunnels.h>
 #include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
@@ -483,10 +484,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
 	skb->ipvs_property = 1;
 	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
 		ret = ip_vs_confirm_conntrack(skb);
-	if (ret == NF_ACCEPT) {
+	if (ret == NF_ACCEPT)
 		nf_reset(skb);
-		skb_forward_csum(skb);
-	}
 	return ret;
 }
 
@@ -867,9 +866,14 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* fix old IP header checksum */
 	ip_send_check(old_iph);
 
+	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	if (IS_ERR(skb))
+		goto tx_error_unlock;
+
 	skb_push(skb, sizeof(struct iphdr));
 	skb_reset_network_header(skb);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	skb_clear_hash(skb);
 
 	/*
 	 *	Push down and install the IPIP header.
@@ -901,6 +905,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
   tx_error:
 	kfree_skb(skb);
+
+tx_error_unlock:
 	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -955,6 +961,12 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	skb->transport_header = skb->network_header;
 
+	if (!skb->encapsulation) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+	skb_forward_csum(skb);
+
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-- 
1.9.0

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* RE: [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit
  2014-07-23 22:54     ` Julian Anastasov
@ 2014-07-24  6:22       ` Julian Anastasov
  2014-07-24 22:29         ` Alex Gartrell
  0 siblings, 1 reply; 10+ messages in thread
From: Julian Anastasov @ 2014-07-24  6:22 UTC (permalink / raw)
  To: Alex Gartrell; +Cc: horms@verge.net.au, lvs-devel@vger.kernel.org, kernel-team


	Hello,

On Thu, 24 Jul 2014, Julian Anastasov wrote:

> 	There is skb_checksum_help() call in
> dev_hard_start_xmit(), it is used when TCP/UDP offload is
> not supported. After checking the code I see only the
> CHECKSUM_PARTIAL + Enabled TCP/UDP CSum as a problem because
> the drivers use the skb->encapsulation flag to know where the
> TCP/UDP header resides. As tunnels prepend tunnel header
> they should set skb->encapsulation=1 together with
> calling skb_reset_inner_headers() before the header is
> inserted.
> 
> 	Following is a patch for the net tree that needs
> testing because I don't have setup to fully test it.
> I hope you can try it on your setup for the 3 tests.
> It should have effect only on your 2nd test. Do you
> have problems with tests 1 and 3?

	I just saw that changing skb->transport_header
before skb_reset_inner_headers() and iptunnel_handle_offloads()
can cause problems. Not sure if skb->transport_header
is used later when skb->encapsulation = 1, the strange thing
is that ip6_tnl_xmit2() changes it before skb_reset_inner_headers(),
this can lead to wrong skb->inner_transport_header.

[PATCH net] ipvs: properly declare tunnel encapsulation

The tunneling method should properly use tunnel encapsulation.
Fixes problem with CHECKSUM_PARTIAL packets when TCP/UDP csum
offload is supported and skb->encapsulation is not set to 1.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 73ba1cc..e093580 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -38,6 +38,7 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/ip_tunnels.h>
 #include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
@@ -483,10 +484,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
 	skb->ipvs_property = 1;
 	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
 		ret = ip_vs_confirm_conntrack(skb);
-	if (ret == NF_ACCEPT) {
+	if (ret == NF_ACCEPT)
 		nf_reset(skb);
-		skb_forward_csum(skb);
-	}
 	return ret;
 }
 
@@ -862,14 +861,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ip_hdr(skb);
 	}
 
-	skb->transport_header = skb->network_header;
-
 	/* fix old IP header checksum */
 	ip_send_check(old_iph);
 
+	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	if (IS_ERR(skb))
+		goto tx_error_unlock;
+
+	skb->transport_header = skb->network_header;
+
 	skb_push(skb, sizeof(struct iphdr));
 	skb_reset_network_header(skb);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	skb_clear_hash(skb);
 
 	/*
 	 *	Push down and install the IPIP header.
@@ -901,6 +905,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
   tx_error:
 	kfree_skb(skb);
+
+tx_error_unlock:
 	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -953,6 +959,12 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ipv6_hdr(skb);
 	}
 
+	if (!skb->encapsulation) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+	skb_forward_csum(skb);
+
 	skb->transport_header = skb->network_header;
 
 	skb_push(skb, sizeof(struct ipv6hdr));
-- 
1.9.0

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit
  2014-07-24  6:22       ` Julian Anastasov
@ 2014-07-24 22:29         ` Alex Gartrell
  2014-07-25  4:30           ` Julian Anastasov
  0 siblings, 1 reply; 10+ messages in thread
From: Alex Gartrell @ 2014-07-24 22:29 UTC (permalink / raw)
  To: Julian Anastasov
  Cc: horms@verge.net.au, lvs-devel@vger.kernel.org, kernel-team

Hey,

On Thu, 24 Jul 2014 09:22:19 +0300
Julian Anastasov <ja@ssi.bg> wrote:

> 
> 	Hello,
> 
> On Thu, 24 Jul 2014, Julian Anastasov wrote:
> 
> > 	There is skb_checksum_help() call in
> > dev_hard_start_xmit(), it is used when TCP/UDP offload is
> > not supported. After checking the code I see only the
> > CHECKSUM_PARTIAL + Enabled TCP/UDP CSum as a problem because
> > the drivers use the skb->encapsulation flag to know where the
> > TCP/UDP header resides. As tunnels prepend tunnel header
> > they should set skb->encapsulation=1 together with
> > calling skb_reset_inner_headers() before the header is
> > inserted.
> > 
> > 	Following is a patch for the net tree that needs
> > testing because I don't have setup to fully test it.
> > I hope you can try it on your setup for the 3 tests.
> > It should have effect only on your 2nd test. Do you
> > have problems with tests 1 and 3?
> 
> 	I just saw that changing skb->transport_header
> before skb_reset_inner_headers() and iptunnel_handle_offloads()
> can cause problems. Not sure if skb->transport_header
> is used later when skb->encapsulation = 1, the strange thing
> is that ip6_tnl_xmit2() changes it before skb_reset_inner_headers(),
> this can lead to wrong skb->inner_transport_header.
> 
> [PATCH net] ipvs: properly declare tunnel encapsulation
> 
> The tunneling method should properly use tunnel encapsulation.
> Fixes problem with CHECKSUM_PARTIAL packets when TCP/UDP csum
> offload is supported and skb->encapsulation is not set to 1.
> 
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> ---
>  net/netfilter/ipvs/ip_vs_xmit.c | 22 +++++++++++++++++-----
>  1 file changed, 17 insertions(+), 5 deletions(-)
> 
> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c
> b/net/netfilter/ipvs/ip_vs_xmit.c index 73ba1cc..e093580 100644
> --- a/net/netfilter/ipvs/ip_vs_xmit.c
> +++ b/net/netfilter/ipvs/ip_vs_xmit.c
> @@ -38,6 +38,7 @@
>  #include <net/route.h>                  /* for ip_route_output */
>  #include <net/ipv6.h>
>  #include <net/ip6_route.h>
> +#include <net/ip_tunnels.h>
>  #include <net/addrconf.h>
>  #include <linux/icmpv6.h>
>  #include <linux/netfilter.h>
> @@ -483,10 +484,8 @@ static inline int
> ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, skb->ipvs_property = 1;
>  	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
>  		ret = ip_vs_confirm_conntrack(skb);
> -	if (ret == NF_ACCEPT) {
> +	if (ret == NF_ACCEPT)
>  		nf_reset(skb);
> -		skb_forward_csum(skb);
> -	}
>  	return ret;
>  }
>  
> @@ -862,14 +861,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct
> ip_vs_conn *cp, old_iph = ip_hdr(skb);
>  	}
>  
> -	skb->transport_header = skb->network_header;
> -
>  	/* fix old IP header checksum */
>  	ip_send_check(old_iph);
>  
> +	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
> +	if (IS_ERR(skb))
> +		goto tx_error_unlock;
> +
> +	skb->transport_header = skb->network_header;
> +
>  	skb_push(skb, sizeof(struct iphdr));
>  	skb_reset_network_header(skb);
>  	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
> +	skb_clear_hash(skb);
>  
>  	/*
>  	 *	Push down and install the IPIP header.
> @@ -901,6 +905,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct
> ip_vs_conn *cp, 
>    tx_error:
>  	kfree_skb(skb);
> +
> +tx_error_unlock:
>  	rcu_read_unlock();
>  	LeaveFunction(10);
>  	return NF_STOLEN;
> @@ -953,6 +959,12 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct
> ip_vs_conn *cp, old_iph = ipv6_hdr(skb);
>  	}
>  
> +	if (!skb->encapsulation) {
> +		skb_reset_inner_headers(skb);
> +		skb->encapsulation = 1;
> +	}
> +	skb_forward_csum(skb);
> +
>  	skb->transport_header = skb->network_header;
>  
>  	skb_push(skb, sizeof(struct ipv6hdr));

So I've found a patch that addresses the problem in what I suspect is the
right way.

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6f70bdd..ea7ef5e 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -486,6 +486,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
        if (ret == NF_ACCEPT) {
                nf_reset(skb);
                skb_forward_csum(skb);
+               skb->encapsulation = 1;
        }
        return ret;
 }

Empirically, this solves my problem :) I believe it solves the problem more
generally as well.

Digging deeper into the v6 case (FB <3's v6)

ip_vs_tunnel_xmit_v6
-> ip6_local_out
-> __ip6_local_out
-> dst_output
-> dst_output_sk
-> skb_dst(skb)->output (= ip6_output)
-> ip6_finish_output
-> ip6->finish_output2
-> neigh_direct_output
-> dev_queue_xmit
-> __dev_queue_xmit
-> dev_hard_start

And then we run into this:

        /* If encapsulation offload request, verify we are testing
         * hardware encapsulation features instead of standard
         * features for the netdev
         */
        if (skb->encapsulation)
                features &= dev->hw_enc_features;

This essentially strips out NETIF_F_ALL_CSUM, so we end up invoking
skb_checksum_help below.


        /* If packet is not checksummed and device does not
         * support checksumming for this protocol, complete
         * checksumming here.
         */
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (skb->encapsulation)
                        skb_set_inner_transport_header(skb,
                                skb_checksum_start_offset(skb));
                else
                        skb_set_transport_header(skb,
                                skb_checksum_start_offset(skb));
                if (!(features & NETIF_F_ALL_CSUM) &&
                     skb_checksum_help(skb))
                        goto out_kfree_skb;
        }

Does this seem like a reasonable approach to you, Julian?

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit
  2014-07-24 22:29         ` Alex Gartrell
@ 2014-07-25  4:30           ` Julian Anastasov
  2014-07-25  7:40             ` Alex Gartrell
  0 siblings, 1 reply; 10+ messages in thread
From: Julian Anastasov @ 2014-07-25  4:30 UTC (permalink / raw)
  To: Alex Gartrell; +Cc: horms@verge.net.au, lvs-devel@vger.kernel.org, kernel-team


	Hello,

On Thu, 24 Jul 2014, Alex Gartrell wrote:

> So I've found a patch that addresses the problem in what I suspect is the
> right way.
> 
> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
> index 6f70bdd..ea7ef5e 100644
> --- a/net/netfilter/ipvs/ip_vs_xmit.c
> +++ b/net/netfilter/ipvs/ip_vs_xmit.c
> @@ -486,6 +486,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
>         if (ret == NF_ACCEPT) {
>                 nf_reset(skb);
>                 skb_forward_csum(skb);
> +               skb->encapsulation = 1;
>         }
>         return ret;
>  }
> 
> Empirically, this solves my problem :) I believe it solves the problem more
> generally as well.
> 
> Digging deeper into the v6 case (FB <3's v6)
> 
> ip_vs_tunnel_xmit_v6
> -> ip6_local_out
> -> __ip6_local_out
> -> dst_output
> -> dst_output_sk
> -> skb_dst(skb)->output (= ip6_output)
> -> ip6_finish_output
> -> ip6->finish_output2
> -> neigh_direct_output
> -> dev_queue_xmit
> -> __dev_queue_xmit
> -> dev_hard_start
> 
> And then we run into this:
> 
>         /* If encapsulation offload request, verify we are testing
>          * hardware encapsulation features instead of standard
>          * features for the netdev
>          */
>         if (skb->encapsulation)
>                 features &= dev->hw_enc_features;
> 
> This essentially strips out NETIF_F_ALL_CSUM, so we end up invoking
> skb_checksum_help below.

	For the case with missing NETIF_F_ALL_CSUM bits
skb_checksum_help() does not care for skb->encapsulation,
it works only by checking skb->csum_* fields. The
skb_set_inner_transport_header() and skb_set_transport_header()
calls simply update the header pointers for the drivers
that have NETIF_F_ALL_CSUM bits set.

	What is your driver? I have to check why do you
have problem when NETIF_F_ALL_CSUM is not set. Is it
missing the NETIF_F_IP_CSUM (v4) and NETIF_F_IPV6_CSUM (v6) bits?

	Also, can you clarify again which test has
the problem with broken csums? All 3 tests? Or it
depends on enabled HW csums?

>         /* If packet is not checksummed and device does not
>          * support checksumming for this protocol, complete
>          * checksumming here.
>          */
>         if (skb->ip_summed == CHECKSUM_PARTIAL) {
>                 if (skb->encapsulation)
>                         skb_set_inner_transport_header(skb,
>                                 skb_checksum_start_offset(skb));
>                 else
>                         skb_set_transport_header(skb,
>                                 skb_checksum_start_offset(skb));
>                 if (!(features & NETIF_F_ALL_CSUM) &&
>                      skb_checksum_help(skb))
>                         goto out_kfree_skb;
>         }
> 
> Does this seem like a reasonable approach to you, Julian?

	Exactly, I was referring to this place. For IPv4
we do the same as your change in ip_vs_tunnel_xmit_prepare,
only that it is via iptunnel_handle_offloads() which in
addition to setting skb->encapsulation adds SKB_GSO_IPIP bit
for GSO purposes but skb_checksum_help() is called again
in dev_hard_start_xmit().

	Don't forget that the devices with NETIF_F_ALL_CSUM set
need skb_reset_inner_headers() call, they use the
skb->inner_* fields. What I have done in the patch is
just to copy the handling from ip*_tunnel*.c. When
NETIF_F_ALL_CSUM is not set we call skb_checksum_help() and
then the device driver detects CHECKSUM_NONE, not
CHECKSUM_PARTIAL.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit
  2014-07-25  4:30           ` Julian Anastasov
@ 2014-07-25  7:40             ` Alex Gartrell
  2014-07-25  8:53               ` Julian Anastasov
  0 siblings, 1 reply; 10+ messages in thread
From: Alex Gartrell @ 2014-07-25  7:40 UTC (permalink / raw)
  To: Julian Anastasov
  Cc: horms@verge.net.au, lvs-devel@vger.kernel.org, kernel-team

Hey

On 7/24/14, 9:30 PM, "Julian Anastasov" <ja@ssi.bg> wrote:

>
>	Hello,
>
>On Thu, 24 Jul 2014, Alex Gartrell wrote:
>
>> So I've found a patch that addresses the problem in what I suspect is
>>the
>> right way.
>> 
>> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c
>>b/net/netfilter/ipvs/ip_vs_xmit.c
>> index 6f70bdd..ea7ef5e 100644
>> --- a/net/netfilter/ipvs/ip_vs_xmit.c
>> +++ b/net/netfilter/ipvs/ip_vs_xmit.c
>> @@ -486,6 +486,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct
>>sk_buff *skb,
>>         if (ret == NF_ACCEPT) {
>>                 nf_reset(skb);
>>                 skb_forward_csum(skb);
>> +               skb->encapsulation = 1;
>>         }
>>         return ret;
>>  }
>> 
>> Empirically, this solves my problem :) I believe it solves the problem
>>more
>> generally as well.
>> 
>> Digging deeper into the v6 case (FB <3's v6)
>> 
>> ip_vs_tunnel_xmit_v6
>> -> ip6_local_out
>> -> __ip6_local_out
>> -> dst_output
>> -> dst_output_sk
>> -> skb_dst(skb)->output (= ip6_output)
>> -> ip6_finish_output
>> -> ip6->finish_output2
>> -> neigh_direct_output
>> -> dev_queue_xmit
>> -> __dev_queue_xmit
>> -> dev_hard_start
>> 
>> And then we run into this:
>> 
>>         /* If encapsulation offload request, verify we are testing
>>          * hardware encapsulation features instead of standard
>>          * features for the netdev
>>          */
>>         if (skb->encapsulation)
>>                 features &= dev->hw_enc_features;
>> 
>> This essentially strips out NETIF_F_ALL_CSUM, so we end up invoking
>> skb_checksum_help below.
>
>	For the case with missing NETIF_F_ALL_CSUM bits
>skb_checksum_help() does not care for skb->encapsulation,
>it works only by checking skb->csum_* fields. The
>skb_set_inner_transport_header() and skb_set_transport_header()
>calls simply update the header pointers for the drivers
>that have NETIF_F_ALL_CSUM bits set.
>
>	What is your driver? I have to check why do you
>have problem when NETIF_F_ALL_CSUM is not set. Is it
>missing the NETIF_F_IP_CSUM (v4) and NETIF_F_IPV6_CSUM (v6) bits?

The driver in question is mlx4_en version 2.2-1

In this case, we have MLX4_TUNNEL_OFFLOAD_MODE turned off, so
hw_enc_features is set to 0.


>	Also, can you clarify again which test has
>the problem with broken csums? All 3 tests? Or it
>depends on enabled HW csums?

We run into problems with packets captured from NF_INET_LOCAL_OUT when
we have hardware checksumming enabled.  Disabling hardware checksumming
fixes the problem by removing the bits from the features and makes it
act roughly the same as just setting the encapsulation bit

From my reading, the pseudo code for it is basically as follows

Features = device_features # which has a bunch of stuff, including full
checksumming

If skb->encapsulated:
    Features = device_encapsulated_features # which is zero

If needs_gso(skb, features) # requires that gso params are set and that
the features have it
    # gso segment stuff
Else
    If skb->ip_summed == CHECKSUM_PARTIAL: # This is us
         If skb->encapsulation:
             skb_set_inner_transport_header(skb,
skb_checksum_start_offset(skb))
         Else
             set_transport_header(skb, skb_checksum_start_offset(skb))
         If !(features & NETIF_F_ALL_CSUM): # Basically equal to
skb->encapsulation for us
             If skb_checksum_help(skb) # will do the software checksumming
we need
                    # do error stuff


>
>>         /* If packet is not checksummed and device does not
>>          * support checksumming for this protocol, complete
>>          * checksumming here.
>>          */
>>         if (skb->ip_summed == CHECKSUM_PARTIAL) {
>>                 if (skb->encapsulation)
>>                         skb_set_inner_transport_header(skb,
>>                                 skb_checksum_start_offset(skb));
>>                 else
>>                         skb_set_transport_header(skb,
>>                                 skb_checksum_start_offset(skb));
>>                 if (!(features & NETIF_F_ALL_CSUM) &&
>>                      skb_checksum_help(skb))
>>                         goto out_kfree_skb;
>>         }
>> 
>> Does this seem like a reasonable approach to you, Julian?
>
>	Exactly, I was referring to this place. For IPv4
>we do the same as your change in ip_vs_tunnel_xmit_prepare,
>only that it is via iptunnel_handle_offloads() which in
>addition to setting skb->encapsulation adds SKB_GSO_IPIP bit
>for GSO purposes but skb_checksum_help() is called again
>in dev_hard_start_xmit().
>
>	Don't forget that the devices with NETIF_F_ALL_CSUM set
>need skb_reset_inner_headers() call, they use the
>skb->inner_* fields. What I have done in the patch is
>just to copy the handling from ip*_tunnel*.c. When
>NETIF_F_ALL_CSUM is not set we call skb_checksum_help() and
>then the device driver detects CHECKSUM_NONE, not
>CHECKSUM_PARTIAL.


Taking a second look, I think that the problem with my approach is
that it won¹t take advantage of gso offload for ipip encapsulation.
Strictly speaking, I don¹t think we need it (the volume of ipvs
Traffic originating from the host itself is almost always going to
be negligible).  That said, taking advantage is clearly the right
way to do it.

With that in mind, my only problem with your patch is that ipv4 is
inconsistent with ipv6.  Can we do something like this?  (Apologies,
I¹m stuck with outlook at home and it is ruining the formatting)  The
specific change is that we use the same iptunnel_handle_offloads
Invocation in ipv4 and ipv6.  It¹s disconcerting to me that it¹s unclear
if we can do IP6IP6 with gso offload ‹ any thoughts there?

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c
b/net/netfilter/ipvs/ip_vs_xmit.c
index 6f70bdd..b62a23e 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -37,6 +37,7 @@
 #include <net/icmp.h>                   /* for icmp_send */
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
+#include <net/ip_tunnels.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
 #include <linux/icmpv6.h>
@@ -862,6 +863,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct
ip_vs_conn *cp,
 		old_iph = ip_hdr(skb);
 	}
 
+	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	if (IS_ERR(skb))
+		goto tx_error;
+
 	skb->transport_header = skb->network_header;
 
 	/* fix old IP header checksum */
@@ -900,7 +905,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct
ip_vs_conn *cp,
 	return NF_STOLEN;
 
   tx_error:
-	kfree_skb(skb);
+	if (!IS_ERR(skb))
+		kfree_skb(skb);
 	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -953,6 +959,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct
ip_vs_conn *cp,
 		old_iph = ipv6_hdr(skb);
 	}
 
+	skb = iptunnel_handle_offloads(skb, false, 0);
+	if (IS_ERR(skb))
+		goto tx_error;
+
 	skb->transport_header = skb->network_header;
 
 	skb_push(skb, sizeof(struct ipv6hdr));
@@ -988,7 +998,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct
ip_vs_conn *cp,
 	return NF_STOLEN;
 
 tx_error:
-	kfree_skb(skb);
+	if (!IS_ERR(skb))
+		kfree_skb(skb);
 	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;





^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit
  2014-07-25  7:40             ` Alex Gartrell
@ 2014-07-25  8:53               ` Julian Anastasov
  0 siblings, 0 replies; 10+ messages in thread
From: Julian Anastasov @ 2014-07-25  8:53 UTC (permalink / raw)
  To: Alex Gartrell; +Cc: horms@verge.net.au, lvs-devel@vger.kernel.org, kernel-team

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3135 bytes --]


	Hello,

On Fri, 25 Jul 2014, Alex Gartrell wrote:

> The driver in question is mlx4_en version 2.2-1
> 
> In this case, we have MLX4_TUNNEL_OFFLOAD_MODE turned off, so
> hw_enc_features is set to 0.

	ok

> >	Also, can you clarify again which test has
> >the problem with broken csums? All 3 tests? Or it
> >depends on enabled HW csums?
> 
> We run into problems with packets captured from NF_INET_LOCAL_OUT when
> we have hardware checksumming enabled.  Disabling hardware checksumming
> fixes the problem by removing the bits from the features and makes it
> act roughly the same as just setting the encapsulation bit

	ok, so it matches my understanding. I was
afraid you have problems with HW csum disabled.

> >From my reading, the pseudo code for it is basically as follows
> 
> Features = device_features # which has a bunch of stuff, including full
> checksumming
> 
> If skb->encapsulated:
>     Features = device_encapsulated_features # which is zero
> 
> If needs_gso(skb, features) # requires that gso params are set and that
> the features have it
>     # gso segment stuff
> Else
>     If skb->ip_summed == CHECKSUM_PARTIAL: # This is us
>          If skb->encapsulation:
>              skb_set_inner_transport_header(skb,
> skb_checksum_start_offset(skb))
>          Else
>              set_transport_header(skb, skb_checksum_start_offset(skb))
>          If !(features & NETIF_F_ALL_CSUM): # Basically equal to
> skb->encapsulation for us
>              If skb_checksum_help(skb) # will do the software checksumming
> we need
>                     # do error stuff

	I understand all this, that is the reason for
my patch.

> Taking a second look, I think that the problem with my approach is
> that it won¹t take advantage of gso offload for ipip encapsulation.
> Strictly speaking, I don¹t think we need it (the volume of ipvs
> Traffic originating from the host itself is almost always going to
> be negligible).  That said, taking advantage is clearly the right
> way to do it.
> 
> With that in mind, my only problem with your patch is that ipv4 is
> inconsistent with ipv6.  Can we do something like this?  (Apologies,
> I¹m stuck with outlook at home and it is ruining the formatting)  The
> specific change is that we use the same iptunnel_handle_offloads
> Invocation in ipv4 and ipv6.  It¹s disconcerting to me that it¹s unclear
> if we can do IP6IP6 with gso offload ‹ any thoughts there?

	That is my doubt too. iptunnel_handle_offloads is
called only for IPv4 tunnels, even in net/ipv6/. OTOH,
ip6_tnl_xmit2() does everything directly, that is why
I choosed the same thing for IPv6. I don't see value to use
instead of SKB_GSO_IPIP. Looking at ipv4_offload_init()
for IPv4 there is offload support for TCP, UDP and IPIP.
ipv6_offload_init() shows support for TCP, UDP,
via ipv6_exthdrs_offload_init for IPPROTO_ROUTING and
IPPROTO_DSTOPTS), sit_offload (IPPROTO_IPV6). So, it
may be possible to use iptunnel_handle_offloads for IPv6.

	We may need help from experts in this area. So,
I'll post the patch as RFC on netdev after little
research for GRO/GSO.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2014-07-25  8:53 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-07-23  6:37 [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit Alex Gartrell
2014-07-23  6:37 ` [PATCH ipvs 2/2] ipvs: only perform slow checksum on NF_INET_LOCAL_OUT Alex Gartrell
2014-07-23  8:25 ` [PATCH ipvs 1/2] ipvs: Do tcp/udp checksumming prior to tunnel xmit Julian Anastasov
2014-07-23 19:16   ` Alex Gartrell
2014-07-23 22:54     ` Julian Anastasov
2014-07-24  6:22       ` Julian Anastasov
2014-07-24 22:29         ` Alex Gartrell
2014-07-25  4:30           ` Julian Anastasov
2014-07-25  7:40             ` Alex Gartrell
2014-07-25  8:53               ` Julian Anastasov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.