* [PATCH net-next V3 1/3] net-next: use five-tuple hash for sk_txhash
2017-12-01 23:31 [PATCH net-next V3 0/3] net: fix flowlabel inconsistency in reset packet Shaohua Li
@ 2017-12-01 23:31 ` Shaohua Li
2017-12-01 23:31 ` [PATCH net-next V3 2/3] net-next: copy user configured flowlabel to reset packet Shaohua Li
2017-12-01 23:31 ` [PATCH net-next V3 3/3] net: add a sysctl to make auto flowlabel consistent Shaohua Li
2 siblings, 0 replies; 7+ messages in thread
From: Shaohua Li @ 2017-12-01 23:31 UTC (permalink / raw)
To: netdev, davem; +Cc: kafai, eric.dumazet, flo, xiyou.wangcong, tom, Shaohua Li
From: Shaohua Li <shli@fb.com>
We are using sk_txhash to calculate flowlabel, but sk_txhash isn't
always available, for example, in inet_timewait_sock. This causes
problem for reset packet, which will have a different flowlabel. This
causes our router doesn't correctly close tcp connection. We are using
flowlabel to do load balance. Routers in the path maintain connection
state. So if flow label changes, the packet is routed through a
different router. In this case, the old router doesn't get the reset
packet to close the tcp connection.
Per Tom's suggestion, we switch back to five-tuple hash, so we can
reconstruct correct flowlabel for reset packet.
At most places, we already have the flowi info, so we directly use it
build sk_txhash. For synack, we do this after route search. At that
time, we have the flowi info ready, so don't need to create the flowi
info again.
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Florent Fourcot <flo@fourcot.fr>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Tom Herbert <tom@herbertland.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
include/linux/tcp.h | 5 +++++
include/net/sock.h | 17 ++++++-----------
include/net/tcp.h | 2 +-
net/ipv4/datagram.c | 2 +-
net/ipv4/syncookies.c | 4 +++-
net/ipv4/tcp_input.c | 1 -
net/ipv4/tcp_ipv4.c | 18 +++++++++++++-----
net/ipv4/tcp_output.c | 1 -
net/ipv6/datagram.c | 4 +++-
net/ipv6/syncookies.c | 3 ++-
net/ipv6/tcp_ipv6.c | 19 ++++++++++++++-----
11 files changed, 48 insertions(+), 28 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index df5d97a..227e8b2 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -139,6 +139,11 @@ struct tcp_request_sock {
*/
};
+static inline void tcp_rsk_set_txhash(struct tcp_request_sock *rsk, u32 hash)
+{
+ rsk->txhash = hash;
+}
+
static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
{
return (struct tcp_request_sock *)req;
diff --git a/include/net/sock.h b/include/net/sock.h
index 79e1a2c..b9cb9d2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1729,22 +1729,17 @@ static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
}
-static inline u32 net_tx_rndhash(void)
+static inline void sk_set_txhash(struct sock *sk, u32 hash)
{
- u32 v = prandom_u32();
-
- return v ?: 1;
-}
-
-static inline void sk_set_txhash(struct sock *sk)
-{
- sk->sk_txhash = net_tx_rndhash();
+ sk->sk_txhash = hash;
}
static inline void sk_rethink_txhash(struct sock *sk)
{
- if (sk->sk_txhash)
- sk_set_txhash(sk);
+ if (sk->sk_txhash) {
+ u32 v = prandom_u32();
+ sk->sk_txhash = v ?: 1;
+ }
}
static inline struct dst_entry *
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4e09398..a5c28be 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1840,7 +1840,7 @@ struct tcp_request_sock_ops {
__u16 *mss);
#endif
struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
- const struct request_sock *req);
+ struct request_sock *req);
u32 (*init_seq)(const struct sk_buff *skb);
u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f915abf..1f2f9fc 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -74,7 +74,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
- sk_set_txhash(sk);
+ sk_set_txhash(sk, get_hash_from_flowi4(fl4));
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index fda37f2..ecf6e7a 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -335,7 +335,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
treq->ts_off = 0;
- treq->txhash = net_tx_rndhash();
req->mss = mss;
ireq->ir_num = ntohs(th->dest);
ireq->ir_rmt_port = th->source;
@@ -376,6 +375,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+
+ tcp_rsk_set_txhash(treq, get_hash_from_flowi4(&fl4));
+
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
reqsk_free(req);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 734cfc8..e886c28 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6288,7 +6288,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
tcp_rsk(req)->snt_isn = isn;
- tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c6bc0c4..8c47870 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -222,7 +222,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err)
goto failure;
- sk_set_txhash(sk);
+ sk_set_txhash(sk, get_hash_from_flowi4(fl4));
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
inet->inet_sport, inet->inet_dport, sk);
@@ -871,8 +871,12 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct sk_buff *skb;
/* First, grab a route. */
- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
- return -1;
+ if (!dst) {
+ if ((dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+ return -1;
+
+ tcp_rsk_set_txhash(tcp_rsk(req), get_hash_from_flowi4(&fl4));
+ }
skb = tcp_make_synack(sk, dst, req, foc, synack_type);
@@ -1278,9 +1282,13 @@ static void tcp_v4_init_req(struct request_sock *req,
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
struct flowi *fl,
- const struct request_sock *req)
+ struct request_sock *req)
{
- return inet_csk_route_req(sk, &fl->u.ip4, req);
+ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
+ if (dst)
+ tcp_rsk_set_txhash(tcp_rsk(req),
+ get_hash_from_flowi4(&fl->u.ip4));
+ return dst;
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c..94e56cb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3752,7 +3752,6 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
struct flowi fl;
int res;
- tcp_rsk(req)->txhash = net_tx_rndhash();
res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index a1f9187..003af09 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -150,6 +150,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_type;
int err;
__be32 fl6_flowlabel = 0;
+ struct flowi6 fl6;
if (usin->sin6_family == AF_INET) {
if (__ipv6_only_sock(sk))
@@ -260,7 +261,8 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
}
sk->sk_state = TCP_ESTABLISHED;
- sk_set_txhash(sk);
+ ip6_datagram_flow_key_init(&fl6, sk);
+ sk_set_txhash(sk, get_hash_from_flowi6(&fl6));
out:
return err;
}
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index e7a3a6b..6897aca 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -216,7 +216,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
treq->ts_off = 0;
- treq->txhash = net_tx_rndhash();
/*
* We need to lookup the dst_entry to get the correct window size.
@@ -238,6 +237,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+ tcp_rsk_set_txhash(treq, get_hash_from_flowi6(&fl6));
+
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst))
goto out_free;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6bb98c9..1e4ce06 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -286,7 +286,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (err)
goto late_failure;
- sk_set_txhash(sk);
+ sk_set_txhash(sk, get_hash_from_flowi6(&fl6));
if (likely(!tp->repair)) {
if (!tp->write_seq)
@@ -470,9 +470,13 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
int err = -ENOMEM;
/* First, grab a route. */
- if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
+ if (!dst) {
+ if ((dst = inet6_csk_route_req(sk, fl6, req,
IPPROTO_TCP)) == NULL)
- goto done;
+ goto done;
+
+ tcp_rsk_set_txhash(tcp_rsk(req), get_hash_from_flowi6(fl6));
+ }
skb = tcp_make_synack(sk, dst, req, foc, synack_type);
@@ -743,9 +747,14 @@ static void tcp_v6_init_req(struct request_sock *req,
static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
struct flowi *fl,
- const struct request_sock *req)
+ struct request_sock *req)
{
- return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP);
+ struct dst_entry *dst;
+ dst = inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP);
+ if (dst)
+ tcp_rsk_set_txhash(tcp_rsk(req),
+ get_hash_from_flowi6(&fl->u.ip6));
+ return dst;
}
struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
--
2.9.5
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH net-next V3 2/3] net-next: copy user configured flowlabel to reset packet
2017-12-01 23:31 [PATCH net-next V3 0/3] net: fix flowlabel inconsistency in reset packet Shaohua Li
2017-12-01 23:31 ` [PATCH net-next V3 1/3] net-next: use five-tuple hash for sk_txhash Shaohua Li
@ 2017-12-01 23:31 ` Shaohua Li
2017-12-01 23:31 ` [PATCH net-next V3 3/3] net: add a sysctl to make auto flowlabel consistent Shaohua Li
2 siblings, 0 replies; 7+ messages in thread
From: Shaohua Li @ 2017-12-01 23:31 UTC (permalink / raw)
To: netdev, davem; +Cc: kafai, eric.dumazet, flo, xiyou.wangcong, tom, Shaohua Li
From: Shaohua Li <shli@fb.com>
Reset packet doesn't use user configured flowlabel, instead, it always
uses 0. This will cause inconsistency for flowlabel. tw sock already
records flowlabel info, so we can directly use it.
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Florent Fourcot <flo@fourcot.fr>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Tom Herbert <tom@herbertland.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
net/ipv6/tcp_ipv6.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1e4ce06..b8383be 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -902,6 +902,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
struct sock *sk1 = NULL;
#endif
int oif = 0;
+ u8 tclass = 0;
+ __be32 flowlabel = 0;
if (th->rst)
return;
@@ -955,7 +957,21 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
trace_tcp_send_reset(sk, skb);
}
- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
+ if (sk) {
+ if (sk_fullsock(sk)) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ tclass = np->tclass;
+ flowlabel = np->flow_label & IPV6_FLOWLABEL_MASK;
+ } else {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+
+ tclass = tw->tw_tclass;
+ flowlabel = cpu_to_be32(tw->tw_flowlabel);
+ }
+ }
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1,
+ tclass, flowlabel);
#ifdef CONFIG_TCP_MD5SIG
out:
--
2.9.5
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH net-next V3 3/3] net: add a sysctl to make auto flowlabel consistent
2017-12-01 23:31 [PATCH net-next V3 0/3] net: fix flowlabel inconsistency in reset packet Shaohua Li
2017-12-01 23:31 ` [PATCH net-next V3 1/3] net-next: use five-tuple hash for sk_txhash Shaohua Li
2017-12-01 23:31 ` [PATCH net-next V3 2/3] net-next: copy user configured flowlabel to reset packet Shaohua Li
@ 2017-12-01 23:31 ` Shaohua Li
2017-12-02 1:14 ` Tom Herbert
` (2 more replies)
2 siblings, 3 replies; 7+ messages in thread
From: Shaohua Li @ 2017-12-01 23:31 UTC (permalink / raw)
To: netdev, davem; +Cc: kafai, eric.dumazet, flo, xiyou.wangcong, tom, Shaohua Li
From: Shaohua Li <shli@fb.com>
Currently if there is negative routing, we change sock's txhash, so the
sock will have a different flowlabel and route to different path.
According to Tom, we'd better to have option to enable this, because some
routers require flowlabel consistent. By default, we maintain consistent
flowlabel, eg, negative routing doesn't change flowlabel.
Suggested-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
Documentation/networking/ip-sysctl.txt | 7 +++++++
include/net/netns/ipv6.h | 1 +
include/net/sock.h | 28 +++++++++++++++-------------
net/ipv6/af_inet6.c | 1 +
net/ipv6/sysctl_net_ipv6.c | 8 ++++++++
5 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 46c7e10..14132a0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1345,6 +1345,13 @@ auto_flowlabels - INTEGER
be disabled by the socket option
Default: 1
+consistent_auto_flowlabel - BOOLEAN
+ When auto_flowlabels is enabled, this option makes socket flowlabel
+ consistent in the lifetime.
+ TRUE: enabled
+ FALSE: disabled
+ Default: TRUE
+
flowlabel_state_ranges - BOOLEAN
Split the flow label number space into two ranges. 0-0x7FFFF is
reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 987cc45..e55f851 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -30,6 +30,7 @@ struct netns_sysctl_ipv6 {
int ip6_rt_min_advmss;
int flowlabel_consistency;
int auto_flowlabels;
+ int consistent_auto_flowlabel;
int icmpv6_time;
int anycast_src_echo_reply;
int ip_nonlocal_bind;
diff --git a/include/net/sock.h b/include/net/sock.h
index b9cb9d2..45e868f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1729,6 +1729,18 @@ static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
}
+static inline
+struct net *sock_net(const struct sock *sk)
+{
+ return read_pnet(&sk->sk_net);
+}
+
+static inline
+void sock_net_set(struct sock *sk, struct net *net)
+{
+ write_pnet(&sk->sk_net, net);
+}
+
static inline void sk_set_txhash(struct sock *sk, u32 hash)
{
sk->sk_txhash = hash;
@@ -1736,7 +1748,9 @@ static inline void sk_set_txhash(struct sock *sk, u32 hash)
static inline void sk_rethink_txhash(struct sock *sk)
{
- if (sk->sk_txhash) {
+ struct net *net = sock_net(sk);
+
+ if (sk->sk_txhash && !net->ipv6.sysctl.consistent_auto_flowlabel) {
u32 v = prandom_u32();
sk->sk_txhash = v ?: 1;
}
@@ -2291,18 +2305,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
__kfree_skb(skb);
}
-static inline
-struct net *sock_net(const struct sock *sk)
-{
- return read_pnet(&sk->sk_net);
-}
-
-static inline
-void sock_net_set(struct sock *sk, struct net *net)
-{
- write_pnet(&sk->sk_net, net);
-}
-
static inline struct sock *skb_steal_sock(struct sk_buff *skb)
{
if (skb->sk) {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c26f712..fe9b312 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -807,6 +807,7 @@ static int __net_init inet6_net_init(struct net *net)
net->ipv6.sysctl.icmpv6_time = 1*HZ;
net->ipv6.sysctl.flowlabel_consistency = 1;
net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
+ net->ipv6.sysctl.consistent_auto_flowlabel = 1;
net->ipv6.sysctl.idgen_retries = 3;
net->ipv6.sysctl.idgen_delay = 1 * HZ;
net->ipv6.sysctl.flowlabel_state_ranges = 0;
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index a789a8a..8908092 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -126,6 +126,13 @@ static struct ctl_table ipv6_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "consistent_auto_flowlabel",
+ .data = &init_net.ipv6.sysctl.consistent_auto_flowlabel,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
@@ -190,6 +197,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt;
ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
+ ipv6_table[14].data = &net->ipv6.sysctl.consistent_auto_flowlabel;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
--
2.9.5
^ permalink raw reply related [flat|nested] 7+ messages in thread