From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org, Eric Dumazet <edumazet@google.com>,
"David S. Miller" <davem@davemloft.net>
Subject: [PATCH 3.10 12/16] ipv4: tcp: get rid of ugly unicast_sock
Date: Tue, 24 Feb 2015 18:10:04 -0800 [thread overview]
Message-ID: <20150225020811.945601333@linuxfoundation.org> (raw)
In-Reply-To: <20150225020811.453644010@linuxfoundation.org>
3.10-stable review patch. If anyone has any objections, please let me know.
------------------
From: Eric Dumazet <edumazet@google.com>
[ Upstream commit bdbbb8527b6f6a358dbcb70dac247034d665b8e4 ]
In commit be9f4a44e7d41 ("ipv4: tcp: remove per net tcp_sock")
I tried to address contention on a socket lock, but the solution
I chose was horrible :
commit 3a7c384ffd57e ("ipv4: tcp: unicast_sock should not land outside
of TCP stack") addressed a selinux regression.
commit 0980e56e506b ("ipv4: tcp: set unicast_sock uc_ttl to -1")
took care of another regression.
commit b5ec8eeac46 ("ipv4: fix ip_send_skb()") fixed another regression.
commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate")
was another shot in the dark.
Really, just use a proper socket per cpu, and remove the skb_orphan()
call, to re-enable flow control.
This solves a serious problem with FQ packet scheduler when used in
hostile environments, as we do not want to allocate a flow structure
for every RST packet sent in response to a spoofed packet.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
include/net/ip.h | 2 +-
include/net/netns/ipv4.h | 1 +
net/ipv4/ip_output.c | 30 +++---------------------------
net/ipv4/tcp_ipv4.c | 37 ++++++++++++++++++++++++++++++++-----
4 files changed, 37 insertions(+), 33 deletions(-)
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -163,7 +163,7 @@ static inline __u8 ip_reply_arg_flowi_fl
return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
}
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
__be32 saddr, const struct ip_reply_arg *arg,
unsigned int len);
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -43,6 +43,7 @@ struct netns_ipv4 {
struct inet_peer_base *peers;
struct tcpm_hash_bucket *tcp_metrics_hash;
unsigned int tcp_metrics_hash_log;
+ struct sock * __percpu *tcp_sk;
struct netns_frags frags;
#ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter;
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1454,24 +1454,8 @@ static int ip_reply_glue_bits(void *dptr
/*
* Generic function to send a packet as reply to another packet.
* Used to send some TCP resets/acks so far.
- *
- * Use a fake percpu inet socket to avoid false sharing and contention.
*/
-static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
- .sk = {
- .__sk_common = {
- .skc_refcnt = ATOMIC_INIT(1),
- },
- .sk_wmem_alloc = ATOMIC_INIT(1),
- .sk_allocation = GFP_ATOMIC,
- .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
- .sk_pacing_rate = ~0U,
- },
- .pmtudisc = IP_PMTUDISC_WANT,
- .uc_ttl = -1,
-};
-
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
__be32 saddr, const struct ip_reply_arg *arg,
unsigned int len)
{
@@ -1479,9 +1463,8 @@ void ip_send_unicast_reply(struct net *n
struct ipcm_cookie ipc;
struct flowi4 fl4;
struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
struct sk_buff *nskb;
- struct sock *sk;
- struct inet_sock *inet;
int err;
if (ip_options_echo(&replyopts.opt.opt, skb))
@@ -1509,15 +1492,11 @@ void ip_send_unicast_reply(struct net *n
if (IS_ERR(rt))
return;
- inet = &get_cpu_var(unicast_sock);
+ inet_sk(sk)->tos = arg->tos;
- inet->tos = arg->tos;
- sk = &inet->sk;
sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
- sock_net_set(sk, net);
- __skb_queue_head_init(&sk->sk_write_queue);
sk->sk_sndbuf = sysctl_wmem_default;
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
len, 0, &ipc, &rt, MSG_DONTWAIT);
@@ -1533,13 +1512,10 @@ void ip_send_unicast_reply(struct net *n
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- skb_orphan(nskb);
skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4);
}
out:
- put_cpu_var(unicast_sock);
-
ip_rt_put(rt);
}
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -707,7 +707,8 @@ static void tcp_v4_send_reset(struct soc
net = dev_net(skb_dst(skb)->dev);
arg.tos = ip_hdr(skb)->tos;
- ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+ ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ skb, ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -790,7 +791,8 @@ static void tcp_v4_send_ack(struct sk_bu
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
- ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+ ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ skb, ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -2898,14 +2900,39 @@ struct proto tcp_prot = {
};
EXPORT_SYMBOL(tcp_prot);
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
+ free_percpu(net->ipv4.tcp_sk);
+}
+
static int __net_init tcp_sk_init(struct net *net)
{
+ int res, cpu;
+
+ net->ipv4.tcp_sk = alloc_percpu(struct sock *);
+ if (!net->ipv4.tcp_sk)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct sock *sk;
+
+ res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+ IPPROTO_TCP, net);
+ if (res)
+ goto fail;
+ *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+ }
net->ipv4.sysctl_tcp_ecn = 2;
return 0;
-}
-static void __net_exit tcp_sk_exit(struct net *net)
-{
+fail:
+ tcp_sk_exit(net);
+
+ return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
next prev parent reply other threads:[~2015-02-25 2:28 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-02-25 2:09 [PATCH 3.10 00/16] 3.10.70-stable review Greg Kroah-Hartman
2015-02-25 2:09 ` [PATCH 3.10 01/16] ip: zero sockaddr returned on error queue Greg Kroah-Hartman
2015-02-25 2:09 ` [PATCH 3.10 02/16] net: rps: fix cpu unplug Greg Kroah-Hartman
2015-02-25 2:09 ` [PATCH 3.10 03/16] ipv6: stop sending PTB packets for MTU < 1280 Greg Kroah-Hartman
2015-02-25 2:09 ` [PATCH 3.10 04/16] netxen: fix netxen_nic_poll() logic Greg Kroah-Hartman
2015-02-25 2:09 ` [PATCH 3.10 05/16] net: sctp: fix slab corruption from use after free on INIT collisions Greg Kroah-Hartman
2015-02-25 2:09 ` [PATCH 3.10 06/16] ipv4: try to cache dst_entries which would cause a redirect Greg Kroah-Hartman
2015-02-25 2:09 ` [PATCH 3.10 07/16] udp_diag: Fix socket skipping within chain Greg Kroah-Hartman
2015-02-25 2:10 ` [PATCH 3.10 08/16] ping: Fix race in free in receive path Greg Kroah-Hartman
2015-02-25 2:10 ` [PATCH 3.10 09/16] ipv6: replacing a rt6_info needs to purge possible propagated rt6_infos too Greg Kroah-Hartman
2015-02-25 2:10 ` [PATCH 3.10 10/16] bridge: dont send notification when skb->len == 0 in rtnl_bridge_notify Greg Kroah-Hartman
2015-02-25 2:10 ` [PATCH 3.10 11/16] tcp: ipv4: initialize unicast_sock sk_pacing_rate Greg Kroah-Hartman
2015-02-25 2:10 ` Greg Kroah-Hartman [this message]
2015-02-25 2:10 ` [PATCH 3.10 13/16] ppp: deflate: never return len larger than output buffer Greg Kroah-Hartman
2015-02-25 2:10 ` [PATCH 3.10 14/16] net: sctp: fix passing wrong parameter header to param_type2af in sctp_process_param Greg Kroah-Hartman
2015-02-25 2:10 ` [PATCH 3.10 15/16] [media] media/rc: Send sync space information on the lirc device Greg Kroah-Hartman
2015-02-25 2:10 ` [PATCH 3.10 16/16] rbd: drop an unsafe assertion Greg Kroah-Hartman
2015-02-25 16:44 ` [PATCH 3.10 00/16] 3.10.70-stable review Guenter Roeck
2015-02-25 20:55 ` Shuah Khan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150225020811.945601333@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=linux-kernel@vger.kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.