From mboxrd@z Thu Jan 1 00:00:00 1970
From: Jesper Dangaard Brouer
Subject: [RFC v2 PATCH 2/3] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
Date: Thu, 31 May 2012 15:40:03 +0200
Message-ID: <20120531134003.10311.14051.stgit@localhost.localdomain>
References: <20120531133807.10311.79711.stgit@localhost.localdomain>
Mime-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 7bit
Cc: Florian Westphal, Hans Schillstrom
To: Jesper Dangaard Brouer, netdev@vger.kernel.org, Christoph Paasch, Eric Dumazet, "David S. Miller", Martin Topholm
Return-path:
Received: from 0304ds2-fs.1.fullrate.dk ([89.150.128.48]:11040 "EHLO firesoul.localdomain" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1758032Ab2EaNhn (ORCPT ); Thu, 31 May 2012 09:37:43 -0400
In-Reply-To: <20120531133807.10311.79711.stgit@localhost.localdomain>
Sender: netdev-owner@vger.kernel.org
List-ID:

TCP SYN handling is on the slow path via tcp_v4_rcv(), and is performed
while holding the spinlock bh_lock_sock(). Real-life and testlab
experiments show that the kernel chokes when SYN floods reach 130Kpps
(on a powerful 16-core Nehalem). Measuring with perf reveals that this
is caused by the bh_lock_sock_nested() call in tcp_v4_rcv().

With this patch, the machine can handle 750Kpps (the maximum of the SYN
flood generator) with cycles to spare; CPU load on the big machine
dropped from 100% to 1%.

Notice that we only handle SYN cookies early on; normal SYN packets are
still processed under bh_lock_sock().

V2:
 - Check for existing connection request (reqsk)
 - Avoid (unlikely) variable race in tcp_make_synack for tcp_full_space(sk)

Signed-off-by: Martin Topholm
Signed-off-by: Jesper Dangaard Brouer
---
 net/ipv4/tcp_ipv4.c   |   48 +++++++++++++++++++++++++++++++++++++++++-------
 net/ipv4/tcp_output.c |   20 ++++++++++++++------
 2 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ed9d35a..29e9c4a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1274,8 +1274,10 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
  */
 int tcp_v4_syn_conn_limit(struct sock *sk, struct sk_buff *skb)
 {
-	struct request_sock *req;
+	struct request_sock *req = NULL;
 	struct inet_request_sock *ireq;
+	struct request_sock *exist_req;
+	struct request_sock **prev;
 	struct tcp_options_received tmp_opt;
 	__be32 saddr = ip_hdr(skb)->saddr;
 	__be32 daddr = ip_hdr(skb)->daddr;
@@ -1290,7 +1292,10 @@ int tcp_v4_syn_conn_limit(struct sock *sk, struct sk_buff *skb)
 	if (isn)
 		goto no_limit;
 
-	/* Start sending SYN cookies when request sock queue is full*/
+	/* Start sending SYN cookies when the request sock queue is full
+	 * - We should hold the lock while checking whether the queue is
+	 *   full, but we don't need a precise/exact threshold here.
+	 */
 	if (!inet_csk_reqsk_queue_is_full(sk))
 		goto no_limit;
 
@@ -1300,6 +1305,29 @@ int tcp_v4_syn_conn_limit(struct sock *sk, struct sk_buff *skb)
 	if (!tcp_syn_flood_action(sk, skb, "TCP"))
 		goto drop; /* Not enabled, indicate drop, due to queue full */
 
+	/* Check for an existing connection request (reqsk), as this
+	 * might be a retransmitted SYN which has already gotten into
+	 * the reqsk_queue.  If so, we choose to drop the reqsk and use
+	 * SYN cookies to restore the state later, even though this can
+	 * cause issues if the original SYN/ACK didn't get dropped but
+	 * was merely delayed in the network, and the SYN-retransmission
+	 * timer on the client side fires before the SYN/ACK reaches
+	 * the client.
+	 * We choose to neglect this situation, as we are under attack
+	 * and don't want to open an attack vector by falling back to
+	 * the slow locked path.
+	 */
+	bh_lock_sock(sk);
+	exist_req = inet_csk_search_req(sk, &prev, tcp_hdr(skb)->source, saddr, daddr);
+	if (exist_req) { /* Drop existing reqsk */
+		if (TCP_SKB_CB(skb)->seq == tcp_rsk(exist_req)->rcv_isn)
+			net_warn_ratelimited("Retransmitted SYN from %pI4"
+					     " (orig reqsk dropped)", &saddr);
+
+		inet_csk_reqsk_queue_drop(sk, exist_req, prev);
+	}
+	bh_unlock_sock(sk);
+
 	/* Allocate a request_sock */
 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
 	if (!req) {
@@ -1331,6 +1359,7 @@ int tcp_v4_syn_conn_limit(struct sock *sk, struct sk_buff *skb)
 	ireq->no_srccheck = inet_sk(sk)->transparent;
 	ireq->opt = tcp_v4_save_options(sk, skb);
 
+	/* Consider taking the lock here; cannot determine the security module's behavior */
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
@@ -1345,7 +1374,10 @@ int tcp_v4_syn_conn_limit(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
-	/* Send SYN-ACK containing cookie */
+	/* Send SYN-ACK containing the cookie
+	 * - tcp_v4_send_synack() handles allocating a dst route cache
+	 *   entry, but also releases it immediately afterwards.
+	 */
 	tcp_v4_send_synack(sk, NULL, req, NULL);
 
 drop_and_free:
@@ -1382,10 +1414,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
-	/* SYN cookie handling */
-	if (tcp_v4_syn_conn_limit(sk, skb))
-		goto drop;
-
 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
 	if (!req)
 		goto drop;
@@ -1792,6 +1820,12 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	if (!sk)
 		goto no_tcp_socket;
 
+	/* Early and parallel SYN limit check that sends syncookies */
+	if (sk->sk_state == TCP_LISTEN && th->syn && !th->ack && !th->fin) {
+		if (tcp_v4_syn_conn_limit(sk, skb))
+			goto discard_and_relse;
+	}
+
 process:
 	if (sk->sk_state == TCP_TIME_WAIT)
 		goto do_time_wait;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 803cbfe..81fd4fc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2458,6 +2458,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	int tcp_header_size;
 	int mss;
 	int s_data_desired = 0;
+	int tcp_full_space_val;
 
 	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
 		s_data_desired = cvp->s_data_desired;
@@ -2479,13 +2480,16 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	/* Set this up on the first call only */
 	req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
 
+	/* Instruct the compiler not to do additional loads */
+	ACCESS_ONCE(tcp_full_space_val) = tcp_full_space(sk);
+
 	/* limit the window selection if the user enforce a smaller rx buffer */
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
-	    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
-		req->window_clamp = tcp_full_space(sk);
+	    (req->window_clamp > tcp_full_space_val || req->window_clamp == 0))
+		req->window_clamp = tcp_full_space_val;
 
 	/* tcp_full_space because it is guaranteed to be the first packet */
-	tcp_select_initial_window(tcp_full_space(sk),
+	tcp_select_initial_window(tcp_full_space_val,
 		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
 		&req->rcv_wnd,
 		&req->window_clamp,
@@ -2582,6 +2586,7 @@ void tcp_connect_init(struct sock *sk)
 {
 	const struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	int tcp_full_space_val;
 	__u8 rcv_wscale;
 
 	/* We'll fix this up when we get a response from the other end.
@@ -2610,12 +2615,15 @@ void tcp_connect_init(struct sock *sk)
 
 	tcp_initialize_rcv_mss(sk);
 
+	/* Instruct the compiler not to do additional loads */
+	ACCESS_ONCE(tcp_full_space_val) = tcp_full_space(sk);
+
 	/* limit the window selection if the user enforce a smaller rx buffer */
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
-	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
-		tp->window_clamp = tcp_full_space(sk);
+	    (tp->window_clamp > tcp_full_space_val || tp->window_clamp == 0))
+		tp->window_clamp = tcp_full_space_val;
 
-	tcp_select_initial_window(tcp_full_space(sk),
+	tcp_select_initial_window(tcp_full_space_val,
 		tp->advmss - (tp->rx_opt.ts_recent_stamp ?
 			tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 		&tp->rcv_wnd,
 		&tp->window_clamp,