* [RFC v2 PATCH 1/3] tcp: extract syncookie part of tcp_v4_conn_request()
From: Jesper Dangaard Brouer @ 2012-05-31 13:39 UTC (permalink / raw)
To: Jesper Dangaard Brouer, netdev, Christoph Paasch, Eric Dumazet,
David S. Miller, Martin Topholm
Cc: Florian Westphal, Hans Schillstrom
In-Reply-To: <20120531133807.10311.79711.stgit@localhost.localdomain>
From: Jesper Dangaard Brouer <jbrouer@redhat.com>
Place SYN cookie handling, from tcp_v4_conn_request() into seperate
function, named tcp_v4_syn_conn_limit(). The semantics should be
almost the same.
Besides code cleanup, this patch is preparing for handling SYN cookie
in an ealier step, to avoid a spinlock and achive parallel processing.
Signed-off-by: Martin Topholm <mph@hoth.dk>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
net/ipv4/tcp_ipv4.c | 122 +++++++++++++++++++++++++++++++++++++++++----------
1 files changed, 98 insertions(+), 24 deletions(-)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a43b87d..ed9d35a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1268,6 +1268,95 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
};
#endif
+/* Check SYN connect limit and send SYN-ACK cookies
+ * - Return 0 = No limitation needed, continue processing
+ * - Return 1 = Stop processing, free SKB, SYN cookie send (if enabled)
+ */
+int tcp_v4_syn_conn_limit(struct sock *sk, struct sk_buff *skb)
+{
+ struct request_sock *req;
+ struct inet_request_sock *ireq;
+ struct tcp_options_received tmp_opt;
+ __be32 saddr = ip_hdr(skb)->saddr;
+ __be32 daddr = ip_hdr(skb)->daddr;
+ __u32 isn = TCP_SKB_CB(skb)->when;
+ const u8 *hash_location; /* No really used */
+
+ /* Never answer to SYNs send to broadcast or multicast */
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto drop;
+
+ /* If "isn" is not zero, this request hit alive timewait bucket */
+ if (isn)
+ goto no_limit;
+
+ /* Start sending SYN cookies when request sock queue is full*/
+ if (!inet_csk_reqsk_queue_is_full(sk))
+ goto no_limit;
+
+ /* Check if SYN cookies are enabled
+ * - Side effect: NET_INC_STATS_BH counters + printk logging
+ */
+ if (!tcp_syn_flood_action(sk, skb, "TCP"))
+ goto drop; /* Not enabled, indicate drop, due to queue full */
+
+ /* Allocate a request_sock */
+ req = inet_reqsk_alloc(&tcp_request_sock_ops);
+ if (!req) {
+ net_warn_ratelimited ("%s: Could not alloc request_sock"
+ ", drop conn from %pI4",
+ __func__, &saddr);
+ goto drop;
+ }
+
+#ifdef CONFIG_TCP_MD5SIG
+ tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
+#endif
+
+ tcp_clear_options(&tmp_opt);
+ tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
+ tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+
+ if (!tmp_opt.saw_tstamp)
+ tcp_clear_options(&tmp_opt);
+
+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+ tcp_openreq_init(req, &tmp_opt, skb);
+
+ /* Update req as an inet_request_sock (typecast trick)*/
+ ireq = inet_rsk(req);
+ ireq->loc_addr = daddr;
+ ireq->rmt_addr = saddr;
+ ireq->no_srccheck = inet_sk(sk)->transparent;
+ ireq->opt = tcp_v4_save_options(sk, skb);
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto drop_and_free;
+
+ /* Cookie support for ECN if TCP timestamp option avail */
+ if (tmp_opt.tstamp_ok)
+ TCP_ECN_create_request(req, skb);
+
+ /* Encode cookie in InitialSeqNum of SYN-ACK packet */
+ isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+ req->cookie_ts = tmp_opt.tstamp_ok;
+
+ tcp_rsk(req)->snt_isn = isn;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+
+ /* Send SYN-ACK containing cookie */
+ tcp_v4_send_synack(sk, NULL, req, NULL);
+
+drop_and_free:
+ reqsk_free(req);
+drop:
+ return 1;
+no_limit:
+ return 0;
+}
+
+/* Handle SYN request */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct tcp_extend_values tmp_ext;
@@ -1280,22 +1369,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
- bool want_cookie = false;
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
- /* TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
- if (!want_cookie)
- goto drop;
- }
-
/* Accept backlog is full. If we have already queued enough
* of warm entries in syn queue, drop request. It is better than
* clogging syn queue with openreqs with exponentially increasing
@@ -1304,6 +1382,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
+ /* SYN cookie handling */
+ if (tcp_v4_syn_conn_limit(sk, skb))
+ goto drop;
+
req = inet_reqsk_alloc(&tcp_request_sock_ops);
if (!req)
goto drop;
@@ -1317,6 +1399,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tmp_opt.user_mss = tp->rx_opt.user_mss;
tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ /* Handle RFC6013 - TCP Cookie Transactions (TCPCT) options */
if (tmp_opt.cookie_plus > 0 &&
tmp_opt.saw_tstamp &&
!tp->rx_opt.cookie_out_never &&
@@ -1339,7 +1422,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
while (l-- > 0)
*c++ ^= *hash_location++;
- want_cookie = false; /* not our kind of cookie */
tmp_ext.cookie_out_never = 0; /* false */
tmp_ext.cookie_plus = tmp_opt.cookie_plus;
} else if (!tp->rx_opt.cookie_in_always) {
@@ -1351,12 +1433,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
}
tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
- if (want_cookie && !tmp_opt.saw_tstamp)
- tcp_clear_options(&tmp_opt);
-
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
+ /* Update req as an inet_request_sock (typecast trick)*/
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
@@ -1366,13 +1446,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
- if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, skb);
+ TCP_ECN_create_request(req, skb);
- if (want_cookie) {
- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
- req->cookie_ts = tmp_opt.tstamp_ok;
- } else if (!isn) {
+ if (!isn) {
struct inet_peer *peer = NULL;
struct flowi4 fl4;
@@ -1422,8 +1498,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_rsk(req)->snt_synack = tcp_time_stamp;
if (tcp_v4_send_synack(sk, dst, req,
- (struct request_values *)&tmp_ext) ||
- want_cookie)
+ (struct request_values *)&tmp_ext))
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
@@ -1438,7 +1513,6 @@ drop:
}
EXPORT_SYMBOL(tcp_v4_conn_request);
-
/*
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
^ permalink raw reply related
* [RFC v2 PATCH 0/3] tcp: Parallel SYN brownies patch series to mitigate SYN floods
From: Jesper Dangaard Brouer @ 2012-05-31 13:39 UTC (permalink / raw)
To: Jesper Dangaard Brouer, netdev, Christoph Paasch, Eric Dumazet,
David S. Miller, Martin Topholm
Cc: Florian Westphal, Hans Schillstrom
The following series is dubbed SYN brownies. The purpose is mitigate
the effect of SYN flood DDoS attacks. This is done by making the SYN
cookies stage parallel. In normal (non-overload) situations SYN
packets are still processed under the bh_lock_sock().
This SYN brownies patch series will not be merged right away, as Eric
Dumazet is working on a fully parallel SYN stage. Until that emerges
and gets integrated, I recommend people with SYN flood issues, to use
these patches to fix your immediate overload situations.
Thus, these patches can only be merged at Eric Dumazet's will/ACK, if
he determines they don't conflict with his work.
Only IPv4 TCP is handled here. The IPv6 TCP code also need to be
updated, but I'll deal with that part after, Eric Dumazet, have
settled on a fully parallel SYN processing stage.
This is patch set have been tested on top Linus'es tree of
commit v3.4-9209-gd590f9a.
---
Jesper Dangaard Brouer (3):
tcp: SYN retransmits, fallback to slow-locked/no-cookie path
tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
tcp: extract syncookie part of tcp_v4_conn_request()
net/ipv4/tcp_ipv4.c | 154 +++++++++++++++++++++++++++++++++++++++++--------
net/ipv4/tcp_output.c | 20 ++++--
2 files changed, 144 insertions(+), 30 deletions(-)
^ permalink raw reply
* Re: [RFC PATCH 0/2] Faster/parallel SYN handling to mitigate SYN floods
From: Jesper Dangaard Brouer @ 2012-05-31 13:24 UTC (permalink / raw)
To: Eric Dumazet
Cc: christoph.paasch, netdev, David S. Miller, Martin Topholm,
Florian Westphal, Hans Schillstrom, Andi Kleen
In-Reply-To: <1338469811.2760.1345.camel@edumazet-glaptop>
On Thu, 2012-05-31 at 15:10 +0200, Eric Dumazet wrote:
> On Thu, 2012-05-31 at 14:58 +0200, Eric Dumazet wrote:
>
> >
> > How many different IP addresses are used by your generator ?
> >
> > Or maybe you disabled IP route cache ?
>
> With no route cache problems, I sustain 4 us per SYN packet, if all load
> serviced by one cpu only.
Yes that is also my experience, in this SYN-flood scenario one CPU does
a lot better. My old home brew AMD quad-core CPU also outperform, the
big testlab machine dual socket quad-core Nehalem.
The route cache problem, should not be too big with my SYN cookie
solution. I think... as tcp_v4_send_synack() handles alloc of a dst
route cache, but also releases it immediately afterwards.
How do you/I measure the usec per packet?
How do I disable the route cache?
What test tools do you use?
(I have modified pktgen to send TCP SYN packets)
(ps. I'll post my updated patch series, in a bit, and then I'll try not
to disturb your work on the fully parallel solution).
> perf profile is : (I have CONFIG_DEBUG_PAGEALLOC=y)
>
> + 9,55% ksoftirqd/0 [kernel.kallsyms] [k] sha_transform
> + 3,56% ksoftirqd/0 [kernel.kallsyms] [k] ip_route_input_common
> + 3,40% ksoftirqd/0 [kernel.kallsyms] [k] __ip_route_output_key
> + 3,28% ksoftirqd/0 [kernel.kallsyms] [k] __inet_lookup_established
> + 3,13% ksoftirqd/0 [kernel.kallsyms] [k] tg3_poll_work
> + 2,68% ksoftirqd/0 [kernel.kallsyms] [k] tcp_make_synack
> + 2,67% ksoftirqd/0 [kernel.kallsyms] [k] __netif_receive_skb
> + 2,51% ksoftirqd/0 [kernel.kallsyms] [k] ipt_do_table
> + 2,17% ksoftirqd/0 [kernel.kallsyms] [k] memcpy
> + 1,99% ksoftirqd/0 [kernel.kallsyms] [k] kernel_map_pages
> + 1,96% ksoftirqd/0 [kernel.kallsyms] [k] inet_csk_search_req
> + 1,69% ksoftirqd/0 [kernel.kallsyms] [k] tg3_recycle_rx.isra.36
> + 1,63% ksoftirqd/0 [kernel.kallsyms] [k] kmem_cache_free
> + 1,61% ksoftirqd/0 [kernel.kallsyms] [k] copy_user_generic_string
> + 1,49% ksoftirqd/0 [kernel.kallsyms] [k] kmem_cache_alloc
> + 1,47% ksoftirqd/0 [kernel.kallsyms] [k] ip_rcv
> + 1,11% ksoftirqd/0 [kernel.kallsyms] [k] tcp_v4_conn_request
> + 1,07% ksoftirqd/0 [kernel.kallsyms] [k] nf_iterate
> + 1,07% swapper [kernel.kallsyms] [k] sha_transform
> + 1,05% ksoftirqd/0 [kernel.kallsyms] [k] kfree
> + 1,05% ksoftirqd/0 [kernel.kallsyms] [k] skb_release_data
> + 0,99% ksoftirqd/0 [kernel.kallsyms] [k] __alloc_skb
> + 0,98% ksoftirqd/0 [kernel.kallsyms] [k] __kmalloc_node_track_caller
> + 0,97% ksoftirqd/0 [kernel.kallsyms] [k] netdev_alloc_frag
> + 0,96% ksoftirqd/0 [kernel.kallsyms] [k] dev_gro_receive
> + 0,94% ksoftirqd/0 [kernel.kallsyms] [k] inet_gro_receive
> + 0,85% ksoftirqd/0 [kernel.kallsyms] [k] build_skb
> + 0,85% ksoftirqd/0 [kernel.kallsyms] [k] cookie_v4_init_sequence
> + 0,85% ksoftirqd/0 [kernel.kallsyms] [k] ip_build_and_send_pkt
> + 0,84% ksoftirqd/0 [kernel.kallsyms] [k] __copy_skb_header
> + 0,82% ksoftirqd/0 [kernel.kallsyms] [k] nf_hook_slow
> + 0,77% ksoftirqd/0 [kernel.kallsyms] [k] __skb_clone
> + 0,73% ksoftirqd/0 [kernel.kallsyms] [k] tcp_v4_rcv
> + 0,72% ksoftirqd/0 [kernel.kallsyms] [k] xfrm_lookup
> + 0,69% ksoftirqd/0 [kernel.kallsyms] [k] dev_hard_start_xmit
> + 0,68% ksoftirqd/0 [kernel.kallsyms] [k] local_bh_enable
> + 0,67% ksoftirqd/0 [kernel.kallsyms] [k] tcp_gro_receive
> + 0,67% ksoftirqd/0 [kernel.kallsyms] [k] kfree_skb
> + 0,67% ksoftirqd/0 [kernel.kallsyms] [k] __probe_kernel_read
> + 0,67% ksoftirqd/0 [kernel.kallsyms] [k] skb_release_head_state
> + 0,66% ksoftirqd/0 [kernel.kallsyms] [k] __phys_addr
> + 0,66% ksoftirqd/0 [kernel.kallsyms] [k] ip_finish_output
> + 0,65% ksoftirqd/0 [kernel.kallsyms] [k] dst_release
> + 0,64% ksoftirqd/0 [kernel.kallsyms] [k] __ip_local_out
> + 0,61% ksoftirqd/0 [kernel.kallsyms] [k] packet_rcv_spkt
> + 0,57% ksoftirqd/0 [kernel.kallsyms] [k] __kfree_skb
^ permalink raw reply
* Re: [RFC PATCH 0/2] Faster/parallel SYN handling to mitigate SYN floods
From: Eric Dumazet @ 2012-05-31 13:10 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: christoph.paasch, netdev, David S. Miller, Martin Topholm,
Florian Westphal, Hans Schillstrom, Andi Kleen
In-Reply-To: <1338469100.2760.1341.camel@edumazet-glaptop>
On Thu, 2012-05-31 at 14:58 +0200, Eric Dumazet wrote:
>
> How many different IP addresses are used by your generator ?
>
> Or maybe you disabled IP route cache ?
With no route cache problems, I sustain 4 us per SYN packet, if all load
serviced by one cpu only.
perf profile is : (I have CONFIG_DEBUG_PAGEALLOC=y)
+ 9,55% ksoftirqd/0 [kernel.kallsyms] [k] sha_transform
+ 3,56% ksoftirqd/0 [kernel.kallsyms] [k] ip_route_input_common
+ 3,40% ksoftirqd/0 [kernel.kallsyms] [k] __ip_route_output_key
+ 3,28% ksoftirqd/0 [kernel.kallsyms] [k] __inet_lookup_established
+ 3,13% ksoftirqd/0 [kernel.kallsyms] [k] tg3_poll_work
+ 2,68% ksoftirqd/0 [kernel.kallsyms] [k] tcp_make_synack
+ 2,67% ksoftirqd/0 [kernel.kallsyms] [k] __netif_receive_skb
+ 2,51% ksoftirqd/0 [kernel.kallsyms] [k] ipt_do_table
+ 2,17% ksoftirqd/0 [kernel.kallsyms] [k] memcpy
+ 1,99% ksoftirqd/0 [kernel.kallsyms] [k] kernel_map_pages
+ 1,96% ksoftirqd/0 [kernel.kallsyms] [k] inet_csk_search_req
+ 1,69% ksoftirqd/0 [kernel.kallsyms] [k] tg3_recycle_rx.isra.36
+ 1,63% ksoftirqd/0 [kernel.kallsyms] [k] kmem_cache_free
+ 1,61% ksoftirqd/0 [kernel.kallsyms] [k] copy_user_generic_string
+ 1,49% ksoftirqd/0 [kernel.kallsyms] [k] kmem_cache_alloc
+ 1,47% ksoftirqd/0 [kernel.kallsyms] [k] ip_rcv
+ 1,11% ksoftirqd/0 [kernel.kallsyms] [k] tcp_v4_conn_request
+ 1,07% ksoftirqd/0 [kernel.kallsyms] [k] nf_iterate
+ 1,07% swapper [kernel.kallsyms] [k] sha_transform
+ 1,05% ksoftirqd/0 [kernel.kallsyms] [k] kfree
+ 1,05% ksoftirqd/0 [kernel.kallsyms] [k] skb_release_data
+ 0,99% ksoftirqd/0 [kernel.kallsyms] [k] __alloc_skb
+ 0,98% ksoftirqd/0 [kernel.kallsyms] [k] __kmalloc_node_track_caller
+ 0,97% ksoftirqd/0 [kernel.kallsyms] [k] netdev_alloc_frag
+ 0,96% ksoftirqd/0 [kernel.kallsyms] [k] dev_gro_receive
+ 0,94% ksoftirqd/0 [kernel.kallsyms] [k] inet_gro_receive
+ 0,85% ksoftirqd/0 [kernel.kallsyms] [k] build_skb
+ 0,85% ksoftirqd/0 [kernel.kallsyms] [k] cookie_v4_init_sequence
+ 0,85% ksoftirqd/0 [kernel.kallsyms] [k] ip_build_and_send_pkt
+ 0,84% ksoftirqd/0 [kernel.kallsyms] [k] __copy_skb_header
+ 0,82% ksoftirqd/0 [kernel.kallsyms] [k] nf_hook_slow
+ 0,77% ksoftirqd/0 [kernel.kallsyms] [k] __skb_clone
+ 0,73% ksoftirqd/0 [kernel.kallsyms] [k] tcp_v4_rcv
+ 0,72% ksoftirqd/0 [kernel.kallsyms] [k] xfrm_lookup
+ 0,69% ksoftirqd/0 [kernel.kallsyms] [k] dev_hard_start_xmit
+ 0,68% ksoftirqd/0 [kernel.kallsyms] [k] local_bh_enable
+ 0,67% ksoftirqd/0 [kernel.kallsyms] [k] tcp_gro_receive
+ 0,67% ksoftirqd/0 [kernel.kallsyms] [k] kfree_skb
+ 0,67% ksoftirqd/0 [kernel.kallsyms] [k] __probe_kernel_read
+ 0,67% ksoftirqd/0 [kernel.kallsyms] [k] skb_release_head_state
+ 0,66% ksoftirqd/0 [kernel.kallsyms] [k] __phys_addr
+ 0,66% ksoftirqd/0 [kernel.kallsyms] [k] ip_finish_output
+ 0,65% ksoftirqd/0 [kernel.kallsyms] [k] dst_release
+ 0,64% ksoftirqd/0 [kernel.kallsyms] [k] __ip_local_out
+ 0,61% ksoftirqd/0 [kernel.kallsyms] [k] packet_rcv_spkt
+ 0,57% ksoftirqd/0 [kernel.kallsyms] [k] __kfree_skb
^ permalink raw reply
* Re: [RFC PATCH 0/2] Faster/parallel SYN handling to mitigate SYN floods
From: Jesper Dangaard Brouer @ 2012-05-31 13:04 UTC (permalink / raw)
To: Eric Dumazet
Cc: christoph.paasch, netdev, David S. Miller, Martin Topholm,
Florian Westphal, Hans Schillstrom, Andi Kleen
In-Reply-To: <1338469100.2760.1341.camel@edumazet-glaptop>
On Thu, 2012-05-31 at 14:58 +0200, Eric Dumazet wrote:
> On Thu, 2012-05-31 at 14:51 +0200, Jesper Dangaard Brouer wrote:
> > On Thu, 2012-05-31 at 00:40 +0200, Jesper Dangaard Brouer wrote:
> > > That seems like a very unlikely situation, which we perhaps should
> > > neglect as we are under SYN attack.
> > >
> > > I will test the attack vector, if we instead of dropping the reqsk,
> > > fall back into the slow locked path.
> >
> > I can provoke this attack vector, and performance is worse, if not
> > dropping the reqsk early.
> >
> > Generator SYN flood at 750Kpps, sending false retransmits mixture.
> >
> > - With early drop: 406 Kpps
> > - With return to locked processing: 251 Kpps
> >
> > Its still better than the approx 150Kpps, without any patches.
> >
>
> How many different IP addresses are used by your generator ?
In this attack I reduced the IPs to 255, and also the source port
numbers, and then simply cloned some of the SKBs. But normally I use
65535 IPs 198.18.0.0/16 (the range reserved for benchmarking).
> Or maybe you disabled IP route cache ?
Why do you think I have disabled the IP dst route cache?
^ permalink raw reply
* Re: [RFC PATCH 0/2] Faster/parallel SYN handling to mitigate SYN floods
From: Eric Dumazet @ 2012-05-31 12:58 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: christoph.paasch, netdev, David S. Miller, Martin Topholm,
Florian Westphal, Hans Schillstrom, Andi Kleen
In-Reply-To: <1338468693.7747.162.camel@localhost>
On Thu, 2012-05-31 at 14:51 +0200, Jesper Dangaard Brouer wrote:
> On Thu, 2012-05-31 at 00:40 +0200, Jesper Dangaard Brouer wrote:
> > That seems like a very unlikely situation, which we perhaps should
> > neglect as we are under SYN attack.
> >
> > I will test the attack vector, if we instead of dropping the reqsk,
> > fall back into the slow locked path.
>
> I can provoke this attack vector, and performance is worse, if not
> dropping the reqsk early.
>
> Generator SYN flood at 750Kpps, sending false retransmits mixture.
>
> - With early drop: 406 Kpps
> - With return to locked processing: 251 Kpps
>
> Its still better than the approx 150Kpps, without any patches.
>
How many different IP addresses are used by your generator ?
Or maybe you disabled IP route cache ?
^ permalink raw reply
* Re: [RFC PATCH 0/2] Faster/parallel SYN handling to mitigate SYN floods
From: Jesper Dangaard Brouer @ 2012-05-31 12:51 UTC (permalink / raw)
To: christoph.paasch
Cc: netdev, Eric Dumazet, David S. Miller, Martin Topholm,
Florian Westphal, Hans Schillstrom, Andi Kleen
In-Reply-To: <1338417630.7747.156.camel@localhost>
On Thu, 2012-05-31 at 00:40 +0200, Jesper Dangaard Brouer wrote:
> That seems like a very unlikely situation, which we perhaps should
> neglect as we are under SYN attack.
>
> I will test the attack vector, if we instead of dropping the reqsk,
> fall back into the slow locked path.
I can provoke this attack vector, and performance is worse, if not
dropping the reqsk early.
Generator SYN flood at 750Kpps, sending false retransmits mixture.
- With early drop: 406 Kpps
- With return to locked processing: 251 Kpps
Its still better than the approx 150Kpps, without any patches.
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Sr. Network Kernel Developer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: [PATCH 1/3] drivers/net: Convert compare_ether_addr to ether_addr_equal
From: Jussi Kivilinna @ 2012-05-31 12:01 UTC (permalink / raw)
To: David Miller; +Cc: joe, linville, netdev, linux-kernel, linux-wireless
In-Reply-To: <20120510.123353.1458740731067514606.davem@davemloft.net>
Quoting David Miller <davem@davemloft.net>:
> From: Joe Perches <joe@perches.com>
> Date: Thu, 10 May 2012 09:11:28 -0700
>
>> (cc's trimmed)
>>
>> On Thu, 2012-05-10 at 17:32 +0300, Jussi Kivilinna wrote:
>>> Quoting Joe Perches <joe@perches.com>:
>>> > Use the new bool function ether_addr_equal to add
>>> > some clarity and reduce the likelihood for misuse
>>> > of compare_ether_addr for sorting.
>> []
>>> > diff --git a/drivers/net/wireless/rndis_wlan.c
>> []
>>> > @@ -2139,7 +2139,7 @@ resize_buf:
>>> > while (check_bssid_list_item(bssid, bssid_len, buf, len)) {
>>> > if (rndis_bss_info_update(usbdev, bssid) && match_bssid &&
>>> > matched) {
>>> > - if (compare_ether_addr(bssid->mac, match_bssid))
>>> > + if (!ether_addr_equal(bssid->mac, match_bssid))
>>>
>>> While reviewing this, noticed that above original code is wrong. It
>>> should be !compare_ether_addr. So do I push patch fixing this through
>>> wireless-testing althought it will later cause conflict with this patch?
>>>
>>> -Jussi
>>>
>>> > *matched = true;
>>> > }
>>> >
>>
>> Up to John.
>>
>> Here's the patch I would send against net-next
>> updating the test and the style a little.
>
> I think in this specific case it's better to push this one directly
> through net-next. But yes, it's up to John.
>
It's been some time now, and I think it's ok to wait until
wireless-testing has
this patch merged and then fix it there.
That line/compare was added as response to hardware bug, where bssid-list does
not contain BSSID and other information of currently connected AP
(spec insists
that device must provide this information in the list when connected). Lack
bssid-data on current connection then causes WARN_ON somewhere in cfg80211.
Workaround was to check if bssid-list returns current bssid and if it
does not,
manually construct bssid information in other ways. And this
workaround worked,
with inverse check. Which must mean that when hardware is experiencing the
problem, it's actually returning empty bssid-list.
Inverse check causes workaround be activated when bssid-list returns only
entry, currently connected BSSID. That does not cause problems in itself, just
slightly more inaccurate information in scan-list.
-Jussi
^ permalink raw reply
* [PATCH] r8169: call netif_napi_del at errpaths and at driver unload
From: Devendra Naga @ 2012-05-31 11:51 UTC (permalink / raw)
To: Francois Romieu, netdev, linux-kernel; +Cc: Devendra Naga
when register_netdev fails, the init'ed NAPIs by netif_napi_add must be
deleted with netif_napi_del, and also when driver unloads, it should
delete the NAPI before unregistering netdevice using unregister_netdev.
Signed-off-by: Devendra Naga <devendra.aaru@gmail.com>
---
drivers/net/ethernet/realtek/r8169.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 00b4f56..9757ce3 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -6345,6 +6345,8 @@ static void __devexit rtl_remove_one(struct pci_dev *pdev)
cancel_work_sync(&tp->wk.work);
+ netif_napi_del(&tp->napi);
+
unregister_netdev(dev);
rtl_release_firmware(tp);
@@ -6668,6 +6670,7 @@ out:
return rc;
err_out_msi_4:
+ netif_napi_del(&tp->napi);
rtl_disable_msi(pdev, tp);
iounmap(ioaddr);
err_out_free_res_3:
--
1.7.9.5
^ permalink raw reply related
* Re: [PATCH RFC] virtio-net: remove useless disable on freeze
From: Michael S. Tsirkin @ 2012-05-31 9:04 UTC (permalink / raw)
To: Stephen Rothwell; +Cc: kvm, netdev, linux-kernel, virtualization, Amit Shah
In-Reply-To: <20120531084717.GB32310@redhat.com>
On Thu, May 31, 2012 at 11:47:17AM +0300, Michael S. Tsirkin wrote:
> On Thu, May 31, 2012 at 06:35:08PM +1000, Stephen Rothwell wrote:
> > Hi all,
> >
> > On Wed, 30 May 2012 19:41:47 +0930 Rusty Russell <rusty@rustcorp.com.au> wrote:
> > >
> > > On Mon, 28 May 2012 15:53:25 +0300, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > On Wed, Apr 04, 2012 at 12:19:54PM +0300, Michael S. Tsirkin wrote:
> > > > > disable_cb is just an optimization: it
> > > > > can not guarantee that there are no callbacks.
> > > > >
> > > > > I didn't yet figure out whether a callback
> > > > > in freeze will trigger a bug, but disable_cb
> > > > > won't address it in any case. So let's remove
> > > > > the useless calls as a first step.
> > > > >
> > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > >
> > > > Looks like this isn't in the 3.5 pull request -
> > > > just lost in the shuffle?
> > > > disable_cb is advisory so can't be relied upon.
> > >
> > > I always (try to?) reply as I accept patches.
> > >
> > > This one did slip by, but it's harmless so no need to push AFAICT.
> > >
> > > Applied.
> >
> > This patch exists in two trees in linux-next already ... Davem's net tree
> > (so presumably he will send it to Linus shortly) and Michael's vhost tree
> > (is that tree needed any more?).
>
> Yes and I dropped the patch from there, just did not push yet.
pushed.
There's usually not too much in my tree but it helps me a lot
that it's in linux-next.
> > Presumably it is now also in the rr
> > tree?
> >
> > --
> > Cheers,
> > Stephen Rothwell sfr@canb.auug.org.au
>
>
^ permalink raw reply
* Re: [PATCH RFC] virtio-net: remove useless disable on freeze
From: Michael S. Tsirkin @ 2012-05-31 8:47 UTC (permalink / raw)
To: Stephen Rothwell; +Cc: kvm, netdev, linux-kernel, virtualization, Amit Shah
In-Reply-To: <20120531183508.f426eb25fe1b94139c637348@canb.auug.org.au>
On Thu, May 31, 2012 at 06:35:08PM +1000, Stephen Rothwell wrote:
> Hi all,
>
> On Wed, 30 May 2012 19:41:47 +0930 Rusty Russell <rusty@rustcorp.com.au> wrote:
> >
> > On Mon, 28 May 2012 15:53:25 +0300, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Wed, Apr 04, 2012 at 12:19:54PM +0300, Michael S. Tsirkin wrote:
> > > > disable_cb is just an optimization: it
> > > > can not guarantee that there are no callbacks.
> > > >
> > > > I didn't yet figure out whether a callback
> > > > in freeze will trigger a bug, but disable_cb
> > > > won't address it in any case. So let's remove
> > > > the useless calls as a first step.
> > > >
> > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > >
> > > Looks like this isn't in the 3.5 pull request -
> > > just lost in the shuffle?
> > > disable_cb is advisory so can't be relied upon.
> >
> > I always (try to?) reply as I accept patches.
> >
> > This one did slip by, but it's harmless so no need to push AFAICT.
> >
> > Applied.
>
> This patch exists in two trees in linux-next already ... Davem's net tree
> (so presumably he will send it to Linus shortly) and Michael's vhost tree
> (is that tree needed any more?).
Yes and I dropped the patch from there, just did not push yet.
> Presumably it is now also in the rr
> tree?
>
> --
> Cheers,
> Stephen Rothwell sfr@canb.auug.org.au
^ permalink raw reply
* Re: [RFC PATCH 2/2] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
From: Hans Schillstrom @ 2012-05-31 8:45 UTC (permalink / raw)
To: Eric Dumazet
Cc: Rick Jones, Andi Kleen, Jesper Dangaard Brouer,
Jesper Dangaard Brouer, netdev@vger.kernel.org, Christoph Paasch,
David S. Miller, Martin Topholm, Florian Westphal, Tom Herbert
In-Reply-To: <1338452917.2760.1309.camel@edumazet-glaptop>
On Thursday 31 May 2012 10:28:37 Eric Dumazet wrote:
> On Wed, 2012-05-30 at 14:20 -0700, Rick Jones wrote:
>
> > It may still be high, but a very quick netperf TCP_CC test over loopback
> > on a W3550 system running a 2.6.38 kernel shows:
> >
> > raj@tardy:~/netperf2_trunk/src$ ./netperf -t TCP_CC -l 60 -c -C
> > TCP Connect/Close TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > localhost.localdomain () port 0 AF_INET
> > Local /Remote
> > Socket Size Request Resp. Elapsed Trans. CPU CPU S.dem S.dem
> > Send Recv Size Size Time Rate local remote local remote
> > bytes bytes bytes bytes secs. per sec % % us/Tr us/Tr
> >
> > 16384 87380 1 1 60.00 21515.29 30.68 30.96 57.042 57.557
> > 16384 87380
> >
> > 57 microseconds per "transaction" which in this case is establishing and
> > tearing-down the connection, with nothing else (no data packets) makes
> > 19 microseconds for a SYN seem perhaps not all that beyond the realm of
> > possibility?
>
> Thats a different story, on loopback device (without stressing IP route
> cache by the way)
>
> Your netperf test is a full userspace transactions, and 5 frames per
> transaction. Two sockets creation/destruction, process scheduler
> activations, and not enter syncookie mode.
>
> In case of synflood/(syncookies on), we receive a packet and send one
> from softirq.
>
> One expensive thing might be the md5 to compute the SYNACK sequence.
>
> I suspect other things :
>
> 1) Of course we have to take into account the timer responsible for
> SYNACK retransmits of previously queued requests. Its cost depends on
> the listen backlog. When this timer runs, listen socket is locked.
>
> 2) IP route cache overflows.
> In case of SYNFLOOD, we should not store dst(s) in route cache but
> destroy them immediately.
>
I can see plenty "IPv4: dst cache overflow"
^ permalink raw reply
* Re: [PATCH RFC] virtio-net: remove useless disable on freeze
From: Stephen Rothwell @ 2012-05-31 8:35 UTC (permalink / raw)
To: Rusty Russell
Cc: kvm, Michael S. Tsirkin, netdev, linux-kernel, virtualization,
Amit Shah
In-Reply-To: <87r4u2dllo.fsf@rustcorp.com.au>
[-- Attachment #1.1: Type: text/plain, Size: 1213 bytes --]
Hi all,
On Wed, 30 May 2012 19:41:47 +0930 Rusty Russell <rusty@rustcorp.com.au> wrote:
>
> On Mon, 28 May 2012 15:53:25 +0300, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Wed, Apr 04, 2012 at 12:19:54PM +0300, Michael S. Tsirkin wrote:
> > > disable_cb is just an optimization: it
> > > can not guarantee that there are no callbacks.
> > >
> > > I didn't yet figure out whether a callback
> > > in freeze will trigger a bug, but disable_cb
> > > won't address it in any case. So let's remove
> > > the useless calls as a first step.
> > >
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >
> > Looks like this isn't in the 3.5 pull request -
> > just lost in the shuffle?
> > disable_cb is advisory so can't be relied upon.
>
> I always (try to?) reply as I accept patches.
>
> This one did slip by, but it's harmless so no need to push AFAICT.
>
> Applied.
This patch exists in two trees in linux-next already ... Davem's net tree
(so presumably he will send it to Linus shortly) and Michael's vhost tree
(is that tree needed any more?). Presumably it is now also in the rr
tree?
--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
[-- Attachment #1.2: Type: application/pgp-signature, Size: 836 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC PATCH 2/2] tcp: Early SYN limit and SYN cookie handling to mitigate SYN floods
From: Eric Dumazet @ 2012-05-31 8:28 UTC (permalink / raw)
To: Rick Jones
Cc: Hans Schillstrom, Andi Kleen, Jesper Dangaard Brouer,
Jesper Dangaard Brouer, netdev@vger.kernel.org, Christoph Paasch,
David S. Miller, Martin Topholm, Florian Westphal, Tom Herbert
In-Reply-To: <4FC68F21.1040402@hp.com>
On Wed, 2012-05-30 at 14:20 -0700, Rick Jones wrote:
> It may still be high, but a very quick netperf TCP_CC test over loopback
> on a W3550 system running a 2.6.38 kernel shows:
>
> raj@tardy:~/netperf2_trunk/src$ ./netperf -t TCP_CC -l 60 -c -C
> TCP Connect/Close TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> localhost.localdomain () port 0 AF_INET
> Local /Remote
> Socket Size Request Resp. Elapsed Trans. CPU CPU S.dem S.dem
> Send Recv Size Size Time Rate local remote local remote
> bytes bytes bytes bytes secs. per sec % % us/Tr us/Tr
>
> 16384 87380 1 1 60.00 21515.29 30.68 30.96 57.042 57.557
> 16384 87380
>
> 57 microseconds per "transaction" which in this case is establishing and
> tearing-down the connection, with nothing else (no data packets) makes
> 19 microseconds for a SYN seem perhaps not all that beyond the realm of
> possibility?
Thats a different story, on loopback device (without stressing IP route
cache by the way)
Your netperf test is a full userspace transactions, and 5 frames per
transaction. Two sockets creation/destruction, process scheduler
activations, and not enter syncookie mode.
In case of synflood/(syncookies on), we receive a packet and send one
from softirq.
One expensive thing might be the md5 to compute the SYNACK sequence.
I suspect other things :
1) Of course we have to take into account the timer responsible for
SYNACK retransmits of previously queued requests. Its cost depends on
the listen backlog. When this timer runs, listen socket is locked.
2) IP route cache overflows.
In case of SYNFLOOD, we should not store dst(s) in route cache but
destroy them immediately.
^ permalink raw reply
* Re: [PATCH net 3/3] bql: Avoid possible inconsistent calculation.
From: Eric Dumazet @ 2012-05-31 8:01 UTC (permalink / raw)
To: Hiroaki SHIMODA; +Cc: davem, therbert, denys, netdev
In-Reply-To: <20120531072537.920f0cb0.shimoda.hiroaki@gmail.com>
On Thu, 2012-05-31 at 07:25 +0900, Hiroaki SHIMODA wrote:
> dql->num_queued could change while processing dql_completed().
> To provide consistent calculation, added an on stack variable.
>
> Signed-off-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
> Cc: Tom Herbert <therbert@google.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Denys Fedoryshchenko <denys@visp.net.lb>
> ---
> lib/dynamic_queue_limits.c | 12 +++++++-----
> 1 files changed, 7 insertions(+), 5 deletions(-)
>
> diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
> index 0fafa77..0777c5a 100644
> --- a/lib/dynamic_queue_limits.c
> +++ b/lib/dynamic_queue_limits.c
> @@ -17,16 +17,18 @@
> void dql_completed(struct dql *dql, unsigned int count)
> {
> unsigned int inprogress, prev_inprogress, limit;
> - unsigned int ovlimit, completed;
> + unsigned int ovlimit, completed, num_queued;
> bool all_prev_completed;
>
> + num_queued = ACCESS_ONCE(dql->num_queued);
> +
> /* Can't complete more than what's in queue */
> - BUG_ON(count > dql->num_queued - dql->num_completed);
> + BUG_ON(count > num_queued - dql->num_completed);
>
> completed = dql->num_completed + count;
> limit = dql->limit;
> - ovlimit = POSDIFF(dql->num_queued - dql->num_completed, limit);
> - inprogress = dql->num_queued - completed;
> + ovlimit = POSDIFF(num_queued - dql->num_completed, limit);
> + inprogress = num_queued - completed;
> prev_inprogress = dql->prev_num_queued - dql->num_completed;
> all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued);
>
> @@ -106,7 +108,7 @@ void dql_completed(struct dql *dql, unsigned int count)
> dql->prev_ovlimit = ovlimit;
> dql->prev_last_obj_cnt = dql->last_obj_cnt;
> dql->num_completed = completed;
> - dql->prev_num_queued = dql->num_queued;
> + dql->prev_num_queued = num_queued;
> }
> EXPORT_SYMBOL(dql_completed);
>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Thanks !
(I am wondering if using a sequence would be safer, so that we can
restart the computation in case a concurrent writer changes
'num_queued')
^ permalink raw reply
* Re: [PATCH net 2/3] bql: Avoid unneeded limit decrement.
From: Eric Dumazet @ 2012-05-31 7:59 UTC (permalink / raw)
To: Hiroaki SHIMODA; +Cc: davem, therbert, denys, netdev
In-Reply-To: <20120531072519.16464513.shimoda.hiroaki@gmail.com>
On Thu, 2012-05-31 at 07:25 +0900, Hiroaki SHIMODA wrote:
...
> Signed-off-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
> Cc: Tom Herbert <therbert@google.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Denys Fedoryshchenko <denys@visp.net.lb>
> ---
> lib/dynamic_queue_limits.c | 6 ++++--
> 1 files changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
> index c87eb76..0fafa77 100644
> --- a/lib/dynamic_queue_limits.c
> +++ b/lib/dynamic_queue_limits.c
> @@ -11,12 +11,14 @@
> #include <linux/dynamic_queue_limits.h>
>
> #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
> +#define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)
>
> /* Records completed count and recalculates the queue limit */
> void dql_completed(struct dql *dql, unsigned int count)
> {
> unsigned int inprogress, prev_inprogress, limit;
> - unsigned int ovlimit, all_prev_completed, completed;
> + unsigned int ovlimit, completed;
> + bool all_prev_completed;
>
> /* Can't complete more than what's in queue */
> BUG_ON(count > dql->num_queued - dql->num_completed);
> @@ -26,7 +28,7 @@ void dql_completed(struct dql *dql, unsigned int count)
> ovlimit = POSDIFF(dql->num_queued - dql->num_completed, limit);
> inprogress = dql->num_queued - completed;
> prev_inprogress = dql->prev_num_queued - dql->num_completed;
> - all_prev_completed = POSDIFF(completed, dql->prev_num_queued);
> + all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued);
>
> if ((ovlimit && !inprogress) ||
> (dql->prev_ovlimit && all_prev_completed)) {
Acked-by: Eric Dumazet <edumazet@google.com>
Thanks !
^ permalink raw reply
* Re: [PATCH net 1/3] bql: Fix POSDIFF() to integer overflow aware.
From: Eric Dumazet @ 2012-05-31 7:58 UTC (permalink / raw)
To: Hiroaki SHIMODA; +Cc: davem, therbert, denys, netdev
In-Reply-To: <20120531072439.6c634a0b.shimoda.hiroaki@gmail.com>
On Thu, 2012-05-31 at 07:24 +0900, Hiroaki SHIMODA wrote:
> POSDIFF() fails to take into account integer overflow case.
>
> Signed-off-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
> Cc: Tom Herbert <therbert@google.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Denys Fedoryshchenko <denys@visp.net.lb>
> ---
> lib/dynamic_queue_limits.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
> index 6ab4587..c87eb76 100644
> --- a/lib/dynamic_queue_limits.c
> +++ b/lib/dynamic_queue_limits.c
> @@ -10,7 +10,7 @@
> #include <linux/jiffies.h>
> #include <linux/dynamic_queue_limits.h>
>
> -#define POSDIFF(A, B) ((A) > (B) ? (A) - (B) : 0)
> +#define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
>
> /* Records completed count and recalculates the queue limit */
> void dql_completed(struct dql *dql, unsigned int count)
Acked-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply
* [News] Help for Heartache - May, 2012
From: How to Heal a Broken Heart @ 2012-05-31 6:46 UTC (permalink / raw)
To: netdev
If you or any of your associates, clients or prospects are suffering from the pain of losing a loved one, they may be relieved by the companionship and counsel they can find on a new website: http://rx4heartache.com. Please feel free to pass on the good news that relief can be found there. It's a place where they can exchange heartache for hopefulness. Thanks to you in advance for sharing this good news with anyone who suffers a broken heart from the loss of someone they once loved.
^ permalink raw reply
* [V2 PATCH] net: sock: validate data_len before allocating skb in sock_alloc_send_pskb()
From: Jason Wang @ 2012-05-31 7:18 UTC (permalink / raw)
To: netdev, edumazet, mst, davem, linux-kernel; +Cc: stable
We need to validate the number of pages consumed by data_len, otherwise frags
array could be overflowed by userspace. So this patch validate data_len and
return -EMSGSIZE when data_len may occupies more frags than MAX_SKB_FRAGS.
Cc: stable@vger.kernel.org [2.6.27+]
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
net/core/sock.c | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/net/core/sock.c b/net/core/sock.c
index 653f8c0..9e5b71f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1592,6 +1592,11 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
gfp_t gfp_mask;
long timeo;
int err;
+ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+
+ err = -EMSGSIZE;
+ if (npages > MAX_SKB_FRAGS)
+ goto failure;
gfp_mask = sk->sk_allocation;
if (gfp_mask & __GFP_WAIT)
@@ -1610,14 +1615,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
skb = alloc_skb(header_len, gfp_mask);
if (skb) {
- int npages;
int i;
/* No pages, we're done... */
if (!data_len)
break;
- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
skb->truesize += data_len;
skb_shinfo(skb)->nr_frags = npages;
for (i = 0; i < npages; i++) {
^ permalink raw reply related
* Re: Difficulties to get 1Gbps on be2net ethernet card
From: Jean-Michel Hautbois @ 2012-05-31 6:54 UTC (permalink / raw)
To: Sathya.Perla; +Cc: eric.dumazet, netdev, yevgenyp
In-Reply-To: <3367B80B08154D42A3B2BC708B5D41F647C678B73F@EXMAIL.ad.emulex.com>
2012/5/30 <Sathya.Perla@emulex.com>:
>>-----Original Message-----
>>From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
>>Behalf Of Jean-Michel Hautbois
>>
>>2012/5/30 Jean-Michel Hautbois <jhautbois@gmail.com>:
>>
>>I used vmstat in order to see the differences between the two kernels.
>>The main difference is the number of interrupts per second.
>>I have an average of 87500 on 3.2 and 7500 on 2.6, 10 times lower !
>>I suspect the be2net driver to be the main cause, and I checkes the
>>/proc/interrupts file in order to be sure.
>>
>>I have for eth1-tx on 2.6.26 about 2200 interrupts per second and 23000 on 3.2.
>>BTW, it is named eth1-q0 on 3.2 (and tx and rx are the same IRQ)
>>whereas there is eth1-rx0 and eth1-tx on 2.6.26.
>
> Yes, there is an issue with be2net interrupt mitigation in the recent code with
> RX and TX on the same Evt-Q (commit 10ef9ab4). The high interrupt rate happens when a TX blast is
> done while RX is relatively silent on a queue pair. Interrupt rate due to TX completions is not being
> mitigated.
>
> I have a fix and will send it out soon..
>
> thanks,
> -Sathya
It seems this issue exist with mellanox mlx4 too...
I have a bond between eth1 (be2net) and eth9 (mlx4_en) and I can
switch from one to the other using ifenslave.
Setting a queue of 4000 on be2net works well in my use case, when I
switch to mlx4 based NIC which has a default queue len of 1000, I have
dropped packets too (less than be2net, but about 30-50 per second).
Setting a queue len of 4000 on mlx4 works too, but the number of
interrupts is similar.
The use case is the same : Lots of TX, no RX.
JM
^ permalink raw reply
* Re: [PATCH] net: sock: validate data_len before allocating skb in sock_alloc_send_pskb()
From: Jason Wang @ 2012-05-31 6:43 UTC (permalink / raw)
To: Eric Dumazet
Cc: Michael S. Tsirkin, David Miller, netdev, linux-kernel, stable
In-Reply-To: <1338445232.2760.1171.camel@edumazet-glaptop>
On 05/31/2012 02:20 PM, Eric Dumazet wrote:
> On Thu, 2012-05-31 at 14:11 +0800, Jason Wang wrote:
>
>> Not affected, only code duplication. It's no harm the check the data_len
>> again for packet sockets, so better to unify the code and fix the issue
>> in one place?
> As a matter of fact, we currently allocate order-0 pages, but it could
> be nice trying to use order-1 or order-2 pages, on arches where
> PAGE_SIZE is so small (4096 bytes)
>
> So lets do this test in sock_alloc_send_pskb() to allow future changes.
>
> af_unix is kind of special, because it tries to lower risk of high order
> linear allocation failures. And for small sizes, it wants linear skbs to
> have no performance regression (prior kernels were allocating linear
> skbs)
>
Thanks for the clarification, would post V2.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply
* RE: [PATCH net 0/6] batch of mlx4 fixes, mostly to SRIOV
From: Yevgeny Petrilin @ 2012-05-31 6:37 UTC (permalink / raw)
To: David Miller; +Cc: netdev@vger.kernel.org, Or Gerlitz, jackm@dev.mellanox.co.il
In-Reply-To: <20120530.163700.720524088985729456.davem@davemloft.net>
>
> Why did you send multiple copies of some of these patches to the list?
>
> Patches #1, #2, and #5 showed up twice.
Hi Dave,
Sorry for this, I had some problem with mail client and thought that those 3 weren't sent at the first time.
Yevgeny
^ permalink raw reply
* Re: [PATCH] net: sock: validate data_len before allocating skb in sock_alloc_send_pskb()
From: Eric Dumazet @ 2012-05-31 6:20 UTC (permalink / raw)
To: Jason Wang; +Cc: Michael S. Tsirkin, David Miller, netdev, linux-kernel, stable
In-Reply-To: <4FC70BA8.5060200@redhat.com>
On Thu, 2012-05-31 at 14:11 +0800, Jason Wang wrote:
> Not affected, only code duplication. It's no harm the check the data_len
> again for packet sockets, so better to unify the code and fix the issue
> in one place?
As a matter of fact, we currently allocate order-0 pages, but it could
be nice trying to use order-1 or order-2 pages, on arches where
PAGE_SIZE is so small (4096 bytes)
So lets do this test in sock_alloc_send_pskb() to allow future changes.
af_unix is kind of special, because it tries to lower risk of high order
linear allocation failures. And for small sizes, it wants linear skbs to
have no performance regression (prior kernels were allocating linear
skbs)
^ permalink raw reply
* Re: [PATCH] net: sock: validate data_len before allocating skb in sock_alloc_send_pskb()
From: Jason Wang @ 2012-05-31 6:11 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: David Miller, eric.dumazet, netdev, linux-kernel, stable
In-Reply-To: <20120531060201.GA13158@redhat.com>
On 05/31/2012 02:02 PM, Michael S. Tsirkin wrote:
> On Thu, May 31, 2012 at 02:00:14PM +0800, Jason Wang wrote:
>> On 05/30/2012 03:02 PM, David Miller wrote:
>>> From: Eric Dumazet<eric.dumazet@gmail.com>
>>> Date: Wed, 30 May 2012 08:46:23 +0200
>>>
>>>> Why doing this test in the while (1) block, it should be done before the
>>>> loop...
>>>>
>>>> Or even in the caller, note net/unix/af_unix.c does this right.
>>>>
>>>> if (len> SKB_MAX_ALLOC)
>>>> data_len = min_t(size_t,
>>>> len - SKB_MAX_ALLOC,
>>>> MAX_SKB_FRAGS * PAGE_SIZE);
>>>>
>>>> skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
>>>> msg->msg_flags& MSG_DONTWAIT,&err);
>>> My impression is that the callers should be fixed to. It makes no sense
>>> to penalize the call sites that get this right.
>>>
>>> And yes, if we do check it in sock_alloc_send_pskb() it should be done
>>> at function entry, not inside the loop.
>> Sure, so is it ok for me to send a V2 that just do the fixing in
>> sock_alloc_sned_pskb() as it's simple and easy to be accepted by
>> stable version?
>>
>> For the fix of callers, I want to post fixes on top as I find
>> there's some code duplication of {tun|macvtap|packet}_alloc_skb()
>> and I want to unify them to a common helper in sock.c. Then I can
>> fix this issue in the new helper.
> Are packet sockets really affected?
> If yes the only call site that gets this right is unix sockets?
Not affected, only code duplication. It's no harm the check the data_len
again for packet sockets, so better to unify the code and fix the issue
in one place?
>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH] net: sock: validate data_len before allocating skb in sock_alloc_send_pskb()
From: Michael S. Tsirkin @ 2012-05-31 6:02 UTC (permalink / raw)
To: Jason Wang; +Cc: David Miller, eric.dumazet, netdev, linux-kernel, stable
In-Reply-To: <4FC708EE.2020908@redhat.com>
On Thu, May 31, 2012 at 02:00:14PM +0800, Jason Wang wrote:
> On 05/30/2012 03:02 PM, David Miller wrote:
> >From: Eric Dumazet<eric.dumazet@gmail.com>
> >Date: Wed, 30 May 2012 08:46:23 +0200
> >
> >>Why doing this test in the while (1) block, it should be done before the
> >>loop...
> >>
> >>Or even in the caller, note net/unix/af_unix.c does this right.
> >>
> >> if (len> SKB_MAX_ALLOC)
> >> data_len = min_t(size_t,
> >> len - SKB_MAX_ALLOC,
> >> MAX_SKB_FRAGS * PAGE_SIZE);
> >>
> >> skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
> >> msg->msg_flags& MSG_DONTWAIT,&err);
> >My impression is that the callers should be fixed to. It makes no sense
> >to penalize the call sites that get this right.
> >
> >And yes, if we do check it in sock_alloc_send_pskb() it should be done
> >at function entry, not inside the loop.
>
> Sure, so is it ok for me to send a V2 that just do the fixing in
> sock_alloc_sned_pskb() as it's simple and easy to be accepted by
> stable version?
>
> For the fix of callers, I want to post fixes on top as I find
> there's some code duplication of {tun|macvtap|packet}_alloc_skb()
> and I want to unify them to a common helper in sock.c. Then I can
> fix this issue in the new helper.
Are packet sockets really affected?
If yes the only call site that gets this right is unix sockets?
> >--
> >To unsubscribe from this list: send the line "unsubscribe netdev" in
> >the body of a message to majordomo@vger.kernel.org
> >More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox