From: Eric Dumazet <eric.dumazet@gmail.com>
To: Christoph Lameter <cl@linux-foundation.org>
Cc: "David S. Miller" <davem@davemloft.net>,
Vegard Nossum <vegard.nossum@gmail.com>,
Linux Netdev List <netdev@vger.kernel.org>,
Ingo Molnar <mingo@elte.hu>
Subject: Re: [PATCH] net: Fix struct sock bitfield annotation
Date: Fri, 09 Oct 2009 22:41:54 +0200 [thread overview]
Message-ID: <4ACFA012.6010409@gmail.com> (raw)
In-Reply-To: <alpine.DEB.1.10.0910091539170.32209@gentwo.org>
Christoph Lameter a écrit :
> On Fri, 9 Oct 2009, Eric Dumazet wrote:
>
>> For networking guys, here is the actual mess with "struct sock" on x86_64,
>> related to UDP handling (critical latencies for some people). We basically touch
>> all cache lines, in every paths, bad effects on SMP...
>
> Please keep me posted on this. I am very interested in this work.
>
> Some simple shuffling around may do some good here.
>
Sure, will do, but first I want to suppress the lock_sock()/release_sock() in
rx path, that was added for sk_forward_alloc thing. This really hurts,
because of the backlog handling.
I have preliminary patch that restore UDP latencies we had in the past ;)
Trick is for UDP, sk_forward_alloc is not updated by tx/rx, only rx.
So we can use the sk_receive_queue.lock to forbid concurrent updates.
As this lock is already hot and only used by rx, we wont have to
dirty the sk_lock, that will only be used by tx path.
Then we can carefuly reorder struct sock to lower number of cache lines
needed for each path.
Patch against linux-2.6 git tree
net/core/sock.c | 9 ++++
net/ipv4/udp.c | 89 ++++++++++++++++++++++------------------------
2 files changed, 51 insertions(+), 47 deletions(-)
diff --git a/net/core/sock.c b/net/core/sock.c
index 7626b6a..45212d4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -276,6 +276,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int err = 0;
int skb_len;
+ unsigned long flags;
/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
number of warnings when compiling with -W --ANK
@@ -290,8 +291,12 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (err)
goto out;
+ skb_orphan(skb);
+
+ spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
if (!sk_rmem_schedule(sk, skb->truesize)) {
err = -ENOBUFS;
+ spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
goto out;
}
@@ -305,7 +310,9 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
*/
skb_len = skb->len;
- skb_queue_tail(&sk->sk_receive_queue, skb);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+
+ spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, skb_len);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6ec6a8a..e8a1be4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -841,6 +841,36 @@ out:
return ret;
}
+
+/**
+ * first_packet_length - return length of first packet in receive queue
+ * @sk: socket
+ *
+ * Drops all bad checksum frames, until a valid one is found.
+ * Returns the length of found skb, or 0 if none is found.
+ */
+static unsigned int first_packet_length(struct sock *sk)
+{
+ struct sk_buff_head *rcvq = &sk->sk_receive_queue;
+ struct sk_buff *skb;
+ unsigned int res;
+
+ spin_lock_bh(&rcvq->lock);
+
+ while ((skb = skb_peek(rcvq)) != NULL &&
+ udp_lib_checksum_complete(skb)) {
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ __skb_unlink(skb, rcvq);
+ skb_kill_datagram(sk, skb, 0);
+ }
+ res = skb ? skb->len : 0;
+
+ spin_unlock_bh(&rcvq->lock);
+
+ return res;
+}
+
/*
* IOCTL requests applicable to the UDP protocol
*/
@@ -857,21 +887,16 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
case SIOCINQ:
{
- struct sk_buff *skb;
- unsigned long amount;
+ unsigned int amount = first_packet_length(sk);
- amount = 0;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb != NULL) {
+ if (amount)
/*
* We will only return the amount
* of this packet since that is all
* that will be read.
*/
- amount = skb->len - sizeof(struct udphdr);
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
+ amount -= sizeof(struct udphdr);
+
return put_user(amount, (int __user *)arg);
}
@@ -968,17 +993,17 @@ try_again:
err = ulen;
out_free:
- lock_sock(sk);
+ spin_lock_bh(&sk->sk_receive_queue.lock);
skb_free_datagram(sk, skb);
- release_sock(sk);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
out:
return err;
csum_copy_err:
- lock_sock(sk);
+ spin_lock_bh(&sk->sk_receive_queue.lock);
if (!skb_kill_datagram(sk, skb, flags))
- UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- release_sock(sk);
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
if (noblock)
return -EAGAIN;
@@ -1060,7 +1085,6 @@ drop:
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
- int rc;
int is_udplite = IS_UDPLITE(sk);
/*
@@ -1140,16 +1164,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
}
- rc = 0;
-
- bh_lock_sock(sk);
- if (!sock_owned_by_user(sk))
- rc = __udp_queue_rcv_skb(sk, skb);
- else
- sk_add_backlog(sk, skb);
- bh_unlock_sock(sk);
-
- return rc;
+ return __udp_queue_rcv_skb(sk, skb);
drop:
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
@@ -1540,29 +1555,11 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
unsigned int mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
- int is_lite = IS_UDPLITE(sk);
/* Check for false positives due to checksum errors */
- if ((mask & POLLRDNORM) &&
- !(file->f_flags & O_NONBLOCK) &&
- !(sk->sk_shutdown & RCV_SHUTDOWN)) {
- struct sk_buff_head *rcvq = &sk->sk_receive_queue;
- struct sk_buff *skb;
-
- spin_lock_bh(&rcvq->lock);
- while ((skb = skb_peek(rcvq)) != NULL &&
- udp_lib_checksum_complete(skb)) {
- UDP_INC_STATS_BH(sock_net(sk),
- UDP_MIB_INERRORS, is_lite);
- __skb_unlink(skb, rcvq);
- kfree_skb(skb);
- }
- spin_unlock_bh(&rcvq->lock);
-
- /* nothing to see, move along */
- if (skb == NULL)
- mask &= ~(POLLIN | POLLRDNORM);
- }
+ if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+ !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
+ mask &= ~(POLLIN | POLLRDNORM);
return mask;
next prev parent reply other threads:[~2009-10-09 20:42 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-10-08 15:16 [PATCH] net: Fix struct sock bitfield annotation Eric Dumazet
2009-10-08 21:31 ` David Miller
2009-10-08 21:54 ` Vegard Nossum
2009-10-08 22:08 ` David Miller
2009-10-09 1:07 ` Eric Dumazet
2009-10-09 1:46 ` Eric Dumazet
2009-10-09 19:39 ` Christoph Lameter
2009-10-09 20:41 ` Eric Dumazet [this message]
2009-10-13 21:59 ` [RFC net-next-2.6] udp: Dont use lock_sock()/release_sock() in rx path Eric Dumazet
2009-10-09 7:54 ` [PATCH] net: Fix struct sock bitfield annotation David Miller
2009-10-09 8:50 ` Eric Dumazet
2009-10-12 6:07 ` David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4ACFA012.6010409@gmail.com \
--to=eric.dumazet@gmail.com \
--cc=cl@linux-foundation.org \
--cc=davem@davemloft.net \
--cc=mingo@elte.hu \
--cc=netdev@vger.kernel.org \
--cc=vegard.nossum@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.