From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
To: netdev@vger.kernel.org
Cc: Willem de Bruijn <willemb@google.com>
Subject: [PATCH RFC net-next 07/11] udp: zerocopy
Date: Tue, 17 Apr 2018 16:00:57 -0400 [thread overview]
Message-ID: <20180417200059.30154-8-willemdebruijn.kernel@gmail.com> (raw)
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
Extend zerocopy to udp sockets. Allow setting sockopt SO_ZEROCOPY and
interpret flag MSG_ZEROCOPY.
This patch was previously part of the zerocopy RFC patchsets. Zerocopy
is not effective at small MTU. With segmentation offload building
larger datagrams, the benefit of page flipping outweighs the cost of
generating a completion notification.
The datagram implementation has an initial refcnt of 0, instead of 1 for
TCP. The tcp_sendmsg_locked function has to hold a reference on
uarg independent of those held by the skbs it generates, because the
skbs can be sent and freed in the function main loop. This is not
needed for other sockets.
Benefit depends on the ratio of cycles spent copying. For an initial
benchmark with udp gso that spends 13% of (systemwide) cycles in
copy_user_fast_string, cycle savings of zerocopy are proportionate:
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
udp gso zerocopy
2394 MB/s 40608 msg/s 40608 calls/s
11,205,017,927 cycles
This is likely not the best demonstrator benchmark.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
include/linux/skbuff.h | 1 +
net/core/skbuff.c | 12 +++++++++++-
net/core/sock.c | 5 ++++-
net/ipv4/ip_output.c | 24 ++++++++++++++++++++++--
net/ipv6/ip6_output.c | 23 ++++++++++++++++++++++-
5 files changed, 60 insertions(+), 5 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6850643508c1..756206bc8eca 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -483,6 +483,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg);
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
+int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3eed21f64e0b..92df6db2c851 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -959,7 +959,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- refcount_set(&uarg->refcnt, 1);
+ refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
sock_hold(sk);
return uarg;
@@ -1099,6 +1099,10 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
atomic_dec(&sk->sk_zckey);
uarg->len--;
+ /* datagram does not hold an extra ref for the syscall itself */
+ if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
+ refcount_set(&uarg->refcnt, 1);
+
sock_zerocopy_put(uarg);
}
}
@@ -1107,6 +1111,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length);
+int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
+{
+ return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
+
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg)
diff --git a/net/core/sock.c b/net/core/sock.c
index b2c3db169ca1..1480d7d92294 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1053,7 +1053,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
- if (sk->sk_protocol != IPPROTO_TCP)
+ if (sk->sk_type == SOCK_RAW)
+ ret = -ENOTSUPP;
+ if (sk->sk_protocol != IPPROTO_TCP &&
+ sk->sk_protocol != IPPROTO_UDP)
ret = -ENOTSUPP;
} else if (sk->sk_family != PF_RDS) {
ret = -ENOTSUPP;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9ccd6c28e420..b5986ff1250e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -864,8 +864,8 @@ static int __ip_append_data(struct sock *sk,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
-
struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
@@ -913,6 +913,20 @@ static int __ip_append_data(struct sock *sk,
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
+ if (flags & MSG_ZEROCOPY && length) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ } else {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg);
+ }
+ }
+
cork->length += length;
/* So, what's going on in the loop below?
@@ -1002,6 +1016,7 @@ static int __ip_append_data(struct sock *sk,
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
+ skb_zcopy_set(skb, uarg);
/*
* Find where to start putting bytes.
@@ -1063,7 +1078,7 @@ static int __ip_append_data(struct sock *sk,
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1093,6 +1108,10 @@ static int __ip_append_data(struct sock *sk,
skb->data_len += copy;
skb->truesize += copy;
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1105,6 +1124,7 @@ static int __ip_append_data(struct sock *sk,
error_efault:
err = -EFAULT;
error:
+ sock_zerocopy_put_abort(uarg);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 9fbcec4fb946..9725d283091c 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1270,6 +1270,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0;
+ struct ubuf_info *uarg = NULL;
bool paged;
skb = skb_peek_tail(queue);
@@ -1338,6 +1339,20 @@ static int __ip6_append_data(struct sock *sk,
tskey = sk->sk_tskey++;
}
+ if (flags & MSG_ZEROCOPY && length) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+
+ if ((rt->dst.dev->features & NETIF_F_SG) &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ } else {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg);
+ }
+ }
+
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
@@ -1460,6 +1475,7 @@ static int __ip6_append_data(struct sock *sk,
tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
+ skb_zcopy_set(skb, uarg);
/*
* Find where to start putting bytes
@@ -1520,7 +1536,7 @@ static int __ip6_append_data(struct sock *sk,
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1550,6 +1566,10 @@ static int __ip6_append_data(struct sock *sk,
skb->data_len += copy;
skb->truesize += copy;
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1562,6 +1582,7 @@ static int __ip6_append_data(struct sock *sk,
error_efault:
err = -EFAULT;
error:
+ sock_zerocopy_put_abort(uarg);
cork->length -= length;
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
--
2.17.0.484.g0c8726318c-goog
next prev parent reply other threads:[~2018-04-17 20:01 UTC|newest]
Thread overview: 52+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-04-17 20:00 [PATCH RFC net-next 00/11] udp gso Willem de Bruijn
2018-04-17 20:00 ` [PATCH RFC net-next 01/11] udp: expose inet cork to udp Willem de Bruijn
2018-04-17 20:00 ` [PATCH RFC net-next 02/11] udp: add gso Willem de Bruijn
2018-04-17 20:00 ` [PATCH RFC net-next 03/11] udp: better wmem accounting on gso Willem de Bruijn
2018-04-17 20:00 ` [PATCH RFC net-next 04/11] udp: paged allocation with gso Willem de Bruijn
2018-04-17 20:00 ` [PATCH RFC net-next 05/11] udp: add gso segment cmsg Willem de Bruijn
2018-04-17 20:00 ` [PATCH RFC net-next 06/11] udp: add gso support to virtual devices Willem de Bruijn
2018-04-18 0:43 ` Dimitris Michailidis
2018-04-18 3:27 ` Willem de Bruijn
2018-04-17 20:00 ` Willem de Bruijn [this message]
2018-04-17 20:00 ` [PATCH RFC net-next 08/11] selftests: udp gso Willem de Bruijn
2018-04-17 20:00 ` [PATCH RFC net-next 09/11] selftests: udp gso with connected sockets Willem de Bruijn
2018-04-17 20:15 ` [PATCH RFC net-next 00/11] udp gso Sowmini Varadhan
2018-04-17 20:23 ` Willem de Bruijn
2018-04-17 20:48 ` Sowmini Varadhan
2018-04-17 21:07 ` Willem de Bruijn
2018-04-18 2:25 ` Samudrala, Sridhar
2018-04-18 3:33 ` Willem de Bruijn
2018-04-18 12:31 ` Sowmini Varadhan
2018-04-18 13:35 ` Eric Dumazet
2018-04-18 13:47 ` Sowmini Varadhan
2018-04-18 13:51 ` Willem de Bruijn
2018-04-18 15:08 ` Samudrala, Sridhar
2018-04-18 17:40 ` David Miller
2018-04-18 17:34 ` David Miller
2018-04-18 13:59 ` Willem de Bruijn
2018-04-18 14:28 ` Willem de Bruijn
2018-04-18 17:28 ` David Miller
2018-04-18 18:12 ` Alexander Duyck
2018-04-18 18:22 ` Willem de Bruijn
2018-04-20 17:38 ` Alexander Duyck
2018-04-20 21:58 ` Willem de Bruijn
2018-04-21 2:08 ` Alexander Duyck
2018-04-18 19:33 ` David Miller
2018-04-20 18:27 ` Tushar Dave
2018-04-20 20:08 ` Alexander Duyck
2018-04-21 3:11 ` Tushar Dave
2018-08-31 9:09 ` Paolo Abeni
2018-08-31 10:09 ` Eric Dumazet
2018-08-31 13:08 ` Willem de Bruijn
2018-08-31 13:44 ` Paolo Abeni
2018-08-31 15:11 ` Willem de Bruijn
2018-09-03 8:02 ` Steffen Klassert
2018-09-03 11:45 ` Sowmini Varadhan
2018-04-18 11:17 ` Paolo Abeni
2018-04-18 13:49 ` Willem de Bruijn
2018-05-24 0:02 ` Marcelo Ricardo Leitner
2018-05-24 1:15 ` Willem de Bruijn
2018-04-18 17:24 ` David Miller
2018-04-18 17:50 ` David Miller
2018-04-18 18:12 ` Willem de Bruijn
2018-04-19 17:45 ` David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180417200059.30154-8-willemdebruijn.kernel@gmail.com \
--to=willemdebruijn.kernel@gmail.com \
--cc=netdev@vger.kernel.org \
--cc=willemb@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).