netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
To: netdev@vger.kernel.org
Cc: davem@davemloft.net, Willem de Bruijn <willemb@google.com>
Subject: [PATCH net-next v2 1/2] udp: msg_zerocopy
Date: Mon, 26 Nov 2018 10:29:38 -0500	[thread overview]
Message-ID: <20181126152939.258443-2-willemdebruijn.kernel@gmail.com> (raw)
In-Reply-To: <20181126152939.258443-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Extend zerocopy to udp sockets. Allow setting sockopt SO_ZEROCOPY and
interpret flag MSG_ZEROCOPY.

This patch was previously part of the zerocopy RFC patchsets. Zerocopy
is not effective at small MTU. With segmentation offload building
larger datagrams, the benefit of page flipping outweights the cost of
generating a completion notification.

The datagram implementation has initial refcnt of 0, instead of 1 for
TCP. The tcp_sendmsg_locked function has to hold a reference on
uarg independent those held by the skbs it generates, because the
skbs can be sent and freed in the function main loop. This is not
needed for other sockets.

tools/testing/selftests/net/msg_zerocopy.sh after applying follow-on
test patch and making skb_orphan_frags_rx same as skb_orphan_frags:

    ipv4 udp -t 1
    tx=191312 (11938 MB) txc=0 zc=n
    rx=191312 (11938 MB)
    ipv4 udp -z -t 1
    tx=304507 (19002 MB) txc=304507 zc=y
    rx=304507 (19002 MB)
    ok
    ipv6 udp -t 1
    tx=174485 (10888 MB) txc=0 zc=n
    rx=174485 (10888 MB)
    ipv6 udp -z -t 1
    tx=294801 (18396 MB) txc=294801 zc=y
    rx=294801 (18396 MB)
    ok

Changes v1 -> v2
  - Fixup reverse christmas tree violation

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/skbuff.h |  1 +
 net/core/skbuff.c      | 15 ++++++++++++++-
 net/core/sock.c        |  5 ++++-
 net/ipv4/ip_output.c   | 22 +++++++++++++++++++++-
 net/ipv6/ip6_output.c  | 22 +++++++++++++++++++++-
 5 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 73902acf2b71..04f52e719571 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -485,6 +485,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg);
 
 void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
 
+int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
 			     struct ubuf_info *uarg);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 02cd7ae3d0fb..2179ef84bb44 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -957,7 +957,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg->len = 1;
 	uarg->bytelen = size;
 	uarg->zerocopy = 1;
-	refcount_set(&uarg->refcnt, 1);
+	refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
 	sock_hold(sk);
 
 	return uarg;
@@ -1097,6 +1097,13 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 		atomic_dec(&sk->sk_zckey);
 		uarg->len--;
 
+		/* Stream socks hold a ref for the syscall, as skbs can be sent
+		 * and freed inside the loop, dropping refcnt to 0 inbetween.
+		 * Datagrams do not need this, but sock_zerocopy_put expects it.
+		 */
+		if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
+			refcount_set(&uarg->refcnt, 1);
+
 		sock_zerocopy_put(uarg);
 	}
 }
@@ -1105,6 +1112,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
 extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 				   struct iov_iter *from, size_t length);
 
+int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
+{
+	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
+
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
 			     struct ubuf_info *uarg)
diff --git a/net/core/sock.c b/net/core/sock.c
index 6d7e189e3cd9..f5bb89785e47 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1018,7 +1018,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 
 	case SO_ZEROCOPY:
 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
-			if (sk->sk_protocol != IPPROTO_TCP)
+			if (!((sk->sk_type == SOCK_STREAM &&
+			       sk->sk_protocol == IPPROTO_TCP) ||
+			      (sk->sk_type == SOCK_DGRAM &&
+			       sk->sk_protocol == IPPROTO_UDP)))
 				ret = -ENOTSUPP;
 		} else if (sk->sk_family != PF_RDS) {
 			ret = -ENOTSUPP;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c09219e7f230..7504da2f33d6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk,
 			    unsigned int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
+	struct ubuf_info *uarg = NULL;
 	struct sk_buff *skb;
 
 	struct ip_options *opt = cork->opt;
@@ -916,6 +917,19 @@ static int __ip_append_data(struct sock *sk,
 	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
 		csummode = CHECKSUM_PARTIAL;
 
+	if (flags & MSG_ZEROCOPY && length) {
+		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+		if (!uarg)
+			return -ENOBUFS;
+		if (rt->dst.dev->features & NETIF_F_SG &&
+		    csummode == CHECKSUM_PARTIAL) {
+			paged = true;
+		} else {
+			uarg->zerocopy = 0;
+			skb_zcopy_set(skb, uarg);
+		}
+	}
+
 	cork->length += length;
 
 	/* So, what's going on in the loop below?
@@ -1005,6 +1019,7 @@ static int __ip_append_data(struct sock *sk,
 			cork->tx_flags = 0;
 			skb_shinfo(skb)->tskey = tskey;
 			tskey = 0;
+			skb_zcopy_set(skb, uarg);
 
 			/*
 			 *	Find where to start putting bytes.
@@ -1067,7 +1082,7 @@ static int __ip_append_data(struct sock *sk,
 				err = -EFAULT;
 				goto error;
 			}
-		} else {
+		} else if (!uarg || !uarg->zerocopy) {
 			int i = skb_shinfo(skb)->nr_frags;
 
 			err = -ENOMEM;
@@ -1097,6 +1112,10 @@ static int __ip_append_data(struct sock *sk,
 			skb->data_len += copy;
 			skb->truesize += copy;
 			wmem_alloc_delta += copy;
+		} else {
+			err = skb_zerocopy_iter_dgram(skb, from, copy);
+			if (err < 0)
+				goto error;
 		}
 		offset += copy;
 		length -= copy;
@@ -1109,6 +1128,7 @@ static int __ip_append_data(struct sock *sk,
 error_efault:
 	err = -EFAULT;
 error:
+	sock_zerocopy_put_abort(uarg);
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 89e0d5118afe..8a49ff985ed8 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk,
 {
 	struct sk_buff *skb, *skb_prev = NULL;
 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
+	struct ubuf_info *uarg = NULL;
 	int exthdrlen = 0;
 	int dst_exthdrlen = 0;
 	int hh_len;
@@ -1322,6 +1323,19 @@ static int __ip6_append_data(struct sock *sk,
 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
 		csummode = CHECKSUM_PARTIAL;
 
+	if (flags & MSG_ZEROCOPY && length) {
+		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+		if (!uarg)
+			return -ENOBUFS;
+		if (rt->dst.dev->features & NETIF_F_SG &&
+		    csummode == CHECKSUM_PARTIAL) {
+			paged = true;
+		} else {
+			uarg->zerocopy = 0;
+			skb_zcopy_set(skb, uarg);
+		}
+	}
+
 	/*
 	 * Let's try using as much space as possible.
 	 * Use MTU if total length of the message fits into the MTU.
@@ -1443,6 +1457,7 @@ static int __ip6_append_data(struct sock *sk,
 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
 			cork->tx_flags = 0;
 			skb_shinfo(skb)->tskey = tskey;
+			skb_zcopy_set(skb, uarg);
 			tskey = 0;
 
 			/*
@@ -1505,7 +1520,7 @@ static int __ip6_append_data(struct sock *sk,
 				err = -EFAULT;
 				goto error;
 			}
-		} else {
+		} else if (!uarg || !uarg->zerocopy) {
 			int i = skb_shinfo(skb)->nr_frags;
 
 			err = -ENOMEM;
@@ -1535,6 +1550,10 @@ static int __ip6_append_data(struct sock *sk,
 			skb->data_len += copy;
 			skb->truesize += copy;
 			wmem_alloc_delta += copy;
+		} else {
+			err = skb_zerocopy_iter_dgram(skb, from, copy);
+			if (err < 0)
+				goto error;
 		}
 		offset += copy;
 		length -= copy;
@@ -1547,6 +1566,7 @@ static int __ip6_append_data(struct sock *sk,
 error_efault:
 	err = -EFAULT;
 error:
+	sock_zerocopy_put_abort(uarg);
 	cork->length -= length;
 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
-- 
2.20.0.rc0.387.gc7a69e6b6c-goog

  reply	other threads:[~2018-11-27  2:24 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-26 15:29 [PATCH net-next v2 0/2] udp msg_zerocopy Willem de Bruijn
2018-11-26 15:29 ` Willem de Bruijn [this message]
2018-11-26 16:32   ` [PATCH net-next v2 1/2] udp: msg_zerocopy Paolo Abeni
2018-11-26 17:59     ` Willem de Bruijn
2018-11-26 18:04       ` Paolo Abeni
2018-11-26 18:19         ` Willem de Bruijn
2018-11-26 19:49           ` Willem de Bruijn
2018-11-28 23:50             ` Willem de Bruijn
2018-11-29  8:27               ` Paolo Abeni
2018-11-29 16:17                 ` Willem de Bruijn
2018-11-29  7:31   ` [udp] a4a142d3d7: WARNING:at_lib/refcount.c:#refcount_inc_checked kernel test robot
2018-11-26 15:29 ` [PATCH net-next v2 2/2] selftests: extend zerocopy tests to udp Willem de Bruijn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181126152939.258443-2-willemdebruijn.kernel@gmail.com \
    --to=willemdebruijn.kernel@gmail.com \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    --cc=willemb@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).