netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Paolo Abeni <pabeni@redhat.com>
To: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Cc: Network Development <netdev@vger.kernel.org>,
	David Miller <davem@davemloft.net>,
	Willem de Bruijn <willemb@google.com>
Subject: Re: [PATCH net-next v2 1/2] udp: msg_zerocopy
Date: Thu, 29 Nov 2018 09:27:36 +0100	[thread overview]
Message-ID: <4cd6006ed79a2690ee89f46c699553064f2500f4.camel@redhat.com> (raw)
In-Reply-To: <CAF=yD-L4bBm_obmsW5m7aTGwFv8uC-hDA7zo9qsAUr0y32Rr0Q@mail.gmail.com>

Hi,

Thank you for the update!

On Wed, 2018-11-28 at 18:50 -0500, Willem de Bruijn wrote:
> I did revert to the basic implementation using an extra ref
> for the function call, similar to TCP, as you suggested.
> 
> On top of that as a separate optimization patch I have a
> variant that uses refcnt zero by replacing refcount_inc with
> refcount_set(.., refcount_read(..) + 1). Not very pretty.

If the skb/uarg is not shared (no other threads can touch the refcnt)
before ip*_append_data() completes, how about something like the
following (incremental diff on top of patch 1/2, untested, uncompiled,
just to give the idea):

---
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 04f52e719571..1e3d195ffdfb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -480,6 +480,13 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 	refcount_inc(&uarg->refcnt);
 }
 
+/* use only before uarg is actually shared */
+static inline void __sock_zerocopy_init(struct ubuf_info *uarg, int cnt)
+{
+	if (uarg)
+		refcount_set(&uarg->refcnt, cnt);
+}
+
 void sock_zerocopy_put(struct ubuf_info *uarg);
 void sock_zerocopy_put_abort(struct ubuf_info *uarg);
 
@@ -1326,13 +1333,20 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
-static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+static inline int __skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
 {
 	if (skb && uarg && !skb_zcopy(skb)) {
-		sock_zerocopy_get(uarg);
 		skb_shinfo(skb)->destructor_arg = uarg;
 		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+		return 1;
 	}
+	return 0;
+}
+
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+	if (__skb_zcopy_set(skb, uarg))
+		sock_zerocopy_get(uarg);
 }
 
 static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2179ef84bb44..435bac91d293 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -957,7 +957,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg->len = 1;
 	uarg->bytelen = size;
 	uarg->zerocopy = 1;
-	refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
+	refcount_set(&uarg->refcnt, 1);
 	sock_hold(sk);
 
 	return uarg;
@@ -1097,13 +1097,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 		atomic_dec(&sk->sk_zckey);
 		uarg->len--;
 
-		/* Stream socks hold a ref for the syscall, as skbs can be sent
-		 * and freed inside the loop, dropping refcnt to 0 inbetween.
-		 * Datagrams do not need this, but sock_zerocopy_put expects it.
-		 */
-		if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
-			refcount_set(&uarg->refcnt, 1);
-
 		sock_zerocopy_put(uarg);
 	}
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7504da2f33d6..d3285613d87a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,6 +882,7 @@ static int __ip_append_data(struct sock *sk,
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
 	u32 tskey = 0;
+	int uarg_refs = 0;
 	bool paged;
 
 	skb = skb_peek_tail(queue);
@@ -919,6 +920,7 @@ static int __ip_append_data(struct sock *sk,
 
 	if (flags & MSG_ZEROCOPY && length) {
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+		uarg_refs = 1;
 		if (!uarg)
 			return -ENOBUFS;
 		if (rt->dst.dev->features & NETIF_F_SG &&
@@ -926,7 +928,7 @@ static int __ip_append_data(struct sock *sk,
 			paged = true;
 		} else {
 			uarg->zerocopy = 0;
-			skb_zcopy_set(skb, uarg);
+			uarg_refs += __skb_zcopy_set(skb, uarg);
 		}
 	}
 
@@ -1019,7 +1021,7 @@ static int __ip_append_data(struct sock *sk,
 			cork->tx_flags = 0;
 			skb_shinfo(skb)->tskey = tskey;
 			tskey = 0;
-			skb_zcopy_set(skb, uarg);
+			uarg_refs += __skb_zcopy_set(skb, uarg);
 
 			/*
 			 *	Find where to start putting bytes.
@@ -1121,6 +1123,7 @@ static int __ip_append_data(struct sock *sk,
 		length -= copy;
 	}
 
+	__sock_zerocopy_init(uarg, uarg_refs);
 	if (wmem_alloc_delta)
 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 	return 0;
@@ -1128,6 +1131,7 @@ static int __ip_append_data(struct sock *sk,
 error_efault:
 	err = -EFAULT;
 error:
+	__sock_zerocopy_init(uarg, uarg_refs);
 	sock_zerocopy_put_abort(uarg);
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
---
The basic idea is using the same schema currently used for wmem
accounting: do the book-keeping inside the loop and set the atomic
reference counter only once at the end of the loop.

WDYT?

Thanks,

Paolo

  reply	other threads:[~2018-11-29 19:32 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-26 15:29 [PATCH net-next v2 0/2] udp msg_zerocopy Willem de Bruijn
2018-11-26 15:29 ` [PATCH net-next v2 1/2] udp: msg_zerocopy Willem de Bruijn
2018-11-26 16:32   ` Paolo Abeni
2018-11-26 17:59     ` Willem de Bruijn
2018-11-26 18:04       ` Paolo Abeni
2018-11-26 18:19         ` Willem de Bruijn
2018-11-26 19:49           ` Willem de Bruijn
2018-11-28 23:50             ` Willem de Bruijn
2018-11-29  8:27               ` Paolo Abeni [this message]
2018-11-29 16:17                 ` Willem de Bruijn
2018-11-29  7:31   ` [udp] a4a142d3d7: WARNING:at_lib/refcount.c:#refcount_inc_checked kernel test robot
2018-11-26 15:29 ` [PATCH net-next v2 2/2] selftests: extend zerocopy tests to udp Willem de Bruijn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4cd6006ed79a2690ee89f46c699553064f2500f4.camel@redhat.com \
    --to=pabeni@redhat.com \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    --cc=willemb@google.com \
    --cc=willemdebruijn.kernel@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).