From: Paolo Abeni <pabeni@redhat.com>
To: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Cc: Network Development <netdev@vger.kernel.org>,
David Miller <davem@davemloft.net>,
Willem de Bruijn <willemb@google.com>
Subject: Re: [PATCH net-next v2 1/2] udp: msg_zerocopy
Date: Thu, 29 Nov 2018 09:27:36 +0100 [thread overview]
Message-ID: <4cd6006ed79a2690ee89f46c699553064f2500f4.camel@redhat.com> (raw)
In-Reply-To: <CAF=yD-L4bBm_obmsW5m7aTGwFv8uC-hDA7zo9qsAUr0y32Rr0Q@mail.gmail.com>
Hi,
Thank you for the update!
On Wed, 2018-11-28 at 18:50 -0500, Willem de Bruijn wrote:
> I did revert to the basic implementation using an extra ref
> for the function call, similar to TCP, as you suggested.
>
> On top of that as a separate optimization patch I have a
> variant that uses refcnt zero by replacing refcount_inc with
> refcount_set(.., refcount_read(..) + 1). Not very pretty.
If the skb/uarg is not shared (no other threads can touch the refcnt)
before ip*_append_data() completes, how about something like the
following (incremental diff on top of patch 1/2, untested, uncompiled,
just to give the idea):
---
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 04f52e719571..1e3d195ffdfb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -480,6 +480,13 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
refcount_inc(&uarg->refcnt);
}
+/* use only before uarg is actually shared */
+static inline void __sock_zerocopy_init(struct ubuf_info *uarg, int cnt)
+{
+ if (uarg)
+ refcount_set(&uarg->refcnt, cnt);
+}
+
void sock_zerocopy_put(struct ubuf_info *uarg);
void sock_zerocopy_put_abort(struct ubuf_info *uarg);
@@ -1326,13 +1333,20 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
return is_zcopy ? skb_uarg(skb) : NULL;
}
-static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+static inline int __skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
{
if (skb && uarg && !skb_zcopy(skb)) {
- sock_zerocopy_get(uarg);
skb_shinfo(skb)->destructor_arg = uarg;
skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+ return 1;
}
+ return 0;
+}
+
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+ if (__skb_zcopy_set(skb, uarg))
+ sock_zerocopy_get(uarg);
}
static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2179ef84bb44..435bac91d293 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -957,7 +957,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
+ refcount_set(&uarg->refcnt, 1);
sock_hold(sk);
return uarg;
@@ -1097,13 +1097,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
atomic_dec(&sk->sk_zckey);
uarg->len--;
- /* Stream socks hold a ref for the syscall, as skbs can be sent
- * and freed inside the loop, dropping refcnt to 0 inbetween.
- * Datagrams do not need this, but sock_zerocopy_put expects it.
- */
- if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
- refcount_set(&uarg->refcnt, 1);
-
sock_zerocopy_put(uarg);
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7504da2f33d6..d3285613d87a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,6 +882,7 @@ static int __ip_append_data(struct sock *sk,
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
+ int uarg_refs = 0;
bool paged;
skb = skb_peek_tail(queue);
@@ -919,6 +920,7 @@ static int __ip_append_data(struct sock *sk,
if (flags & MSG_ZEROCOPY && length) {
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ uarg_refs = 1;
if (!uarg)
return -ENOBUFS;
if (rt->dst.dev->features & NETIF_F_SG &&
@@ -926,7 +928,7 @@ static int __ip_append_data(struct sock *sk,
paged = true;
} else {
uarg->zerocopy = 0;
- skb_zcopy_set(skb, uarg);
+ uarg_refs += __skb_zcopy_set(skb, uarg);
}
}
@@ -1019,7 +1021,7 @@ static int __ip_append_data(struct sock *sk,
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
- skb_zcopy_set(skb, uarg);
+ uarg_refs += __skb_zcopy_set(skb, uarg);
/*
* Find where to start putting bytes.
@@ -1121,6 +1123,7 @@ static int __ip_append_data(struct sock *sk,
length -= copy;
}
+ __sock_zerocopy_init(uarg, uarg_refs);
if (wmem_alloc_delta)
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
return 0;
@@ -1128,6 +1131,7 @@ static int __ip_append_data(struct sock *sk,
error_efault:
err = -EFAULT;
error:
+ __sock_zerocopy_init(uarg, uarg_refs);
sock_zerocopy_put_abort(uarg);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
---
The basic idea is to use the same scheme currently used for wmem
accounting: do the bookkeeping inside the loop and set the atomic
reference counter only once, at the end of the loop.
WDYT?
Thanks,
Paolo
next prev parent reply other threads:[~2018-11-29 19:32 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-11-26 15:29 [PATCH net-next v2 0/2] udp msg_zerocopy Willem de Bruijn
2018-11-26 15:29 ` [PATCH net-next v2 1/2] udp: msg_zerocopy Willem de Bruijn
2018-11-26 16:32 ` Paolo Abeni
2018-11-26 17:59 ` Willem de Bruijn
2018-11-26 18:04 ` Paolo Abeni
2018-11-26 18:19 ` Willem de Bruijn
2018-11-26 19:49 ` Willem de Bruijn
2018-11-28 23:50 ` Willem de Bruijn
2018-11-29 8:27 ` Paolo Abeni [this message]
2018-11-29 16:17 ` Willem de Bruijn
2018-11-29 7:31 ` [udp] a4a142d3d7: WARNING:at_lib/refcount.c:#refcount_inc_checked kernel test robot
2018-11-26 15:29 ` [PATCH net-next v2 2/2] selftests: extend zerocopy tests to udp Willem de Bruijn
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4cd6006ed79a2690ee89f46c699553064f2500f4.camel@redhat.com \
--to=pabeni@redhat.com \
--cc=davem@davemloft.net \
--cc=netdev@vger.kernel.org \
--cc=willemb@google.com \
--cc=willemdebruijn.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as the URLs for its NNTP newsgroup(s).